[llvm] a387bce - [MLGO] Upstream the corpus extraction tooling (#72319)

Fri Jan 19 17:23:55 PST 2024

Author: Aiden Grossman
Date: 2024-01-19T17:23:51-08:00
New Revision: a387bce4bcbaeb28bf4510817ce54602e2f7a21d

URL: https://github.com/llvm/llvm-project/commit/a387bce4bcbaeb28bf4510817ce54602e2f7a21d
DIFF: https://github.com/llvm/llvm-project/commit/a387bce4bcbaeb28bf4510817ce54602e2f7a21d.diff

LOG: [MLGO] Upstream the corpus extraction tooling (#72319)

This patch upstreams some of the MLGO utilities, particularly the corpus
extraction tooling, into LLVM proper. The motivation for this patch is
available in the RFC.


https://discourse.llvm.org/t/rfc-upstreaming-elements-of-the-mlgo-tooling/74939

Added: 
    llvm/utils/mlgo-utils/CMakeLists.txt
    llvm/utils/mlgo-utils/README.md
    llvm/utils/mlgo-utils/mlgo/__init__.py
    llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
    llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
    llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
    llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
    llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
    llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py
    llvm/utils/mlgo-utils/pyproject.toml
    llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py
    llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
    llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py
    llvm/utils/mlgo-utils/tests/lit.cfg
    llvm/utils/mlgo-utils/tests/lit.local.cfg
    llvm/utils/mlgo-utils/tests/lit.site.cfg.in

Modified: 
    llvm/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 61ab69d237470f2..1d230004e6c34ec 100644

--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1197,6 +1197,7 @@ if( LLVM_INCLUDE_UTILS )
   add_subdirectory(utils/UnicodeData)
   add_subdirectory(utils/yaml-bench)
   add_subdirectory(utils/split-file)
+  add_subdirectory(utils/mlgo-utils)
   if( LLVM_INCLUDE_TESTS )
     add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
   endif()

diff  --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt
new file mode 100644
index 000000000000000..7b303c7639401ae
--- /dev/null
+++ b/llvm/utils/mlgo-utils/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Configure the lit site config for the mlgo-utils tests from the template in
+# tests/ into the current binary directory.
+configure_lit_site_cfg(
+  "${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
+)
+
+# Register the check-mlgo-utils lit test suite; it depends on the FileCheck,
+# not and count targets.
+add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
+  ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS "FileCheck" "not" "count"
+)
+
+# Group the test target under the "Tests" folder.
+set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests")

diff  --git a/llvm/utils/mlgo-utils/README.md b/llvm/utils/mlgo-utils/README.md
new file mode 100644
index 000000000000000..12e9375f23edac2
--- /dev/null
+++ b/llvm/utils/mlgo-utils/README.md
@@ -0,0 +1,12 @@
+# MLGO Python Utilities
+
+This folder contains MLGO Python utilities, particularly infrastructure
+to help enable ML applications within LLVM, especially tooling to extract
+corpora that can be used in downstream projects to train ML models and perform
+other tasks that benefit from having a large amount of data.
+
+### Python Versioning
+
+Due to type annotations, the MLGO tooling currently only supports a Python
+version greater than 3.8, deviating from the current LLVM project-wide
+minimum supported version of Python 3.6.

diff  --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py
new file mode 100644
index 000000000000000..bcb5de2ff4d5752
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/__init__.py
@@ -0,0 +1,6 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+__versioninfo__ = (18, 0, 0)
+__version__ = ".".join(str(v) for v in __versioninfo__) + "dev"

diff  --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
new file mode 100644
index 000000000000000..9aabd87b4688e00
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1,48 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpus into a single training corpus.
+
+Currently this only supports the case where all of the corpora share the same
+configurables except for the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+from absl import app
+from absl import flags
+
+from mlgo.corpus import combine_training_corpus_lib
+
+# --root_dir is handed to combine_training_corpus_lib.combine_corpus() by main().
+flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+    """Combines the corpora found under --root_dir.
+
+    Args:
+      argv: The command line left over after flag parsing; anything beyond
+        the program name is rejected with a usage error.
+    """
+    unexpected_args = argv[1:]
+    if unexpected_args:
+        raise app.UsageError("Too many command-line arguments.")
+
+    combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+
+
+def entrypoint():
+    """Runs main() through absl's app.run()."""
+    app.run(main)
+
+
+if __name__ == "__main__":
+    entrypoint()

diff  --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
new file mode 100644
index 000000000000000..e2ae8699ec31801
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
@@ -0,0 +1,38 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Library for combining training corpora."""
+
+import os
+import json
+import glob
+
+from absl import logging
+
+_FILE_NAME = "corpus_description.json"
+
+
+def combine_corpus(root_dir: str) -> None:
+    """Merges the corpus descriptions found one level below root_dir.
+
+    Every <root_dir>/<sub_dir>/corpus_description.json is loaded, its module
+    names are prefixed with <sub_dir>, and a combined corpus_description.json
+    is written directly into root_dir.
+
+    Args:
+      root_dir: The directory containing one subdirectory per input corpus.
+
+    Raises:
+      ValueError: If the input corpora differ in any field other than
+        "modules".
+    """
+    module_names = []
+    # None (rather than an empty dict) marks "no corpus seen yet", so that a
+    # corpus whose only field is "modules" is still compared with the others.
+    output_corpus_description = None
+
+    corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
+    for corpus_description_path in glob.glob(corpus_description_glob):
+        logging.info("processing %s", corpus_description_path)
+
+        with open(corpus_description_path, encoding="utf-8") as f:
+            corpus_description = json.load(f)
+        sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+        module_names.extend(
+            os.path.join(sub_dir, name) for name in corpus_description["modules"]
+        )
+        del corpus_description["modules"]
+        if output_corpus_description is None:
+            output_corpus_description = corpus_description
+        elif corpus_description != output_corpus_description:
+            raise ValueError("Input corpora differ by more than modules.")
+
+    if output_corpus_description is None:
+        # No input corpus was found; still emit a description with no modules.
+        output_corpus_description = {}
+    output_corpus_description["modules"] = module_names
+
+    # Write with the same encoding that the input descriptions are read with.
+    with open(os.path.join(root_dir, _FILE_NAME), "w", encoding="utf-8") as f:
+        json.dump(output_corpus_description, f, indent=2)

diff  --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
new file mode 100644
index 000000000000000..9463e61dc534fed
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1,165 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, pass an integer representing the desired
+verbosity to the --verbosity flag. Use 0 for all logs, status information,
+and detailed debug information, -1 for solely warnings, and -2 to not produce
+any output.
+"""
+
+import json
+import multiprocessing
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from mlgo.corpus import extract_ir_lib
+
+# Command line flags. Help strings are built from adjacent string literals, so
+# every fragment that continues a sentence ends with an explicit space.
+flags.DEFINE_string(
+    "input",
+    None,
+    "Input file or directory - either compile_commands.json, a linker parameter "
+    "list, or a path to a directory containing object files.",
+)
+flags.DEFINE_enum(
+    "input_type",
+    "json",
+    ["json", "params", "directory"],
+    "Input file type - json, params, or directory. params refers to lld "
+    "params.",
+)
+flags.DEFINE_string("output_dir", None, "Output directory")
+flags.DEFINE_integer(
+    "num_workers",
+    None,
+    "Number of parallel workers for objcopy. `None` for maximum available.",
+)
+flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
+flags.DEFINE_string(
+    "obj_base_dir",
+    "",
+    "Base directory for object files. Defaults to current working dir.",
+)
+flags.DEFINE_string(
+    "cmd_filter",
+    None,
+    "Include only those modules with a command line matching this regexp. "
+    "Setting it to None for not filtering. Note that the regexp is applied "
+    "independently for each separate command line option. For example, ^-Oz$ "
+    "will match Oz - built binaries. Does not work with thinlto_build=local.",
+)
+flags.DEFINE_enum(
+    "thinlto_build",
+    None,
+    ["distributed", "local"],
+    "Set if the build was performed with either 'distributed' or "
+    "'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
+    "The build is assumed to have had "
+    "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
+    "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
+    "passed in the local case.",
+)
+flags.DEFINE_string(
+    "cmd_section_name",
+    ".llvmcmd",
+    "The section name passed to llvm-objcopy. For ELF object files, the "
+    "default .llvmcmd is correct. For Mach-O object files, one should use "
+    "something like __LLVM,__cmdline",
+)
+flags.DEFINE_string(
+    "bitcode_section_name",
+    ".llvmbc",
+    "The section name passed to llvm-objcopy. For ELF object files, the "
+    "default .llvmbc is correct. For Mach-O object files, one should use "
+    "__LLVM,__bitcode",
+)
+
+flags.mark_flag_as_required("output_dir")
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+    """Extracts bitcode and command lines for the objects selected by the flags.
+
+    The set of objects comes from one of four places: the lld ThinLTO
+    temporaries under --obj_base_dir (when --input is not given and
+    --thinlto_build=local), a compile_commands.json, an lld params file, or a
+    directory of object files. The extraction is then run and the corpus
+    description is written into --output_dir.
+
+    Args:
+      argv: The command line left over after flag parsing; anything beyond
+        the program name is rejected with a usage error.
+
+    Raises:
+      ValueError: If --input is combined with --thinlto_build=local, or if
+        neither of the two is given.
+    """
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+
+    objs = []
+    if FLAGS.input is not None and FLAGS.thinlto_build == "local":
+        raise ValueError("--thinlto_build=local cannot be run with --input")
+    if FLAGS.input is None:
+        if FLAGS.thinlto_build != "local":
+            raise ValueError("--input or --thinlto_build=local must be provided")
+        objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
+    elif FLAGS.input_type == "json":
+        with open(FLAGS.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_compile_commands(
+                json.load(f), FLAGS.output_dir
+            )
+    elif FLAGS.input_type == "params":
+        if not FLAGS.obj_base_dir:
+            # Adjacent literals are concatenated verbatim, so each fragment
+            # carries its own trailing space.
+            logging.info(
+                "-obj_base_dir is unspecified, assuming current directory. "
+                "If no objects are found, use this option to specify the root "
+                "directory for the object file paths in the input file."
+            )
+        with open(FLAGS.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_lld_params(
+                [line.strip() for line in f], FLAGS.obj_base_dir, FLAGS.output_dir
+            )
+    elif FLAGS.input_type == "directory":
+        logging.warning(
+            "Using the directory input is only recommended if the build system "
+            "your project uses does not support any structured output that "
+            "ml-compiler-opt understands. If your build system provides a "
+            "structured compilation database, use that instead"
+        )
+        objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+    else:
+        # The input_type enum only admits the three values handled above, so
+        # this branch is a defensive fallback; objs stays empty.
+        logging.error("Unknown input type: %s", FLAGS.input_type)
+
+    relative_output_paths = extract_ir_lib.run_extraction(
+        objs,
+        FLAGS.num_workers,
+        FLAGS.llvm_objcopy_path,
+        FLAGS.cmd_filter,
+        FLAGS.thinlto_build,
+        FLAGS.cmd_section_name,
+        FLAGS.bitcode_section_name,
+    )
+
+    extract_ir_lib.write_corpus_manifest(
+        FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
+    )
+
+    # Entries that are None correspond to objects that were not converted.
+    logging.info(
+        "Converted %d files out of %d",
+        len(objs) - relative_output_paths.count(None),
+        len(objs),
+    )
+
+
+def entrypoint():
+    """Selects the "fork" multiprocessing start method, then runs main()."""
+    multiprocessing.set_start_method("fork")
+    app.run(main)
+
+
+if __name__ == "__main__":
+    entrypoint()

diff  --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
new file mode 100644
index 000000000000000..9c828ce1eb631fe
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
@@ -0,0 +1,395 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Library functions for IR extraction."""
+
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import multiprocessing
+import functools
+import json
+
+from typing import Dict, List, Optional
+
+from absl import logging
+
+_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]
+
+
+# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
+# a \0-separated list of strings to a \n-separated one.
+def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
+    """Decide whether a module's command line passes the filter.
+
+    Args:
+      cmdline: The NUL-separated command line options of the module.
+      match_regexp: The regular expression tried against each option on its
+        own, or None to accept every module.
+
+    Returns:
+      True if no filter is given or if at least one option matches it.
+    """
+    if match_regexp is None:
+        return True
+    for option in cmdline.split("\0"):
+        if re.search(match_regexp, option) is not None:
+            return True
+    return False
+
+
+def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
+    """Find the ThinLTO index file named on a command line.
+
+    Args:
+      cmdline: The NUL-separated command line options.
+      basedir: The directory that the index path is joined onto.
+
+    Returns:
+      basedir joined with the value of the first -fthinlto-index=<path>
+      option, or None if the command line has no such option.
+    """
+    for option in cmdline.split("\0"):
+        if not option.startswith("-fthinlto-index"):
+            continue
+        # Split on the first "=" only so that paths containing "=" stay
+        # intact, and skip an option that carries no value at all.
+        _, separator, index_path = option.partition("=")
+        if separator:
+            return os.path.join(basedir, index_path)
+    return None
+
+
+class TrainingIRExtractor:
+    """IR and command line extraction from an object file."""
+
+    def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
+        """Set up a TrainingIRExtractor.
+
+        Args:
+          obj_relative_path: relative path to the input object file. It will be also
+            used to construct the absolute path of the output IR and cmd files, by
+            appending it to output_base_dir.
+          output_base_dir: the directory under which the output will be produced.
+          obj_base_dir: the base directory for all the input object files.
+        """
+        self._obj_relative_path = obj_relative_path
+        self._output_base_dir = output_base_dir
+        self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""
+
+    def obj_base_dir(self):
+        """Returns the base directory of the input object files."""
+        return self._obj_base_dir
+
+    def output_base_dir(self):
+        """Returns the directory under which the output is produced."""
+        return self._output_base_dir
+
+    def relative_output_path(self):
+        """Returns the output path relative to the output base directory."""
+        return self._obj_relative_path
+
+    def input_obj(self):
+        """Returns the path of the input object file."""
+        return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+
+    def lld_src_bc(self):
+        """Returns the path of the bitcode file saved by lld for this object."""
+        # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+        # IR bitcode saved by lld. It is hardcoded into lld.
+        return os.path.join(
+            self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
+        )
+
+    def lld_src_thinlto(self):
+        """Returns the path of the ThinLTO index saved by lld for this object."""
+        return os.path.join(self._obj_base_dir, self._obj_relative_path + ".thinlto.bc")
+
+    def dest_dir(self):
+        """Returns the output directory that receives this object's files."""
+        return os.path.join(
+            self.output_base_dir(), os.path.dirname(self._obj_relative_path)
+        )
+
+    def module_name(self):
+        """Returns the file name component of the object's relative path."""
+        return os.path.basename(self._obj_relative_path)
+
+    def cmd_file(self):
+        """Returns the path of the extracted command line (.cmd) file."""
+        return os.path.join(self.dest_dir(), self.module_name() + ".cmd")
+
+    def bc_file(self):
+        """Returns the path of the extracted bitcode (.bc) file."""
+        return os.path.join(self.dest_dir(), self.module_name() + ".bc")
+
+    def thinlto_index_file(self):
+        """Returns the path of the copied ThinLTO index (.thinlto.bc) file."""
+        return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")
+
+    def _get_extraction_cmd_command(
+        self, llvm_objcopy_path: str, cmd_section_name: str
+    ):
+        """Gets the llvm-objcopy argument list that, when invoked, will extract
+        the command line section into the self.cmd_file() file.
+        """
+        return [
+            llvm_objcopy_path,
+            "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
+            self.input_obj(),
+            "/dev/null",
+        ]
+
+    def _get_extraction_bc_command(
+        self, llvm_objcopy_path: str, bitcode_section_name: str
+    ):
+        """Gets the llvm-objcopy argument list that, when invoked, will extract
+        the bitcode section into the self.bc_file() file.
+        """
+        return [
+            llvm_objcopy_path,
+            "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
+            self.input_obj(),
+            "/dev/null",
+        ]
+
+    def _extract_clang_artifacts(
+        self,
+        llvm_objcopy_path: str,
+        cmd_filter: Optional[str],
+        is_thinlto: bool,
+        cmd_section_name: str,
+        bitcode_section_name: str,
+    ) -> Optional[str]:
+        """Run llvm-objcopy to extract the .bc and command line.
+
+        Returns:
+          The relative output path on success, or None if the object does not
+          exist, is excluded by cmd_filter, has no ThinLTO index option when
+          one is required, or llvm-objcopy fails.
+        """
+        if not os.path.exists(self.input_obj()):
+            logging.info("%s does not exist.", self.input_obj())
+            return None
+        os.makedirs(self.dest_dir(), exist_ok=True)
+        try:
+            subprocess.check_output(
+                self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+                stderr=subprocess.STDOUT,
+                encoding="utf-8",
+            )
+            if cmd_filter is not None or is_thinlto:
+                with open(self.cmd_file(), encoding="utf-8") as f:
+                    lines = f.readlines()
+                assert len(lines) == 1
+                cmdline = lines[0]
+                if not should_include_module(cmdline, cmd_filter):
+                    logging.info(
+                        "Excluding module %s because it does not match the filter",
+                        self.input_obj(),
+                    )
+                    os.remove(self.cmd_file())
+                    return None
+                if is_thinlto:
+                    index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+                    if index_file is None:
+                        # Without an index path there is nothing to copy;
+                        # skip the module instead of handing None to
+                        # shutil.copy, which would raise a TypeError.
+                        logging.warning(
+                            "%s was not processed: no -fthinlto-index option "
+                            "in its command line",
+                            self.input_obj(),
+                        )
+                        os.remove(self.cmd_file())
+                        return None
+                    shutil.copy(index_file, self.thinlto_index_file())
+
+            subprocess.check_output(
+                self._get_extraction_bc_command(
+                    llvm_objcopy_path, bitcode_section_name
+                ),
+                stderr=subprocess.STDOUT,
+                encoding="utf-8",
+            )
+        except subprocess.CalledProcessError as e:
+            # This may happen if the .o file was built from asm (.S source).
+            logging.warning("%s was not processed: %s", self.input_obj(), e)
+            logging.info(e.output)
+            return None
+        assert (
+            os.path.exists(self.cmd_file())
+            and os.path.exists(self.bc_file())
+            and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
+        )
+        return self.relative_output_path()
+
+    def _extract_lld_artifacts(self) -> Optional[str]:
+        """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation."""
+        if not os.path.exists(self.lld_src_bc()):
+            logging.info("%s does not exist.", self.lld_src_bc())
+            return None
+        if not os.path.exists(self.lld_src_thinlto()):
+            logging.info("%s does not exist.", self.lld_src_thinlto())
+            return None
+        os.makedirs(self.dest_dir(), exist_ok=True)
+
+        # Copy over the files
+        shutil.copy(self.lld_src_bc(), self.bc_file())
+        shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
+
+        assert os.path.exists(self.bc_file())
+        assert os.path.exists(self.thinlto_index_file())
+        return self._obj_relative_path
+
+    def extract(
+        self,
+        llvm_objcopy_path: Optional[str] = None,
+        cmd_filter: Optional[str] = None,
+        thinlto_build: Optional[str] = None,
+        cmd_section_name: Optional[str] = ".llvmcmd",
+        bitcode_section_name: Optional[str] = ".llvmbc",
+    ) -> Optional[str]:
+        """Extract this object's artifacts into the output directory.
+
+        A "local" ThinLTO build copies the files saved by lld and ignores the
+        other arguments; every other case runs llvm-objcopy on the object,
+        additionally copying the ThinLTO index for a "distributed" build.
+
+        Returns:
+          The relative output path on success, or None if nothing was
+          extracted for this object.
+        """
+        if thinlto_build == "local":
+            return self._extract_lld_artifacts()
+        return self._extract_clang_artifacts(
+            llvm_objcopy_path=llvm_objcopy_path,
+            cmd_filter=cmd_filter,
+            is_thinlto=thinlto_build == "distributed",
+            cmd_section_name=cmd_section_name,
+            bitcode_section_name=bitcode_section_name,
+        )
+
+
+def convert_compile_command_to_objectfile(
+    command: Dict[str, str], output_dir: str
+) -> Optional[TrainingIRExtractor]:
+    """Builds an extractor for one compile_commands.json entry.
+
+    Args:
+      command: A compilation database entry. "directory" is required; the
+        command line is taken from "arguments" if present, else "command".
+      output_dir: The directory under which the output will be produced.
+
+    Returns:
+      A TrainingIRExtractor for the file named after -o, or None if the entry
+      has no usable command line or no -o output.
+    """
+    # Imported here so the block does not depend on the file's import list.
+    import shlex
+
+    obj_base_dir = command["directory"]
+    if "arguments" in command:
+        cmd_parts = command["arguments"]
+    elif "command" in command:
+        try:
+            # "command" is a single shell-escaped string, so quoting has to be
+            # honored; a plain whitespace split breaks quoted paths apart.
+            cmd_parts = shlex.split(command["command"])
+        except ValueError:
+            logging.info("Could not parse command: %s", command["command"])
+            return None
+    else:
+        logging.info("compile_commands element has no command and arguments")
+        return None
+
+    try:
+        obj_index = cmd_parts.index("-o") + 1
+    except ValueError:
+        # This could happen if there are non-clang commands in compile_commands.json
+        logging.info("Command has no -o option: %s", " ".join(cmd_parts))
+        return None
+    if obj_index >= len(cmd_parts):
+        # -o is the last argument, so there is no output path to read.
+        logging.info("Command has -o without an output: %s", " ".join(cmd_parts))
+        return None
+    obj_rel_path = cmd_parts[obj_index]
+    # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
+    return TrainingIRExtractor(
+        obj_relative_path=obj_rel_path,
+        output_base_dir=output_dir,
+        obj_base_dir=obj_base_dir,
+    )
+
+
+def load_from_compile_commands(
+    json_array: List[Dict[str, str]], output_dir: str
+) -> List[TrainingIRExtractor]:
+    """Creates an extractor for each usable entry of a compile_commands.json.
+
+    Args:
+      json_array: The parsed compilation database entries.
+      output_dir: The directory under which the output will be produced.
+    """
+    extractors = []
+    for cmd in json_array:
+        extractor = convert_compile_command_to_objectfile(cmd, output_dir)
+        # Entries that yield None are dropped, in case there were non-clang
+        # commands in the .json
+        if extractor is not None:
+            extractors.append(extractor)
+    return extractors
+
+
+def load_from_lld_params(
+    params_array: List[str], obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+    """Create an ObjectFile array based on lld's parameters.
+
+    Args:
+      params_array: The lld parameters, one per element. It is not modified.
+      obj_base_dir: The base directory for the object file paths.
+      output_dir: The directory under which the output will be produced.
+
+    Returns:
+      One TrainingIRExtractor per object file. If there is no -o option,
+      every parameter is treated as an object file.
+    """
+
+    def make_obj(obj_file: str) -> TrainingIRExtractor:
+        return TrainingIRExtractor(
+            obj_relative_path=obj_file,
+            output_base_dir=output_dir,
+            obj_base_dir=obj_base_dir,
+        )
+
+    # yank out -o and the output. After that, anything not starting with '-', and
+    # ending in a '.o', is an object file.
+    try:
+        minus_o_idx = params_array.index("-o")
+    except ValueError:
+        logging.info("This params file does not have an explicit -o option.")
+        just_obj_paths = params_array
+    else:
+        # Build a new list instead of deleting in place, so the caller's list
+        # is left untouched.
+        remaining = params_array[:minus_o_idx] + params_array[minus_o_idx + 2 :]
+        just_obj_paths = [
+            o for o in remaining if not o.startswith("-") and o.endswith(".o")
+        ]
+
+    return [make_obj(obj_file) for obj_file in just_obj_paths]
+
+
+def load_from_directory(
+    obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+    """Create an object file array by globbing an entire directory.
+
+    Args:
+      obj_base_dir: The base build directory that all object files will be
+        written out as being relative to.
+      output_dir: The output directory where extracted .bc and .cmd files should
+        be placed.
+    """
+    extractors = []
+    for obj_path in pathlib.Path(obj_base_dir).glob("**/*.o"):
+        relative_path = os.path.relpath(str(obj_path), start=obj_base_dir)
+        extractors.append(
+            TrainingIRExtractor(
+                obj_relative_path=relative_path,
+                output_base_dir=output_dir,
+                obj_base_dir=obj_base_dir,
+            )
+        )
+    return extractors
+
+
+def load_for_lld_thinlto(
+    obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+    """Create an extractor for each postimport bitcode file under obj_base_dir.
+
+    Args:
+      obj_base_dir: The directory searched recursively for .3.import.bc files.
+      output_dir: The directory under which the output will be produced.
+    """
+    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+    # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
+    # are also emitted next to the postimport bitcode, with the suffix
+    # .thinlto.bc instead
+    postimport_suffix = ".3.import.bc"
+
+    extractors = []
+    for bc_path in pathlib.Path(obj_base_dir).glob("**/*" + postimport_suffix):
+        relative_path = os.path.relpath(str(bc_path), start=obj_base_dir)
+        extractors.append(
+            TrainingIRExtractor(
+                # Cut away .3.import.bc
+                obj_relative_path=relative_path[: -len(postimport_suffix)],
+                output_base_dir=output_dir,
+                obj_base_dir=obj_base_dir,
+            )
+        )
+    return extractors
+
+
+def run_extraction(
+    objs: List[TrainingIRExtractor],
+    num_workers: int,
+    llvm_objcopy_path: str,
+    cmd_filter: str,
+    thinlto_build: str,
+    cmd_section_name: str,
+    bitcode_section_name: str,
+):
+    """Extracts all specified object files into the corpus directory.
+
+    Args:
+      objs: The TrainingIRExtractor objects representing the object files to
+        extract bitcode/commands from.
+      num_workers: The size of the process pool used to run the extraction.
+      llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
+      cmd_filter: A regular expression used to select compilations performed
+        with specific flags, or None to include all compilations.
+      thinlto_build: The type of ThinLTO build, or None if the build was not
+        done with ThinLTO.
+      cmd_section_name: The name of the command line section created by the
+        bitcode embedding.
+      bitcode_section_name: The name of the bitcode section created by the
+        bitcode embedding.
+
+    Returns:
+      One entry per element of objs, as returned by TrainingIRExtractor.extract.
+    """
+    extraction_options = {
+        "llvm_objcopy_path": llvm_objcopy_path,
+        "cmd_filter": cmd_filter,
+        "thinlto_build": thinlto_build,
+        "cmd_section_name": cmd_section_name,
+        "bitcode_section_name": bitcode_section_name,
+    }
+    # Bind the shared options so the pool only has to supply the extractor.
+    extract_one = functools.partial(TrainingIRExtractor.extract, **extraction_options)
+
+    with multiprocessing.Pool(num_workers) as workers:
+        relative_output_paths = workers.map(extract_one, objs)
+        workers.close()
+        workers.join()
+    return relative_output_paths
+
+
+def write_corpus_manifest(
+    thinlto_build: str, relative_output_paths: List[str], output_dir: str
+):
+    """Writes a corpus_description.json containing all necessary information
+    about the corpus.
+
+    Args:
+      thinlto_build: The kind of ThinLTO the build was done with, or None if
+        the build was not performed with ThinLTO.
+      relative_output_paths: The relative (to the corpus directory) output paths
+        of all the bitcode files that should be placed in the corpus manifest.
+        None entries are left out.
+      output_dir: The corpus directory where the corpus manifest should be
+        placed.
+    """
+    corpus_description = {}
+    # This key is inserted first so global_command_override is at the top of
+    # the .json after being written
+    if thinlto_build == "local":
+        corpus_description["global_command_override"] = _UNSPECIFIED_OVERRIDE
+    corpus_description["has_thinlto"] = thinlto_build is not None
+    corpus_description["modules"] = [
+        path for path in relative_output_paths if path is not None
+    ]
+
+    manifest_path = os.path.join(output_dir, "corpus_description.json")
+    with open(manifest_path, "w", encoding="utf-8") as f:
+        json.dump(corpus_description, f, indent=2)

diff  --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
new file mode 100644
index 000000000000000..edb0ecd853de246
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -0,0 +1,54 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+  --input_dir=<path to input directory> \
+  --output_dir=<path to output directory> \
+  --default_args="<list of space separated flags>"
+"""
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from mlgo.corpus import make_corpus_lib
+
+# Command line flags; main() reads all three.
+flags.DEFINE_string("input_dir", None, "The input directory.")
+flags.DEFINE_string("output_dir", None, "The output directory.")
+flags.DEFINE_string(
+    "default_args",
+    "",
+    "The compiler flags to compile with when using downstream tooling.",
+)
+
+# Both directories must be given explicitly.
+flags.mark_flag_as_required("input_dir")
+flags.mark_flag_as_required("output_dir")
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+    """Copies the bitcode under --input_dir to --output_dir and writes the
+    corpus manifest, using the whitespace-split --default_args.
+    """
+    logging.warning(
+        "Using this tool does not guarantee that the bitcode is taken at "
+        "the correct stage for consumption during model training. Make "
+        "sure to validate assumptions about where the bitcode is coming "
+        "from before using it in production."
+    )
+    input_dir = FLAGS.input_dir
+    output_dir = FLAGS.output_dir
+    default_args = FLAGS.default_args.split()
+
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(input_dir)
+    make_corpus_lib.copy_bitcode(relative_paths, input_dir, output_dir)
+    make_corpus_lib.write_corpus_manifest(relative_paths, output_dir, default_args)
+
+
+def entrypoint():
+    """Runs main() through absl's app.run()."""
+    app.run(main)
+
+
+if __name__ == "__main__":
+    entrypoint()

diff  --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py
new file mode 100644
index 000000000000000..697c97ebf6ee29b
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py
@@ -0,0 +1,77 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Library functions for making a corpus from arbitrary bitcode."""
+
+import pathlib
+import os
+import shutil
+import json
+
+from typing import List, Optional
+
+BITCODE_EXTENSION = ".bc"
+
+
+def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
+    """Finds bitcode files to extract from a given directory.
+
+    Args:
+      bitcode_base_dir: The base directory where the bitcode to be copied
+        is from. It is searched recursively for files ending in
+        BITCODE_EXTENSION.
+
+    Returns an array of paths, each the relative path to a bitcode file
+    from the base directory with the BITCODE_EXTENSION suffix removed.
+    """
+    paths = [
+        str(p)[: -len(BITCODE_EXTENSION)]
+        for p in pathlib.Path(bitcode_base_dir).glob("**/*" + BITCODE_EXTENSION)
+    ]
+
+    return [os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths]
+
+
+def copy_bitcode(
+    relative_paths: List[str], bitcode_base_dir: str, output_dir: str
+) -> None:
+    """Copies bitcode files from the base directory to the output directory.
+
+    Args:
+      relative_paths: An array of relative paths to bitcode files that are copied
+        over to the output directory, preserving relative location.
+      bitcode_base_dir: The base directory where the bitcode is located.
+      output_dir: The output directory to place the bitcode in.
+    """
+    for relative_path in relative_paths:
+        base_path = os.path.join(bitcode_base_dir, relative_path + BITCODE_EXTENSION)
+        destination_path = os.path.join(output_dir, relative_path + BITCODE_EXTENSION)
+        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+        shutil.copy(base_path, destination_path)
+
+
+def write_corpus_manifest(
+    relative_output_paths: List[str],
+    output_dir: str,
+    default_args: Optional[List[str]] = None,
+) -> None:
+    """Creates a corpus manifest describing the bitcode that has been found.
+
+    Args:
+      relative_output_paths: A list of paths to each bitcode file relative to the
+        output directory.
+      output_dir: The output directory where the corpus is being created.
+      default_args: An array of compiler flags that should be used to compile
+        the bitcode when using further downstream tooling."""
+    if default_args is None:
+        default_args = []
+    corpus_description = {
+        "global_command_override": default_args,
+        "has_thinlto": False,
+        "modules": [path for path in relative_output_paths if path is not None],
+    }
+
+    with open(
+        os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8"
+    ) as description_file:
+        json.dump(corpus_description, description_file, indent=2)

diff  --git a/llvm/utils/mlgo-utils/pyproject.toml b/llvm/utils/mlgo-utils/pyproject.toml
new file mode 100644
index 000000000000000..be2af86cd05df30
--- /dev/null
+++ b/llvm/utils/mlgo-utils/pyproject.toml
@@ -0,0 +1,25 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "mlgo"
+description = "Tooling for ML in LLVM"
+readme = "README.md"
+requires-python = ">=3.8,<3.11"
+dependencies = [
+  "absl-py>=1.0.0"
+]
+dynamic = ["version"]
+license = {text = "Apache-2.0 WITH LLVM-exception"}
+classifiers = [
+  "License :: OSI Approved :: Apache Software License"
+]
+
+[tool.setuptools.dynamic]
+version = {attr = "mlgo.__version__"}
+
+[project.scripts]
+combine_training_corpus = "mlgo.corpus.combine_training_corpus:entrypoint"
+extract_ir = "mlgo.corpus.extract_ir:entrypoint"
+make_corpus = "mlgo.corpus.make_corpus:entrypoint"

diff  --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py
new file mode 100644
index 000000000000000..dbfdc78a87533b5
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py
@@ -0,0 +1,132 @@
+# REQUIRES: python-38, absl
+
+## Test the functionality of combine_training_corpus_lib
+
+import json
+import os
+import sys
+
+from mlgo.corpus import combine_training_corpus_lib
+
+## Test that combining two training corpora works as expected
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subcorpus1
+# RUN: mkdir %t.dir/subcorpus2
+# RUN: %python %s test_combine_corpus %t.dir | FileCheck %s --check-prefix CHECK-COMBINE-CORPUS
+
+
+def test_combine_corpus(corpus_dir):
+    subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1")
+    subcorpus2_dir = os.path.join(corpus_dir, "subcorpus2")
+    subcorpus1_description = {
+        "has_thinlto": False,
+        "modules": ["test1.o", "test2.o"],
+    }
+    subcorpus2_description = {
+        "has_thinlto": False,
+        "modules": ["test3.o", "test4.o"],
+    }
+    with open(
+        os.path.join(subcorpus1_dir, "corpus_description.json"), "w"
+    ) as corpus1_description_handle:
+        json.dump(subcorpus1_description, corpus1_description_handle)
+    with open(
+        os.path.join(subcorpus2_dir, "corpus_description.json"), "w"
+    ) as corpus2_description_handle:
+        json.dump(subcorpus2_description, corpus2_description_handle)
+    combine_training_corpus_lib.combine_corpus(corpus_dir)
+    with open(
+        os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+    ) as combined_corpus_description_file:
+        combined_corpus_description = json.load(combined_corpus_description_file)
+    print(combined_corpus_description["has_thinlto"])
+    # CHECK-COMBINE-CORPUS: False
+    for module in sorted(combined_corpus_description["modules"]):
+        print(module)
+    # CHECK-COMBINE-CORPUS: subcorpus1/test1.o
+    # CHECK-COMBINE-CORPUS: subcorpus1/test2.o
+    # CHECK-COMBINE-CORPUS: subcorpus2/test3.o
+    # CHECK-COMBINE-CORPUS: subcorpus2/test4.o
+
+
+## Test that we handle the empty folder case gracefully
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subcorpus1
+# RUN: mkdir %t.dir/empty_dir
+# RUN: %python %s test_empty_folder %t.dir | FileCheck %s --check-prefix CHECK-EMPTY-DIR
+
+
+def test_empty_folder(corpus_dir):
+    subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1")
+    subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
+    with open(
+        os.path.join(subcorpus1_dir, "corpus_description.json"), "w"
+    ) as subcorpus1_description_handle:
+        json.dump(subcorpus1_description, subcorpus1_description_handle)
+    combine_training_corpus_lib.combine_corpus(corpus_dir)
+    with open(
+        os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+    ) as combined_corpus_description_file:
+        combined_corpus_description = json.load(combined_corpus_description_file)
+    print(len(combined_corpus_description["modules"]))
+    # CHECK-EMPTY-DIR: 2
+
+
+## Test that we ignore extra files that will not end up contributing to the
+## corpus.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subcorpus1
+# RUN: touch %t.dir/empty.log
+# RUN: %python %s test_ignore_extra_file %t.dir | FileCheck %s --check-prefix CHECK-IGNORE-EXTRA-FILE
+
+
+def test_ignore_extra_file(corpus_dir):
+    subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1")
+    subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
+    with open(
+        os.path.join(subcorpus1_dir, "corpus_description.json"), "w"
+    ) as subcorpus1_description_handle:
+        json.dump(subcorpus1_description, subcorpus1_description_handle)
+    combine_training_corpus_lib.combine_corpus(corpus_dir)
+    with open(
+        os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+    ) as combined_corpus_description_file:
+        combined_corpus_description = json.load(combined_corpus_description_file)
+    print(len(combined_corpus_description["modules"]))
+    # CHECK-IGNORE-EXTRA-FILE: 2
+
+
+## Test that we raise an error in the case where the corpora differ in a
+## substantial way.
+
+# RUN: rm -rf  %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subcorpus1
+# RUN: mkdir %t.dir/subcorpus2
+# RUN: %python %s test_different_corpora %t.dir | FileCheck %s --check-prefix CHECK-DIFFERENT-CORPORA
+
+
+def test_different_corpora(corpus_dir):
+    subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1")
+    subcorpus2_dir = os.path.join(corpus_dir, "subcorpus2")
+    subcorpus1_description = {"has_thinlto": False, "modules": ["test1.o"]}
+    subcorpus2_description = {"has_thinlto": True, "modules": ["test2.o"]}
+    with open(
+        os.path.join(subcorpus1_dir, "corpus_description.json"), "w"
+    ) as subcorpus1_description_handle:
+        json.dump(subcorpus1_description, subcorpus1_description_handle)
+    with open(
+        os.path.join(subcorpus2_dir, "corpus_description.json"), "w"
+    ) as subcorpus2_description_handle:
+        json.dump(subcorpus2_description, subcorpus2_description_handle)
+    try:
+        combine_training_corpus_lib.combine_corpus(corpus_dir)
+    except ValueError:
+        print("ValueError")
+        # CHECK-DIFFERENT-CORPORA: ValueError
+
+
+if __name__ == "__main__":
+    globals()[sys.argv[1]](*sys.argv[2:])

diff  --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
new file mode 100644
index 000000000000000..3ed52ec21de953c
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
@@ -0,0 +1,339 @@
+# REQUIRES: python-38, absl
+
+## Test the functionality of extract_ir_lib
+
+import os.path
+import sys
+
+from mlgo.corpus import extract_ir_lib
+
+## Test that we can convert a compilation database with a single compilation
+## command in it.
+
+# RUN: %python %s test_one_conversion | FileCheck %s --check-prefix CHECK-ONE-CONVERSION
+
+
+def test_one_conversion():
+    obj = extract_ir_lib.convert_compile_command_to_objectfile(
+        {
+            "directory": "/output/directory",
+            "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
+            "file": "/some/path/lib/foo/bar.cc",
+        },
+        "/corpus/destination/path",
+    )
+    print(obj.input_obj())
+    # CHECK-ONE-CONVERSION: /output/directory/lib/bar.o
+    print(obj.relative_output_path())
+    # CHECK-ONE-CONVERSION: lib/bar.o
+    print(obj.cmd_file())
+    # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.cmd
+    print(obj.bc_file())
+    # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.bc
+    print(obj.thinlto_index_file())
+    # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.thinlto.bc
+
+
+## Test that we can convert an arguments style compilation database
+
+# RUN: %python %s test_one_conversion_arguments_style | FileCheck %s --check-prefix CHECK-ARGUMENTS-STYLE
+
+
+def test_one_conversion_arguments_style():
+    obj = extract_ir_lib.convert_compile_command_to_objectfile(
+        {
+            "directory": "/output/directory",
+            "arguments": [
+                "-cc1",
+                "-c",
+                "/some/path/lib/foo/bar.cc",
+                "-o",
+                "lib/bar.o",
+            ],
+            "file": "/some/path/lib/foo/bar.cc",
+        },
+        "/corpus/destination/path",
+    )
+    print(obj.input_obj())
+    # CHECK-ARGUMENTS-STYLE: /output/directory/lib/bar.o
+    print(obj.relative_output_path())
+    # CHECK-ARGUMENTS-STYLE: lib/bar.o
+    print(obj.cmd_file())
+    # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.cmd
+    print(obj.bc_file())
+    # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.bc
+    print(obj.thinlto_index_file())
+    # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.thinlto.bc
+
+
+## Test that converting multiple files works as well
+
+# RUN: %python %s test_multiple_conversion | FileCheck %s --check-prefix CHECK-MULTIPLE-CONVERSION
+
+
+def test_multiple_conversion():
+    res = extract_ir_lib.load_from_compile_commands(
+        [
+            {
+                "directory": "/output/directory",
+                "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
+                "file": "/some/path/lib/foo/bar.cc",
+            },
+            {
+                "directory": "/output/directory",
+                "command": "-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o",
+                "file": "/some/path/lib/foo/baz.cc",
+            },
+        ],
+        "/corpus/destination/path",
+    )
+    res = list(res)
+    print(res[0].input_obj())
+    # CHECK-MULTIPLE-CONVERSION: /output/directory/lib/bar.o
+    print(res[0].relative_output_path())
+    # CHECK-MULTIPLE-CONVERSION: lib/bar.o
+    print(res[0].cmd_file())
+    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.cmd
+    print(res[0].bc_file())
+    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.bc
+    print(res[0].thinlto_index_file())
+    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.thinlto.bc
+
+    print(res[1].input_obj(), "/output/directory/lib/other/baz.o")
+    # CHECK-MULTIPLE-CONVERSION: /output/directory/lib/other/baz.o
+    print(res[1].relative_output_path(), "lib/other/baz.o")
+    # CHECK-MULTIPLE-CONVERSION: lib/other/baz.o
+    print(res[1].cmd_file())
+    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.cmd
+    print(res[1].bc_file())
+    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.bc
+    print(res[1].thinlto_index_file())
+    # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.thinlto.bc
+
+
+## Test that we generate the correct objcopy commands for extracting commands
+
+# RUN: %python %s test_command_extraction | FileCheck %s --check-prefix CHECK-COMMAND-EXTRACT
+
+
+def test_command_extraction():
+    obj = extract_ir_lib.TrainingIRExtractor(
+        obj_relative_path="lib/obj_file.o",
+        output_base_dir="/where/corpus/goes",
+        obj_base_dir="/foo/bar",
+    )
+    extraction_cmd1 = obj._get_extraction_cmd_command(
+        "/bin/llvm_objcopy_path", ".llvmcmd"
+    )
+    for part in extraction_cmd1:
+        print(part)
+    # CHECK-COMMAND-EXTRACT: /bin/llvm_objcopy_path
+    # CHECK-COMMAND-EXTRACT: --dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd
+    # CHECK-COMMAND-EXTRACT: /foo/bar/lib/obj_file.o
+    # CHECK-COMMAND-EXTRACT: /dev/null
+
+    extraction_cmd2 = obj._get_extraction_bc_command(
+        "/bin/llvm_objcopy_path", ".llvmbc"
+    )
+    for part in extraction_cmd2:
+        print(part)
+    # CHECK-COMMAND-EXTRACT: /bin/llvm_objcopy_path
+    # CHECK-COMMAND-EXTRACT: --dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc
+    # CHECK-COMMAND-EXTRACT: /foo/bar/lib/obj_file.o
+    # CHECK-COMMAND-EXTRACT: /dev/null
+
+
+## Test that we generate the correct extraction commands without specifying
+## an output base directory.
+
+# RUN: %python %s test_command_extraction_no_basedir | FileCheck %s --check-prefix CHECK-COMMAND-EXTRACT-NOBASEDIR
+
+
+def test_command_extraction_no_basedir():
+    obj = extract_ir_lib.TrainingIRExtractor("lib/obj_file.o", "/where/corpus/goes")
+    extraction_cmd1 = obj._get_extraction_cmd_command(
+        "/bin/llvm_objcopy_path", ".llvmcmd"
+    )
+    for part in extraction_cmd1:
+        print(part)
+    # CHECK-COMMAND-EXTRACT-NOBASEDIR: /bin/llvm_objcopy_path
+    # CHECK-COMMAND-EXTRACT-NOBASEDIR: --dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd
+    # CHECK-COMMAND-EXTRACT-NOBASEDIR: lib/obj_file.o
+    # CHECK-COMMAND-EXTRACT-NOBASEDIR: /dev/null
+
+    extraction_cmd2 = obj._get_extraction_bc_command(
+        "/bin/llvm_objcopy_path", ".llvmbc"
+    )
+    for part in extraction_cmd2:
+        print(part)
+    # CHECK-COMMAND-EXTRACT-NOBASEDIR: /bin/llvm_objcopy_path
+    # CHECK-COMMAND-EXTRACT-NOBASEDIR: --dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc
+    # CHECK-COMMAND-EXTRACT-NOBASEDIR: lib/obj_file.o
+    # CHECK-COMMAND-EXTRACT-NOBASEDIR: /dev/null
+
+
+## Test that we can extract a corpus from lld parameters
+
+# RUN: %python %s test_lld_params | FileCheck %s --check-prefix CHECK-LLD-PARAMS
+
+
+def test_lld_params():
+    lld_opts = [
+        "-o",
+        "output/dir/exe",
+        "lib/obj1.o",
+        "somelib.a",
+        "-W,blah",
+        "lib/dir/obj2.o",
+    ]
+    obj = extract_ir_lib.load_from_lld_params(lld_opts, "/some/path", "/tmp/out")
+    print(obj[0].input_obj())
+    # CHECK-LLD-PARAMS: /some/path/lib/obj1.o
+    print(obj[0].relative_output_path())
+    # CHECK-LLD-PARAMS: lib/obj1.o
+    print(obj[0].cmd_file())
+    # CHECK-LLD-PARAMS: /tmp/out/lib/obj1.o.cmd
+    print(obj[0].thinlto_index_file())
+    # CHECK-LLD-PARAMS: /tmp/out/lib/obj1.o.thinlto.bc
+    print(obj[1].input_obj())
+    # CHECK-LLD-PARAMS: /some/path/lib/dir/obj2.o
+
+
+## Test that we can load a corpus from a directory containing object files
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subdir
+# RUN: touch %t.dir/subdir/test1.o
+# RUN: touch %t.dir/subdir/test2.o
+# RUN: %python %s test_load_from_directory %t.dir | FileCheck %s --check-prefix CHECK-LOAD-DIR
+
+
+def test_load_from_directory(tempdir):
+    objs = extract_ir_lib.load_from_directory(tempdir, "/output")
+    for index, obj in enumerate(sorted(objs, key=lambda x: x._obj_relative_path)):
+        print(obj._obj_relative_path, f"subdir/test{index + 1:d}.o")
+        # CHECK-LOAD-DIR: subdir/test1.o
+        # Explicitly check for equality here as we can not check within
+        # FileCheck the exact value as lit substitutions do not work in
+        # FileCheck lines.
+        print(obj._obj_base_dir == tempdir)
+        # CHECK-LOAD-DIR: True
+        print(obj._output_base_dir)
+        # CHECK-LOAD-DIR: /output
+
+
+## Test that we can load a corpus in the lld thinLTO case
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: touch %t.dir/1.3.import.bc
+# RUN: touch %t.dir/2.3.import.bc
+# RUN: touch %t.dir/3.3.import.bc
+# RUN: touch %t.dir/1.thinlto.bc
+# RUN: touch %t.dir/2.thinlto.bc
+# RUN: touch %t.dir/3.thinlto.bc
+# RUN: %python %s test_lld_thinlto_discovery %t.dir | FileCheck %s --check-prefix CHECK-LLD-THINLTO-DISCOVERY
+
+
+def test_lld_thinlto_discovery(tempdir):
+    obj = extract_ir_lib.load_for_lld_thinlto(tempdir, "/output")
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+        print(o._obj_relative_path)
+        # Explicitly check for equality as we can not check within FileCheck
+        # using the lit substitution for the temp dir
+        print(o._obj_base_dir == tempdir)
+        print(o._output_base_dir)  # outdir
+    # CHECK-LLD-THINLTO-DISCOVERY: 1
+    # CHECK-LLD-THINLTO-DISCOVERY: True
+    # CHECK-LLD-THINLTO-DISCOVERY: /output
+    # CHECK-LLD-THINLTO-DISCOVERY: 2
+    # CHECK-LLD-THINLTO-DISCOVERY: True
+    # CHECK-LLD-THINLTO-DISCOVERY: /output
+    # CHECK-LLD-THINLTO-DISCOVERY: 3
+    # CHECK-LLD-THINLTO-DISCOVERY: True
+    # CHECK-LLD-THINLTO-DISCOVERY: /output
+
+
+## Test that we can load a corpus in the nested lld thinLTO case
+
+# RUN: mkdir %t.dir/nest
+# RUN: mv %t.dir/*.bc %t.dir/nest
+# RUN: %python %s test_lld_thinlto_discovery_nested %t.dir | FileCheck %s --check-prefix CHECK-LLD-THINLTO-DISCOVERY-NESTED
+
+
+def test_lld_thinlto_discovery_nested(outer):
+    obj = extract_ir_lib.load_for_lld_thinlto(outer, "/output")
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+        print(o._obj_relative_path)
+        print(o._obj_base_dir == outer)
+        print(o._output_base_dir)
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/1
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/2
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/3
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+
+
+## Test the lld extraction works as expected
+
+# RUN: rm -rf  %t.dir.out && mkdir %t.dir.out
+# RUN: %python %s test_lld_thinlto_extraction %t.dir %t.dir.out | FileCheck %s --check-prefix CHECK-LLD-THINLTO-EXTRACTION-PY
+# ls %t.dir.out/nest | FileChceck %s --check-prefix CHECK-LLD-THINLTO-EXTRACTION
+
+# CHECK-LLD-THINLTO-EXTRACTION: 1
+# CHECK-LLD-THINLTO-EXTRACTION: 2
+# CHECK-LLD-THINLTO-EXTRACTION: 3
+# CHECK-LLD-THINLTO-EXTRACTION: 1.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 2.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 3.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 1.thinlto.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 2.thinlto.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 3.thinlto.bc
+
+
+def test_lld_thinlto_extraction(outer, outdir):
+    obj = extract_ir_lib.load_for_lld_thinlto(outer, outdir)
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+        mod_path = o.extract(thinlto_build="local")
+        print(mod_path)
+    # CHECK-LLD-THINLTO-EXTRACTION-PY: 1
+    # CHECK-LLD-THINLTO-EXTRACTION-PY: 2
+    # CHECK-LLD-THINLTO-EXTRACTION-PY: 3
+
+
+## Test that filtering works correctly
+
+# RUN: %python %s test_filtering | FileCheck %s --check-prefix CHECK-TEST-FILTERING
+
+
+def test_filtering():
+    cmdline = "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o"
+    print(extract_ir_lib.should_include_module(cmdline, None))
+    # CHECK-TEST-FILTERING: True
+    print(extract_ir_lib.should_include_module(cmdline, ".*"))
+    # CHECK-TEST-FILTERING: True
+    print(extract_ir_lib.should_include_module(cmdline, "^-Oz$"))
+    # CHECK-TEST-FILTERING: True
+    print(extract_ir_lib.should_include_module(cmdline, "^-O3$"))
+    # CHECK-TEST-FILTERING: False
+
+
+## Test that we extract the thinLTO index correctly
+
+# RUN: %python %s test_thinlto_index_extractor | FileCheck %s --check-prefix CHECK-THINLTO-INDEX-EXTRACTOR
+
+
+def test_thinlto_index_extractor():
+    cmdline = (
+        "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/"
+        "out.o\0-fthinlto-index=foo/bar.thinlto.bc"
+    )
+    print(extract_ir_lib.get_thinlto_index(cmdline, "/the/base/dir"))
+    # CHECK-THINLTO-INDEX-EXTRACTOR: /the/base/dir/foo/bar.thinlto.bc
+
+
+if __name__ == "__main__":
+    globals()[sys.argv[1]](*sys.argv[2:])

diff  --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py
new file mode 100644
index 000000000000000..0f970414b1aecde
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py
@@ -0,0 +1,71 @@
+# REQUIRES: python-38, absl
+
+## Test the functionality of make_corpus_lib
+
+import json
+import os
+import sys
+
+from mlgo.corpus import make_corpus_lib
+
+## Test that when we load the bitcode from a directory using the
+## load_bitcode_from_directory function, we get the expected results.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/nested
+# RUN: touch %t.dir/nested/test1.bc
+# RUN: touch %t.dir/nested/test2.bc
+# RUN: %python %s test_load_bitcode_from_directory %t.dir | FileCheck %s --check-prefix CHECK-LOAD
+
+
+def test_load_bitcode_from_directory(work_dir):
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(work_dir)
+    relative_paths = sorted(relative_paths)
+    for relative_path in relative_paths:
+        print(relative_path)
+    # CHECK-LOAD: nested/test1
+    # CHECK-LOAD: nested/test2
+
+
+## Test that when we copy the bitcode given a list of relative paths, the
+## appropriate files are copied over.
+
+# RUN: rm -rf %t.dir1 && mkdir %t.dir1
+# RUN: %python %s test_copy_bitcode %t.dir %t.dir1
+# RUN: ls %t.dir1/nested | FileCheck %s --check-prefix CHECK-COPY
+
+# CHECK-COPY: test1.bc
+# CHECK-COPY: test2.bc
+
+
+def test_copy_bitcode(directory, output_dir):
+    relative_paths = ["nested/test1", "nested/test2"]
+    make_corpus_lib.copy_bitcode(relative_paths, directory, output_dir)
+
+
+## Test that we get the expected corpus manifest when writing a corpus
+## manifest to the specified directory.
+
+# RUN: %python %s test_write_corpus_manifest %t.dir1 | FileCheck %s --check-prefix CHECK-MANIFEST
+
+
+def test_write_corpus_manifest(output_dir):
+    relative_output_paths = ["test/test1", "test/test2"]
+    default_args = ["-O3", "-c"]
+    make_corpus_lib.write_corpus_manifest(
+        relative_output_paths, output_dir, default_args
+    )
+    with open(
+        os.path.join(output_dir, "corpus_description.json"), encoding="utf-8"
+    ) as corpus_description_file:
+        corpus_description = json.load(corpus_description_file)
+    print(corpus_description["global_command_override"])
+    # CHECK-MANIFEST: ['-O3', '-c']
+    print(corpus_description["has_thinlto"])
+    # CHECK-MANIFEST: False
+    print(corpus_description["modules"])
+    # CHECK-MANIFEST: ['test/test1', 'test/test2']
+
+
+if __name__ == "__main__":
+    globals()[sys.argv[1]](*sys.argv[2:])

diff  --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg
new file mode 100644
index 000000000000000..055f0945942fc1c
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/lit.cfg
@@ -0,0 +1,15 @@
+import lit.formats
+
+from lit.llvm import llvm_config
+
+config.name = "mlgo-utils"
+config.test_format = lit.formats.ShTest(execute_external=False)
+
+config.suffixes = [".py"]
+
+config.test_source_root = os.path.dirname(__file__)
+config.test_exec_root = config.obj_root
+
+config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-utils")
+
+llvm_config.use_default_substitutions()

diff  --git a/llvm/utils/mlgo-utils/tests/lit.local.cfg b/llvm/utils/mlgo-utils/tests/lit.local.cfg
new file mode 100644
index 000000000000000..90cdf8ba618ed8f
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/lit.local.cfg
@@ -0,0 +1,14 @@
+import sys
+
+# TODO(boomanaiden154): Remove this flag once the minimum Python version for
+# the entire project has been bumped to 3.8.
+if sys.version_info > (3,8):
+    config.available_features.add("python-38")
+
+# TODO(boomanaiden154): Remove this flag once the scripts are converted to
+# not use absl anymore.
+try:
+    import absl
+    config.available_features.add("absl")
+except:
+    pass

diff  --git a/llvm/utils/mlgo-utils/tests/lit.site.cfg.in b/llvm/utils/mlgo-utils/tests/lit.site.cfg.in
new file mode 100644
index 000000000000000..22e1524e6a8fd20
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/lit.site.cfg.in
@@ -0,0 +1,10 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+config.src_root = "@LLVM_SOURCE_DIR@"
+config.obj_root = "@LLVM_BINARY_DIR@"
+config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
+
+import lit.llvm
+lit.llvm.initialize(lit_config, config)
+
+lit_config.load_config(config, "@LLVM_SOURCE_DIR@/utils/mlgo-utils/tests/lit.cfg")