[clang-tools-extra] [llvm] [MLGO] Upstream the corpus extraction tooling (PR #72319)

Sun Jan 14 21:03:16 PST 2024

https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/72319

>From c3f723c8a975cc5e075d56350645b0be486f3cda Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Tue, 14 Nov 2023 14:20:24 -0800
Subject: [PATCH] [MLGO] Upstream the corpus extraction tooling

---
 llvm/py/Pyproject.toml                        |   1 +
 llvm/py/src/mlgo/combine_training_corpus.py   |  55 +++
 .../src/mlgo/combine_training_corpus_lib.py   |  50 +++
 .../src/mlgo/combine_training_corpus_test.py  | 104 +++++
 llvm/py/src/mlgo/extract_ir.py                | 142 +++++++
 llvm/py/src/mlgo/extract_ir_lib.py            | 373 ++++++++++++++++++
 llvm/py/src/mlgo/extract_ir_test.py           | 231 +++++++++++
 llvm/py/src/mlgo/make_corpus.py               |  58 +++
 llvm/py/src/mlgo/make_corpus_lib.py           |  90 +++++
 llvm/py/src/mlgo/make_corpus_test.py          |  66 ++++
 10 files changed, 1170 insertions(+)
 create mode 100644 llvm/py/Pyproject.toml
 create mode 100644 llvm/py/src/mlgo/combine_training_corpus.py
 create mode 100644 llvm/py/src/mlgo/combine_training_corpus_lib.py
 create mode 100644 llvm/py/src/mlgo/combine_training_corpus_test.py
 create mode 100644 llvm/py/src/mlgo/extract_ir.py
 create mode 100644 llvm/py/src/mlgo/extract_ir_lib.py
 create mode 100644 llvm/py/src/mlgo/extract_ir_test.py
 create mode 100644 llvm/py/src/mlgo/make_corpus.py
 create mode 100644 llvm/py/src/mlgo/make_corpus_lib.py
 create mode 100644 llvm/py/src/mlgo/make_corpus_test.py

diff --git a/llvm/py/Pyproject.toml b/llvm/py/Pyproject.toml
new file mode 100644
index 00000000000000..dcf2c804da5e19
--- /dev/null
+++ b/llvm/py/Pyproject.toml
@@ -0,0 +1 @@
+# Placeholder
diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py
new file mode 100644
index 00000000000000..94ee1cbac9cea4
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""Combine multiple training corpora into a single training corpus.
+
+Currently only supports the case where the corpora share the same
+configurables except for the "modules" field.
+
+Usage: to combine the training corpora corpus1 and corpus2 into
+combinedcorpus, first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+from absl import app
+from absl import flags
+
+from compiler_opt.tools import combine_training_corpus_lib
+
+flags.DEFINE_string('root_dir', '', 'root dir of module paths to combine.')
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py
new file mode 100644
index 00000000000000..0359961266a240
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library for combining training corpora."""
+
+import os
+import json
+
+from absl import logging
+
+import tensorflow as tf
+
+_FILE_NAME = 'corpus_description.json'
+
+
+def combine_corpus(root_dir: str) -> None:
+  module_names = []
+  output_corpus_description = {}
+
+  corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME)
+  for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+    logging.info('processing %s', corpus_description_path)
+
+    with tf.io.gfile.GFile(corpus_description_path, 'r') as f:
+      corpus_description = json.load(f)
+      sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+      module_names.extend([
+          os.path.join(sub_dir, name) for name in corpus_description['modules']
+      ])
+      del corpus_description['modules']
+      if len(output_corpus_description) == 0:
+        output_corpus_description = corpus_description
+      elif corpus_description != output_corpus_description:
+        raise ValueError('Input corpora differ by more than modules.')
+
+  output_corpus_description['modules'] = module_names
+
+  with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), 'w') as f:
+    json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py
new file mode 100644
index 00000000000000..47dd602967b68f
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus_test.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for combining training corpora."""
+
+import json
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools import combine_training_corpus_lib
+
+
+class CombineTrainingCorpusTest(absltest.TestCase):
+
+  def test_combine_corpus(self):
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
+    subcorpus1_description = {
+        'has_thinlto': False,
+        'modules': ['test1.o', 'test2.o']
+    }
+    subcorpus2_description = {
+        'has_thinlto': False,
+        'modules': ['test3.o', 'test4.o']
+    }
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus2_description_file = subcorpus2_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    with open(
+        os.path.join(corpus_dir, 'corpus_description.json'),
+        encoding='utf-8') as combined_corpus_description_file:
+      combined_corpus_description = json.load(combined_corpus_description_file)
+    self.assertEqual(combined_corpus_description['has_thinlto'], False)
+    self.assertLen(combined_corpus_description['modules'], 4)
+    self.assertIn('subcorpus1/test1.o', combined_corpus_description['modules'])
+    self.assertIn('subcorpus1/test2.o', combined_corpus_description['modules'])
+    self.assertIn('subcorpus2/test3.o', combined_corpus_description['modules'])
+    self.assertIn('subcorpus2/test4.o', combined_corpus_description['modules'])
+
+  def test_empty_folder(self):
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    _ = corpus_dir.mkdir(dir_path='empty_dir')
+    subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    with open(
+        os.path.join(corpus_dir, 'corpus_description.json'),
+        encoding='utf-8') as combined_corpus_description_file:
+      combined_corpus_description = json.load(combined_corpus_description_file)
+    self.assertLen(combined_corpus_description['modules'], 2)
+
+  def test_ignore_extra_file(self):
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    _ = corpus_dir.create_file(file_path='empty.log')
+    subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    with open(
+        os.path.join(corpus_dir, 'corpus_description.json'),
+        encoding='utf-8') as combined_corpus_description_file:
+      combined_corpus_description = json.load(combined_corpus_description_file)
+    self.assertLen(combined_corpus_description['modules'], 2)
+
+  def test_different_corpora(self):
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
+    subcorpus1_description = {'has_thinlto': False, 'modules': ['test1.o']}
+    subcorpus2_description = {'has_thinlto': True, 'modules': ['test2.o']}
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus2_description_file = subcorpus2_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+    self.assertRaises(ValueError, combine_training_corpus_lib.combine_corpus,
+                      corpus_dir.full_path)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py
new file mode 100644
index 00000000000000..2a1ef3978888d6
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir.py
@@ -0,0 +1,142 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, pass an integer representing the desired
+verbosity to the --verbosity flag. Use 0 for all logs, status information,
+and detailed debug information, -1 for solely warnings, and -2 to not produce
+any output.
+"""
+
+import json
+import multiprocessing
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from compiler_opt.tools import extract_ir_lib
+
+flags.DEFINE_string(
+    'input', None,
+    'Input file or directory - either compile_commands.json, a linker parameter '
+    'list, or a path to a directory containing object files.')
+flags.DEFINE_enum(
+    'input_type', 'json', ['json', 'params', 'directory'],
+    'Input file type - json, params, or directory. params latter refers to lld '
+    'params.')
+flags.DEFINE_string('output_dir', None, 'Output directory')
+flags.DEFINE_integer(
+    'num_workers', None,
+    'Number of parallel workers for objcopy. `None` for maximum available.')
+flags.DEFINE_string('llvm_objcopy_path', 'llvm-objcopy', 'Path to llvm-objcopy')
+flags.DEFINE_string(
+    'obj_base_dir', '',
+    'Base directory for object files. Defaults to current working dir.')
+flags.DEFINE_string(
+    'cmd_filter', None,
+    'Include only those modules with a command line matching this regexp. '
+    'Setting it to None for not filtering. Note that the regexp is applied '
+    'independently for each separate command line option. For example, ^-Oz$ '
+    'will match Oz - built binaries. Does not work with thinlto_build=local.')
+flags.DEFINE_enum(
+    'thinlto_build', None, ['distributed', 'local'],
+    'Set if the build was performed with either \'distributed\' or '
+    '\'local\' ThinLTO. This ensures the thinlto.bc files are also copied. '
+    'The build is assumed to have had '
+    '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed '
+    'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files '
+    'passed in the local case.')
+flags.DEFINE_string(
+    'cmd_section_name', '.llvmcmd',
+    'The section name passed to llvm-objcopy. For ELF object files, the '
+    'default .llvmcmd is correct. For Mach-O object files, one should use '
+    'something like __LLVM,__cmdline')
+flags.DEFINE_string(
+    'bitcode_section_name', '.llvmbc',
+    'The section name passed to llvm-objcopy. For ELF object files, the '
+    'default .llvmbc is correct. For Mach-O object files, one should use '
+    '__LLVM,__bitcode')
+
+flags.mark_flag_as_required('output_dir')
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  objs = []
+  if FLAGS.input is not None and FLAGS.thinlto_build == 'local':
+    raise ValueError('--thinlto_build=local cannot be run with --input')
+  if FLAGS.input is None:
+    if FLAGS.thinlto_build != 'local':
+      raise ValueError('--input or --thinlto_build=local must be provided')
+    objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir,
+                                               FLAGS.output_dir)
+  elif FLAGS.input_type == 'json':
+    with open(FLAGS.input, encoding='utf-8') as f:
+      objs = extract_ir_lib.load_from_compile_commands(
+          json.load(f), FLAGS.output_dir)
+  elif FLAGS.input_type == 'params':
+    if not FLAGS.obj_base_dir:
+      logging.info(
+          '-obj_base_dir is unspecified, assuming current directory. '
+          'If no objects are found, use this option to specify the root '
+          'directory for the object file paths in the input file.')
+    with open(FLAGS.input, encoding='utf-8') as f:
+      objs = extract_ir_lib.load_from_lld_params(
+          [l.strip() for l in f.readlines()], FLAGS.obj_base_dir,
+          FLAGS.output_dir)
+  elif FLAGS.input_type == 'directory':
+    logging.warning(
+        'Using the directory input is only recommended if the build system '
+        'your project uses does not support any structured output that '
+        'ml-compiler-opt understands. If your build system provides a '
+        'structured compilation database, use that instead')
+    objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+  else:
+    logging.error('Unknown input type: %s', FLAGS.input_type)
+
+  relative_output_paths = extract_ir_lib.run_extraction(
+      objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
+      FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name)
+
+  extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build,
+                                       relative_output_paths, FLAGS.output_dir)
+
+  logging.info('Converted %d files out of %d',
+               len(objs) - relative_output_paths.count(None), len(objs))
+
+
+if __name__ == '__main__':
+  multiprocessing.set_start_method('fork')
+  app.run(main)
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py
new file mode 100644
index 00000000000000..c1d2a54b9a9e7c
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir_lib.py
@@ -0,0 +1,373 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library functions for IR extraction."""
+
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import multiprocessing
+import functools
+import json
+
+from typing import Dict, List, Optional
+
+from absl import logging
+
+from compiler_opt.rl import constant
+
+
+# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
+# a \0 - separated list of strings, to a \n one.
+def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
+  """Determine if the module should be included."""
+  if match_regexp is None:
+    return True
+  lines = cmdline.split('\0')
+  return any(len(re.findall(match_regexp, l)) for l in lines)
+
+
+def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
+  opts = cmdline.split('\0')
+  for option in opts:
+    if option.startswith('-fthinlto-index'):
+      return os.path.join(basedir, option.split('=')[1])
+  return None
+
+
+class TrainingIRExtractor:
+  """IR and command line extraction from an object file."""
+
+  def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
+    """Set up a TrainingIRExtractor.
+
+    Args:
+      obj_relative_path: relative path to the input object file. It will be also
+        used to construct the absolute path of the output IR and cmd files, by
+        appending it to output_base_dir.
+      output_base_dir: the directory under which the output will be produced.
+      obj_base_dir: the base directory for all the input object files.
+    """
+    self._obj_relative_path = obj_relative_path
+    self._output_base_dir = output_base_dir
+    self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''
+
+  def obj_base_dir(self):
+    return self._obj_base_dir
+
+  def output_base_dir(self):
+    return self._output_base_dir
+
+  def relative_output_path(self):
+    return self._obj_relative_path
+
+  def input_obj(self):
+    return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+
+  def lld_src_bc(self):
+    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+    # IR bitcode saved by lld. It is hardcoded into lld.
+    return os.path.join(self._obj_base_dir,
+                        self._obj_relative_path + '.3.import.bc')
+
+  def lld_src_thinlto(self):
+    return os.path.join(self._obj_base_dir,
+                        self._obj_relative_path + '.thinlto.bc')
+
+  def dest_dir(self):
+    return os.path.join(self.output_base_dir(),
+                        os.path.dirname(self._obj_relative_path))
+
+  def module_name(self):
+    return os.path.basename(self._obj_relative_path)
+
+  def cmd_file(self):
+    return os.path.join(self.dest_dir(), self.module_name() + '.cmd')
+
+  def bc_file(self):
+    return os.path.join(self.dest_dir(), self.module_name() + '.bc')
+
+  def thinlto_index_file(self):
+    return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc')
+
+  def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
+                                  cmd_section_name: str):
+    """Get llvm-objcopy and process args to produce a command string that,
+    when invoked, will extract the cmd section into the self.cmd_file() file.
+    """
+    return [
+        llvm_objcopy_path,
+        '--dump-section=' + cmd_section_name + '=' + self.cmd_file(),
+        self.input_obj(), '/dev/null'
+    ]
+
+  def _get_extraction_bc_command(self, llvm_objcopy_path: str,
+                                 bitcode_section_name: str):
+    """Gets llvm-objcopy and process args to produce a command string that,
+    when invoked, will extract the bitcode section into the self.bc_file()
+    file.
+    """
+    return [
+        llvm_objcopy_path,
+        '--dump-section=' + bitcode_section_name + '=' + self.bc_file(),
+        self.input_obj(), '/dev/null'
+    ]
+
+  def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str,
+                               is_thinlto: bool, cmd_section_name: str,
+                               bitcode_section_name: str) -> Optional[str]:
+    """Run llvm-objcopy to extract the .bc and command line."""
+    if not os.path.exists(self.input_obj()):
+      logging.info('%s does not exist.', self.input_obj())
+      return None
+    os.makedirs(self.dest_dir(), exist_ok=True)
+    try:
+      subprocess.check_output(
+          self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+          stderr=subprocess.STDOUT,
+          encoding='utf-8')
+      if cmd_filter is not None or is_thinlto:
+        with open(self.cmd_file(), encoding='utf-8') as f:
+          lines = f.readlines()
+        assert len(lines) == 1
+        cmdline = lines[0]
+        if not should_include_module(cmdline, cmd_filter):
+          logging.info(
+              'Excluding module %s because it does not match the filter',
+              self.input_obj())
+          os.remove(self.cmd_file())
+          return None
+        if is_thinlto:
+          index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+          shutil.copy(index_file, self.thinlto_index_file())
+
+      subprocess.check_output(
+          self._get_extraction_bc_command(llvm_objcopy_path,
+                                          bitcode_section_name),
+          stderr=subprocess.STDOUT,
+          encoding='utf-8')
+    except subprocess.CalledProcessError as e:
+      # This may happen if the .o file was built from asm (.S source).
+      logging.warning('%s was not processed: %s', self.input_obj(), e)
+      logging.info(e.output)
+      return None
+    assert (os.path.exists(self.cmd_file()) and
+            os.path.exists(self.bc_file()) and
+            (not is_thinlto or os.path.exists(self.thinlto_index_file())))
+    return self.relative_output_path()
+
+  def _extract_lld_artifacts(self) -> Optional[str]:
+    """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
+    """
+    if not os.path.exists(self.lld_src_bc()):
+      logging.info('%s does not exist.', self.lld_src_bc())
+      return None
+    if not os.path.exists(self.lld_src_thinlto()):
+      logging.info('%s does not exist.', self.lld_src_thinlto())
+      return None
+    os.makedirs(self.dest_dir(), exist_ok=True)
+
+    # Copy over the files
+    shutil.copy(self.lld_src_bc(), self.bc_file())
+    shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
+
+    assert os.path.exists(self.bc_file())
+    assert os.path.exists(self.thinlto_index_file())
+    return self._obj_relative_path
+
+  def extract(self,
+              llvm_objcopy_path: Optional[str] = None,
+              cmd_filter: Optional[str] = None,
+              thinlto_build: Optional[str] = None,
+              cmd_section_name: Optional[str] = '.llvmcmd',
+              bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]:
+    if thinlto_build == 'local':
+      return self._extract_lld_artifacts()
+    return self._extract_clang_artifacts(
+        llvm_objcopy_path=llvm_objcopy_path,
+        cmd_filter=cmd_filter,
+        is_thinlto=thinlto_build == 'distributed',
+        cmd_section_name=cmd_section_name,
+        bitcode_section_name=bitcode_section_name)
+
+
+def convert_compile_command_to_objectfile(
+    command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]:
+  obj_base_dir = command['directory']
+  if 'arguments' in command:
+    cmd_parts = command['arguments']
+  elif 'command' in command:
+    cmd_parts = command['command'].split()
+  else:
+    logging.info('compile_commands element has no command and arguments')
+    return None
+
+  try:
+    obj_index = cmd_parts.index('-o') + 1
+  except ValueError:
+    # This could happen if there are non-clang commands in compile_commands.json
+    logging.info('Command has no -o option: %s', ' '.join(cmd_parts))
+    return None
+  obj_rel_path = cmd_parts[obj_index]
+  # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
+  return TrainingIRExtractor(
+      obj_relative_path=obj_rel_path,
+      output_base_dir=output_dir,
+      obj_base_dir=obj_base_dir)
+
+
+def load_from_compile_commands(json_array: List[Dict[str, str]],
+                               output_dir: str) -> List[TrainingIRExtractor]:
+  objs = [
+      convert_compile_command_to_objectfile(cmd, output_dir)
+      for cmd in json_array
+  ]
+  # Filter out None, in case there were non-clang commands in the .json
+  return [obj for obj in objs if obj is not None]
+
+
+def load_from_lld_params(params_array: List[str], obj_base_dir: str,
+                         output_dir: str) -> List[TrainingIRExtractor]:
+  """Create an ObjectFile array based on lld's parameters."""
+  # yank out -o and the output. After that, anything not starting with '-', and
+  # ending in a '.o', is an object file.
+  try:
+    minus_o_idx = params_array.index('-o')
+    del params_array[minus_o_idx:minus_o_idx + 2]
+    just_obj_paths = [
+        o for o in params_array if not o.startswith('-') and o.endswith('.o')
+    ]
+  except ValueError:
+    logging.info('This params file does not have an explicit -o option.')
+    just_obj_paths = params_array
+
+  def make_obj(obj_file: str) -> TrainingIRExtractor:
+    return TrainingIRExtractor(
+        obj_relative_path=obj_file,
+        output_base_dir=output_dir,
+        obj_base_dir=obj_base_dir)
+
+  return [make_obj(obj_file) for obj_file in just_obj_paths]
+
+
+def load_from_directory(obj_base_dir: str,
+                        output_dir: str) -> List[TrainingIRExtractor]:
+  """Create an object file array by globbing an entire directory.
+
+  Args:
+    obj_base_dir: The base build directory that all object files will be
+      written out as being relative to.
+    output_dir: The output directory where extracted .bc and .cmd files should
+      be placed.
+  """
+  paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.o')]
+
+  def make_spec(obj_file: str):
+    return TrainingIRExtractor(
+        obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir),
+        output_base_dir=output_dir,
+        obj_base_dir=obj_base_dir)
+
+  return [make_spec(path) for path in paths]
+
+
+def load_for_lld_thinlto(obj_base_dir: str,
+                         output_dir: str) -> List[TrainingIRExtractor]:
+  # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+  # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
+  # are also emitted next to the postimport bitcode, with the suffix
+  # .thinlto.bc instead
+  paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc')]
+
+  def make_spec(obj_file: str):
+    return TrainingIRExtractor(
+        # Cut away .3.import.bc
+        obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
+        output_base_dir=output_dir,
+        obj_base_dir=obj_base_dir)
+
+  return [make_spec(path) for path in paths]
+
+
+def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
+                   llvm_objcopy_path: str, cmd_filter: str, thinlto_build: str,
+                   cmd_section_name: str, bitcode_section_name: str):
+  """Extracts all specified object files into the corpus directory.
+
+  Args:
+    objs: A list of TrainingIRExtractor Objects that represent the object files
+      to extract bitcode/commands from.
+    num_workers: The number of parallel processes to spawn to run the
+      extraction.
+    llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
+    cmd_filter: A regular expression that is used to select for compilations
+      performed with specific flags. If you want to include all compilations,
+      set this to None.
+    thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
+      Set this to None if the build was not done with ThinLTO.
+    cmd_section_name: The name of the command line section created by the
+      bitcode embedding.
+    bitcode_section_name: The name of the bitcode section created by the
+      bitcode embedding.
+  """
+  extract_artifacts = functools.partial(
+      TrainingIRExtractor.extract,
+      llvm_objcopy_path=llvm_objcopy_path,
+      cmd_filter=cmd_filter,
+      thinlto_build=thinlto_build,
+      cmd_section_name=cmd_section_name,
+      bitcode_section_name=bitcode_section_name)
+
+  with multiprocessing.Pool(num_workers) as pool:
+    relative_output_paths = pool.map(extract_artifacts, objs)
+    pool.close()
+    pool.join()
+  return relative_output_paths
+
+
+def write_corpus_manifest(thinlto_build: str, relative_output_paths: List[str],
+                          output_dir: str):
+  """Writes a corpus_description.json containing all necessary information
+  about the corpus.
+
+  Args:
+    thinlto_build: Whether or not the build was done with ThinLTO and if so,
+      what kind of ThinLTO. Set this to None if the build was not performed with
+      ThinLTO.
+    relative_output_paths: The relative (to the corpus directory) output paths
+      of all the bitcode files that should be placed in the corpus manifest
+    output_dir: The corpus directory where the corpus manifest should be
+      placed.
+  """
+  # This comes first rather than later so global_command_override is at the top
+  # of the .json after being written
+  if thinlto_build == 'local':
+    corpus_description = {
+        'global_command_override': constant.UNSPECIFIED_OVERRIDE
+    }
+  else:
+    corpus_description = {}
+
+  corpus_description.update({
+      'has_thinlto': thinlto_build is not None,
+      'modules': [path for path in relative_output_paths if path is not None]
+  })
+
+  with open(
+      os.path.join(output_dir, 'corpus_description.json'),
+      'w',
+      encoding='utf-8') as f:
+    json.dump(corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py
new file mode 100644
index 00000000000000..8811134aab4fce
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir_test.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for compiler_opt.tools.extract_ir."""
+
+# pylint: disable=protected-access
+import os.path
+
+from absl.testing import absltest
+
+from compiler_opt.tools import extract_ir_lib
+
+
+class ExtractIrTest(absltest.TestCase):
+  """Tests for the extractor discovery and extraction helpers."""
+
+  def _assert_bar_extractor(self, extractor):
+    """Checks the paths derived for lib/bar.o built in /output/directory."""
+    self.assertEqual(extractor.input_obj(), '/output/directory/lib/bar.o')
+    self.assertEqual(extractor.relative_output_path(), 'lib/bar.o')
+    self.assertEqual(extractor.cmd_file(),
+                     '/corpus/destination/path/lib/bar.o.cmd')
+    self.assertEqual(extractor.bc_file(),
+                     '/corpus/destination/path/lib/bar.o.bc')
+    self.assertEqual(extractor.thinlto_index_file(),
+                     '/corpus/destination/path/lib/bar.o.thinlto.bc')
+
+  def _create_thinlto_files(self, directory):
+    """Creates empty {1,2,3}.3.import.bc and {1,2,3}.thinlto.bc files."""
+    for suffix in ('.3.import.bc', '.thinlto.bc'):
+      for module in ('1', '2', '3'):
+        directory.create_file(file_path=module + suffix)
+
+  def test_one_conversion(self):
+    """A 'command' style entry is converted into an extractor."""
+    compile_command = {
+        'directory': '/output/directory',
+        'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
+        'file': '/some/path/lib/foo/bar.cc'
+    }
+    extractor = extract_ir_lib.convert_compile_command_to_objectfile(
+        compile_command, '/corpus/destination/path')
+    self.assertIsNotNone(extractor)
+    self._assert_bar_extractor(extractor)
+
+  def test_one_conversion_arguments_style(self):
+    """An 'arguments' style entry is converted into an extractor."""
+    compile_command = {
+        'directory': '/output/directory',
+        'arguments': [
+            '-cc1', '-c', '/some/path/lib/foo/bar.cc', '-o', 'lib/bar.o'
+        ],
+        'file': '/some/path/lib/foo/bar.cc'
+    }
+    extractor = extract_ir_lib.convert_compile_command_to_objectfile(
+        compile_command, '/corpus/destination/path')
+    self.assertIsNotNone(extractor)
+    self._assert_bar_extractor(extractor)
+
+  def test_arr_conversion(self):
+    """Every entry of a compilation database yields an extractor, in order."""
+    compile_commands = [{
+        'directory': '/output/directory',
+        'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
+        'file': '/some/path/lib/foo/bar.cc'
+    }, {
+        'directory': '/output/directory',
+        'command': '-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o',
+        'file': '/some/path/lib/foo/baz.cc'
+    }]
+    extractors = list(
+        extract_ir_lib.load_from_compile_commands(compile_commands,
+                                                  '/corpus/destination/path'))
+    self.assertLen(extractors, 2)
+    bar, baz = extractors
+    self._assert_bar_extractor(bar)
+
+    self.assertEqual(baz.input_obj(), '/output/directory/lib/other/baz.o')
+    self.assertEqual(baz.relative_output_path(), 'lib/other/baz.o')
+    self.assertEqual(baz.cmd_file(),
+                     '/corpus/destination/path/lib/other/baz.o.cmd')
+    self.assertEqual(baz.bc_file(),
+                     '/corpus/destination/path/lib/other/baz.o.bc')
+    self.assertEqual(baz.thinlto_index_file(),
+                     '/corpus/destination/path/lib/other/baz.o.thinlto.bc')
+
+  def test_command_extraction(self):
+    """The objcopy commands dump the cmd and bitcode sections to the corpus."""
+    extractor = extract_ir_lib.TrainingIRExtractor(
+        obj_relative_path='lib/obj_file.o',
+        output_base_dir='/where/corpus/goes',
+        obj_base_dir='/foo/bar')
+    cmd_command = extractor._get_extraction_cmd_command(
+        '/bin/llvm_objcopy_path', '.llvmcmd')
+    self.assertEqual(cmd_command, [
+        '/bin/llvm_objcopy_path',
+        '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
+        '/foo/bar/lib/obj_file.o', '/dev/null'
+    ])
+    bc_command = extractor._get_extraction_bc_command(
+        '/bin/llvm_objcopy_path', '.llvmbc')
+    self.assertEqual(bc_command, [
+        '/bin/llvm_objcopy_path',
+        '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
+        '/foo/bar/lib/obj_file.o', '/dev/null'
+    ])
+
+  def test_command_extraction_no_basedir(self):
+    """Without an object base dir the relative object path is used as is."""
+    extractor = extract_ir_lib.TrainingIRExtractor('lib/obj_file.o',
+                                                   '/where/corpus/goes')
+    cmd_command = extractor._get_extraction_cmd_command(
+        '/bin/llvm_objcopy_path', '.llvmcmd')
+    self.assertEqual(cmd_command, [
+        '/bin/llvm_objcopy_path',
+        '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
+        'lib/obj_file.o', '/dev/null'
+    ])
+    bc_command = extractor._get_extraction_bc_command(
+        '/bin/llvm_objcopy_path', '.llvmbc')
+    self.assertEqual(bc_command, [
+        '/bin/llvm_objcopy_path',
+        '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
+        'lib/obj_file.o', '/dev/null'
+    ])
+
+  def test_lld_params(self):
+    """Only the .o inputs among the lld options become extractors."""
+    lld_opts = [
+        '-o', 'output/dir/exe', 'lib/obj1.o', 'somelib.a', '-W,blah',
+        'lib/dir/obj2.o'
+    ]
+    extractors = extract_ir_lib.load_from_lld_params(lld_opts, '/some/path',
+                                                     '/tmp/out')
+    self.assertLen(extractors, 2)
+    self.assertEqual(extractors[0].input_obj(), '/some/path/lib/obj1.o')
+    self.assertEqual(extractors[0].relative_output_path(), 'lib/obj1.o')
+    self.assertEqual(extractors[0].cmd_file(), '/tmp/out/lib/obj1.o.cmd')
+    self.assertEqual(extractors[0].thinlto_index_file(),
+                     '/tmp/out/lib/obj1.o.thinlto.bc')
+    self.assertEqual(extractors[1].input_obj(), '/some/path/lib/dir/obj2.o')
+
+  def test_load_from_directory(self):
+    """Object files nested below the input directory are discovered."""
+    tempdir = self.create_tempdir()
+    subdir = tempdir.mkdir(dir_path='subdir')
+    for name in ('test1.o', 'test2.o'):
+      subdir.create_file(file_path=name)
+    outdir = self.create_tempdir()
+    objs = extract_ir_lib.load_from_directory(tempdir.full_path,
+                                              outdir.full_path)
+    self.assertLen(objs, 2)
+    by_path = sorted(objs, key=lambda x: x._obj_relative_path)
+    for number, obj in enumerate(by_path, start=1):
+      self.assertEqual(obj._obj_relative_path, f'subdir/test{number:d}.o')
+      self.assertEqual(obj._obj_base_dir, tempdir.full_path)
+      self.assertEqual(obj._output_base_dir, outdir.full_path)
+
+  def test_lld_thinlto_discovery(self):
+    """The ThinLTO files in a directory are discovered as three modules."""
+    tempdir = self.create_tempdir()
+    self._create_thinlto_files(tempdir)
+    outdir = self.create_tempdir()
+    extractors = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path,
+                                                     outdir.full_path)
+    self.assertLen(extractors, 3)
+    extractors = sorted(extractors, key=lambda x: x._obj_relative_path)
+    for number, extractor in enumerate(extractors, start=1):
+      self.assertEqual(extractor._obj_relative_path, f'{number:d}')
+      self.assertEqual(extractor._obj_base_dir, tempdir.full_path)
+      self.assertEqual(extractor._output_base_dir, outdir.full_path)
+
+  def test_lld_thinlto_discovery_nested(self):
+    """Discovery recurses and reports paths relative to the input directory."""
+    outer = self.create_tempdir()
+    self._create_thinlto_files(outer.mkdir(dir_path='nest'))
+    outdir = self.create_tempdir()
+    extractors = extract_ir_lib.load_for_lld_thinlto(outer.full_path,
+                                                     outdir.full_path)
+    self.assertLen(extractors, 3)
+    extractors = sorted(extractors, key=lambda x: x._obj_relative_path)
+    for number, extractor in enumerate(extractors, start=1):
+      self.assertEqual(extractor._obj_relative_path, f'nest/{number:d}')
+      self.assertEqual(extractor._obj_base_dir, outer.full_path)
+      self.assertEqual(extractor._output_base_dir, outdir.full_path)
+
+  def test_lld_thinlto_extraction(self):
+    """Local ThinLTO extraction leaves N.bc and N.thinlto.bc in the corpus."""
+    outer = self.create_tempdir()
+    self._create_thinlto_files(outer.mkdir(dir_path='nest'))
+    outdir = self.create_tempdir()
+    extractors = extract_ir_lib.load_for_lld_thinlto(outer.full_path,
+                                                     outdir.full_path)
+    extractors = sorted(extractors, key=lambda x: x._obj_relative_path)
+    for number, extractor in enumerate(extractors, start=1):
+      mod_path = extractor.extract(thinlto_build='local')
+      self.assertEqual(mod_path, f'nest/{number:d}')
+    for suffix in ('.bc', '.thinlto.bc'):
+      for module in ('1', '2', '3'):
+        expected = os.path.join(outdir.full_path, f'nest/{module}{suffix}')
+        self.assertTrue(os.path.exists(expected))
+
+  def test_filtering(self):
+    """Modules are kept with no filter or when the filter matches a flag."""
+    cmdline = '-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o'
+    for matching_filter in (None, '.*', '^-Oz$'):
+      self.assertTrue(
+          extract_ir_lib.should_include_module(cmdline, matching_filter))
+    self.assertFalse(extract_ir_lib.should_include_module(cmdline, '^-O3$'))
+
+  def test_thinlto_index_extractor(self):
+    """The -fthinlto-index path is resolved against the base directory."""
+    cmdline = ('-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/'
+               'out.o\0-fthinlto-index=foo/bar.thinlto.bc')
+    index_path = extract_ir_lib.get_thinlto_index(cmdline, '/the/base/dir')
+    self.assertEqual(index_path, '/the/base/dir/foo/bar.thinlto.bc')
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py
new file mode 100644
index 00000000000000..24493d894be723
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus.py
@@ -0,0 +1,58 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+  --input_dir=<path to input directory> \
+  --output_dir=<path to output directory> \
+  --default_args="<list of space separated flags>"
+"""
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from compiler_opt.tools import make_corpus_lib
+
+# --input_dir and --output_dir have no usable default and must be given.
+flags.DEFINE_string('input_dir', None, 'The input directory.')
+flags.DEFINE_string('output_dir', None, 'The output directory.')
+flags.mark_flags_as_required(['input_dir', 'output_dir'])
+
+flags.DEFINE_string(
+    'default_args', '',
+    'The compiler flags to compile with when using downstream tooling.')
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+  """Builds a corpus at --output_dir from the bitcode below --input_dir."""
+  logging.warning(
+      'Using this tool does not guarantee that the bitcode is taken at '
+      'the correct stage for consumption during model training. Make '
+      'sure to validate assumptions about where the bitcode is coming '
+      'from before using it in production.')
+  modules = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
+  make_corpus_lib.copy_bitcode(modules, FLAGS.input_dir, FLAGS.output_dir)
+  default_args = FLAGS.default_args.split()
+  make_corpus_lib.write_corpus_manifest(modules, FLAGS.output_dir, default_args)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py
new file mode 100644
index 00000000000000..3598fc12a04d14
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus_lib.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library functions for making a corpus from arbitrary bitcode."""
+
+import pathlib
+import os
+import shutil
+import json
+
+from typing import List, Optional
+
+BITCODE_EXTENSION = '.bc'
+
+
+def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
+  """Finds the bitcode files located anywhere below a given directory.
+
+  Args:
+    bitcode_base_dir: The base directory that is recursively searched for
+      paths ending in BITCODE_EXTENSION.
+
+  Returns:
+    A list with one entry per bitcode file found, each being the path of the
+    file relative to the base directory with BITCODE_EXTENSION stripped.
+  """
+  relative_paths = []
+  pattern = '**/*' + BITCODE_EXTENSION
+  for bitcode_path in pathlib.Path(bitcode_base_dir).glob(pattern):
+    # Modules are identified by their path without the bitcode extension.
+    module_path = str(bitcode_path)[:-len(BITCODE_EXTENSION)]
+    relative_paths.append(
+        os.path.relpath(module_path, start=bitcode_base_dir))
+  return relative_paths
+
+
+def copy_bitcode(relative_paths: List[str], bitcode_base_dir: str,
+                 output_dir: str) -> None:
+  """Mirrors bitcode files from the base directory into the output directory.
+
+  Args:
+    relative_paths: The extension-less paths, relative to bitcode_base_dir, of
+      the bitcode files to copy. The same relative layout is recreated below
+      output_dir, creating directories as needed.
+    bitcode_base_dir: The base directory where the bitcode is located.
+    output_dir: The output directory to place the bitcode in.
+  """
+  for module_path in relative_paths:
+    bitcode_file = module_path + BITCODE_EXTENSION
+    source = os.path.join(bitcode_base_dir, bitcode_file)
+    destination = os.path.join(output_dir, bitcode_file)
+    os.makedirs(os.path.dirname(destination), exist_ok=True)
+    shutil.copy(source, destination)
+
+
+def write_corpus_manifest(relative_output_paths: List[str],
+                          output_dir: str,
+                          default_args: Optional[List[str]] = None) -> None:
+  """Writes the corpus_description.json manifest into the output directory.
+
+  The manifest lists the modules of the corpus, marks the corpus as not using
+  ThinLTO and stores default_args as its global command override.
+
+  Args:
+    relative_output_paths: A list of paths to each bitcode file relative to the
+      output directory. None entries are left out of the manifest.
+    output_dir: The output directory where the corpus is being created.
+    default_args: An array of compiler flags that should be used to compile
+      the bitcode when using further downstream tooling. Defaults to no flags.
+  """
+  modules = [path for path in relative_output_paths if path is not None]
+  corpus_description = {
+      'global_command_override': [] if default_args is None else default_args,
+      'has_thinlto': False,
+      'modules': modules,
+  }
+  manifest_path = os.path.join(output_dir, 'corpus_description.json')
+  with open(manifest_path, 'w', encoding='utf-8') as description_file:
+    json.dump(corpus_description, description_file, indent=2)
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py
new file mode 100644
index 00000000000000..8ed598695d06ee
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus_test.py
@@ -0,0 +1,66 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test for compiler_opt.tools.make_corpus_lib"""
+
+import json
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools import make_corpus_lib
+
+
+class MakeCorpusTest(absltest.TestCase):
+  """Tests for the corpus creation helpers in make_corpus_lib."""
+
+  def test_load_bitcode_from_directory(self):
+    """Nested bitcode is reported relative to the root, minus the extension."""
+    outer = self.create_tempdir()
+    tempdir = outer.mkdir(dir_path='nested')
+    tempdir.create_file('test1.bc')
+    tempdir.create_file('test2.bc')
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
+    # Compare the complete result so that unexpected extra entries fail too.
+    self.assertEqual(sorted(relative_paths), ['nested/test1', 'nested/test2'])
+
+  def test_copy_bitcode(self):
+    """The relative layout of the bitcode is recreated in the corpus."""
+    build_dir = self.create_tempdir()
+    nested_dir = build_dir.mkdir(dir_path='nested')
+    nested_dir.create_file('test1.bc')
+    nested_dir.create_file('test2.bc')
+    corpus_dir = self.create_tempdir()
+    make_corpus_lib.copy_bitcode(['nested/test1', 'nested/test2'], build_dir,
+                                 corpus_dir)
+    output_files = sorted(os.listdir(os.path.join(corpus_dir, './nested')))
+    self.assertEqual(output_files, ['test1.bc', 'test2.bc'])
+
+  def test_write_corpus_manifest(self):
+    """The manifest holds the given modules and default arguments."""
+    relative_output_paths = ['test/test1', 'test/test2']
+    output_dir = self.create_tempdir()
+    default_args = ['-O3', '-c']
+    make_corpus_lib.write_corpus_manifest(relative_output_paths, output_dir,
+                                          default_args)
+    manifest_path = os.path.join(output_dir, 'corpus_description.json')
+    with open(manifest_path, encoding='utf-8') as manifest_file:
+      manifest = json.load(manifest_file)
+    self.assertEqual(manifest['global_command_override'], default_args)
+    self.assertEqual(manifest['has_thinlto'], False)
+    self.assertEqual(manifest['modules'], relative_output_paths)
+
+
+if __name__ == '__main__':
+  absltest.main()