[llvm] [clang-tools-extra] [MLGO] Upstream the corpus extraction tooling (PR #72319)

Aiden Grossman via cfe-commits cfe-commits at lists.llvm.org
Sun Jan 14 22:23:49 PST 2024


https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/72319

>From c3f723c8a975cc5e075d56350645b0be486f3cda Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Tue, 14 Nov 2023 14:20:24 -0800
Subject: [PATCH 1/7] [MLGO] Upstream the corpus extraction tooling

---
 llvm/py/Pyproject.toml                        |   1 +
 llvm/py/src/mlgo/combine_training_corpus.py   |  55 +++
 .../src/mlgo/combine_training_corpus_lib.py   |  50 +++
 .../src/mlgo/combine_training_corpus_test.py  | 104 +++++
 llvm/py/src/mlgo/extract_ir.py                | 142 +++++++
 llvm/py/src/mlgo/extract_ir_lib.py            | 373 ++++++++++++++++++
 llvm/py/src/mlgo/extract_ir_test.py           | 231 +++++++++++
 llvm/py/src/mlgo/make_corpus.py               |  58 +++
 llvm/py/src/mlgo/make_corpus_lib.py           |  90 +++++
 llvm/py/src/mlgo/make_corpus_test.py          |  66 ++++
 10 files changed, 1170 insertions(+)
 create mode 100644 llvm/py/Pyproject.toml
 create mode 100644 llvm/py/src/mlgo/combine_training_corpus.py
 create mode 100644 llvm/py/src/mlgo/combine_training_corpus_lib.py
 create mode 100644 llvm/py/src/mlgo/combine_training_corpus_test.py
 create mode 100644 llvm/py/src/mlgo/extract_ir.py
 create mode 100644 llvm/py/src/mlgo/extract_ir_lib.py
 create mode 100644 llvm/py/src/mlgo/extract_ir_test.py
 create mode 100644 llvm/py/src/mlgo/make_corpus.py
 create mode 100644 llvm/py/src/mlgo/make_corpus_lib.py
 create mode 100644 llvm/py/src/mlgo/make_corpus_test.py

diff --git a/llvm/py/Pyproject.toml b/llvm/py/Pyproject.toml
new file mode 100644
index 00000000000000..dcf2c804da5e19
--- /dev/null
+++ b/llvm/py/Pyproject.toml
@@ -0,0 +1 @@
+# Placeholder
diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py
new file mode 100644
index 00000000000000..94ee1cbac9cea4
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""Combine multiple training corpora into a single training corpus.
+
+Currently only supports the case where all corpora share the same
+configurables except for the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+from absl import app
+from absl import flags
+
+from compiler_opt.tools import combine_training_corpus_lib
+
+flags.DEFINE_string('root_dir', '', 'root dir of module paths to combine.')
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+  """Combine the sub-corpora found under --root_dir into a single corpus.
+
+  Args:
+    argv: positional command-line arguments left over after flag parsing;
+      anything beyond the program name is rejected.
+
+  Raises:
+    app.UsageError: if extra positional arguments are supplied.
+  """
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py
new file mode 100644
index 00000000000000..0359961266a240
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library for combining training corpora."""
+
+import os
+import json
+
+from absl import logging
+
+import tensorflow as tf
+
+_FILE_NAME = 'corpus_description.json'
+
+
+def combine_corpus(root_dir: str) -> None:
+  """Merge the corpus descriptions of all sub-corpora under root_dir.
+
+  Every root_dir/SUBDIR/corpus_description.json is read; its module names are
+  prefixed with SUBDIR/ and collected into a single
+  root_dir/corpus_description.json. All fields other than 'modules' must be
+  identical across the sub-corpora and are copied to the output unchanged.
+
+  Args:
+    root_dir: directory whose immediate subdirectories hold the corpora to
+      combine; the combined description is written directly into it.
+
+  Raises:
+    ValueError: if two sub-corpora differ in any field other than 'modules'.
+  """
+  module_names = []
+  # None (rather than an empty dict) marks "no corpus seen yet", so that a
+  # first corpus whose only field is 'modules' is still compared against the
+  # corpora that follow it.
+  output_corpus_description = None
+
+  corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME)
+  for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+    logging.info('processing %s', corpus_description_path)
+
+    with tf.io.gfile.GFile(corpus_description_path, 'r') as f:
+      corpus_description = json.load(f)
+    sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+    module_names.extend(
+        os.path.join(sub_dir, name) for name in corpus_description['modules'])
+    del corpus_description['modules']
+    if output_corpus_description is None:
+      output_corpus_description = corpus_description
+    elif corpus_description != output_corpus_description:
+      raise ValueError('Input corpora differ by more than modules.')
+
+  # No sub-corpus found: still write a description with an empty module list.
+  if output_corpus_description is None:
+    output_corpus_description = {}
+  output_corpus_description['modules'] = module_names
+
+  with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), 'w') as f:
+    json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py
new file mode 100644
index 00000000000000..47dd602967b68f
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus_test.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for combining training corpora."""
+
+import json
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools import combine_training_corpus_lib
+
+
+class CombineTrainingCorpusTest(absltest.TestCase):
+  """Tests for combine_training_corpus_lib.combine_corpus."""
+
+  def test_combine_corpus(self):
+    """Two compatible sub-corpora are merged with prefixed module names."""
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
+    subcorpus1_description = {
+        'has_thinlto': False,
+        'modules': ['test1.o', 'test2.o']
+    }
+    subcorpus2_description = {
+        'has_thinlto': False,
+        'modules': ['test3.o', 'test4.o']
+    }
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus2_description_file = subcorpus2_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    with open(
+        os.path.join(corpus_dir, 'corpus_description.json'),
+        encoding='utf-8') as combined_corpus_description_file:
+      combined_corpus_description = json.load(combined_corpus_description_file)
+    self.assertEqual(combined_corpus_description['has_thinlto'], False)
+    self.assertLen(combined_corpus_description['modules'], 4)
+    # Module names are expected to carry their sub-corpus directory as prefix.
+    self.assertIn('subcorpus1/test1.o', combined_corpus_description['modules'])
+    self.assertIn('subcorpus1/test2.o', combined_corpus_description['modules'])
+    self.assertIn('subcorpus2/test3.o', combined_corpus_description['modules'])
+    self.assertIn('subcorpus2/test4.o', combined_corpus_description['modules'])
+
+  def test_empty_folder(self):
+    """A subdirectory without a corpus description is ignored."""
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    _ = corpus_dir.mkdir(dir_path='empty_dir')
+    subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    with open(
+        os.path.join(corpus_dir, 'corpus_description.json'),
+        encoding='utf-8') as combined_corpus_description_file:
+      combined_corpus_description = json.load(combined_corpus_description_file)
+    self.assertLen(combined_corpus_description['modules'], 2)
+
+  def test_ignore_extra_file(self):
+    """A stray file directly under the root directory is ignored."""
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    _ = corpus_dir.create_file(file_path='empty.log')
+    subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    with open(
+        os.path.join(corpus_dir, 'corpus_description.json'),
+        encoding='utf-8') as combined_corpus_description_file:
+      combined_corpus_description = json.load(combined_corpus_description_file)
+    self.assertLen(combined_corpus_description['modules'], 2)
+
+  def test_different_corpora(self):
+    """Sub-corpora that differ in a non-'modules' field raise ValueError."""
+    corpus_dir = self.create_tempdir()
+    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+    subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
+    subcorpus1_description = {'has_thinlto': False, 'modules': ['test1.o']}
+    subcorpus2_description = {'has_thinlto': True, 'modules': ['test2.o']}
+    subcorpus1_description_file = subcorpus1_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus2_description_file = subcorpus2_dir.create_file(
+        file_path='corpus_description.json')
+    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+    subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+    self.assertRaises(ValueError, combine_training_corpus_lib.combine_corpus,
+                      corpus_dir.full_path)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py
new file mode 100644
index 00000000000000..2a1ef3978888d6
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir.py
@@ -0,0 +1,142 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, pass an integer representing the desired
+verbosity to the --verbosity flag. Use 0 for all logs, status information,
+and detailed debug information, -1 for solely warnings, and -2 to not produce
+any output.
+"""
+
+import json
+import multiprocessing
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from compiler_opt.tools import extract_ir_lib
+
+# Command-line flags. Help strings are built from adjacent string literals,
+# so every literal that is continued on the next line must end with a space.
+flags.DEFINE_string(
+    'input', None,
+    'Input file or directory - either compile_commands.json, a linker '
+    'parameter list, or a path to a directory containing object files.')
+flags.DEFINE_enum(
+    'input_type', 'json', ['json', 'params', 'directory'],
+    'Input file type - json, params, or directory. params latter refers to '
+    'lld params.')
+flags.DEFINE_string('output_dir', None, 'Output directory')
+flags.DEFINE_integer(
+    'num_workers', None,
+    'Number of parallel workers for objcopy. `None` for maximum available.')
+flags.DEFINE_string('llvm_objcopy_path', 'llvm-objcopy', 'Path to llvm-objcopy')
+flags.DEFINE_string(
+    'obj_base_dir', '',
+    'Base directory for object files. Defaults to current working dir.')
+flags.DEFINE_string(
+    'cmd_filter', None,
+    'Include only those modules with a command line matching this regexp. '
+    'Setting it to None for not filtering. Note that the regexp is applied '
+    'independently for each separate command line option. For example, ^-Oz$ '
+    'will match Oz - built binaries. Does not work with thinlto_build=local.')
+flags.DEFINE_enum(
+    'thinlto_build', None, ['distributed', 'local'],
+    'Set if the build was performed with either \'distributed\' or '
+    '\'local\' ThinLTO. This ensures the thinlto.bc files are also copied. '
+    'The build is assumed to have had '
+    '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed '
+    'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files '
+    'passed in the local case.')
+flags.DEFINE_string(
+    'cmd_section_name', '.llvmcmd',
+    'The section name passed to llvm-objcopy. For ELF object files, the '
+    'default .llvmcmd is correct. For Mach-O object files, one should use '
+    'something like __LLVM,__cmdline')
+flags.DEFINE_string(
+    'bitcode_section_name', '.llvmbc',
+    'The section name passed to llvm-objcopy. For ELF object files, the '
+    'default .llvmbc is correct. For Mach-O object files, one should use '
+    '__LLVM,__bitcode')
+
+flags.mark_flag_as_required('output_dir')
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+  """Extract bitcode and command lines for the inputs selected by the flags.
+
+  Builds the list of objects to process from --input / --input_type (or from
+  --obj_base_dir when --thinlto_build=local), runs the extraction and writes
+  the corpus description into --output_dir.
+
+  Args:
+    argv: positional command-line arguments left over after flag parsing;
+      anything beyond the program name is rejected.
+
+  Raises:
+    app.UsageError: if extra positional arguments are supplied.
+    ValueError: if --input is combined with --thinlto_build=local, or if
+      neither --input nor --thinlto_build=local is given.
+  """
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  objs = []
+  if FLAGS.input is not None and FLAGS.thinlto_build == 'local':
+    raise ValueError('--thinlto_build=local cannot be run with --input')
+  if FLAGS.input is None:
+    if FLAGS.thinlto_build != 'local':
+      raise ValueError('--input or --thinlto_build=local must be provided')
+    objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir,
+                                               FLAGS.output_dir)
+  elif FLAGS.input_type == 'json':
+    with open(FLAGS.input, encoding='utf-8') as f:
+      objs = extract_ir_lib.load_from_compile_commands(
+          json.load(f), FLAGS.output_dir)
+  elif FLAGS.input_type == 'params':
+    if not FLAGS.obj_base_dir:
+      # Each continued literal ends with a space so the sentences do not run
+      # together in the emitted message.
+      logging.info(
+          '-obj_base_dir is unspecified, assuming current directory. '
+          'If no objects are found, use this option to specify the root '
+          'directory for the object file paths in the input file.')
+    with open(FLAGS.input, encoding='utf-8') as f:
+      objs = extract_ir_lib.load_from_lld_params(
+          [l.strip() for l in f.readlines()], FLAGS.obj_base_dir,
+          FLAGS.output_dir)
+  elif FLAGS.input_type == 'directory':
+    logging.warning(
+        'Using the directory input is only recommended if the build system '
+        'your project uses does not support any structured output that '
+        'ml-compiler-opt understands. If your build system provides a '
+        'structured compilation database, use that instead')
+    objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+  else:
+    # --input_type is an enum flag, so this branch is not expected to be
+    # reached; if it is, extraction proceeds with an empty object list.
+    logging.error('Unknown input type: %s', FLAGS.input_type)
+
+  relative_output_paths = extract_ir_lib.run_extraction(
+      objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
+      FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name)
+
+  extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build,
+                                       relative_output_paths, FLAGS.output_dir)
+
+  # Objects that could not be extracted are reported as None entries.
+  logging.info('Converted %d files out of %d',
+               len(objs) - relative_output_paths.count(None), len(objs))
+
+
+if __name__ == '__main__':
+  multiprocessing.set_start_method('fork')
+  app.run(main)
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py
new file mode 100644
index 00000000000000..c1d2a54b9a9e7c
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir_lib.py
@@ -0,0 +1,373 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library functions for IR extraction."""
+
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import multiprocessing
+import functools
+import json
+
+from typing import Dict, List, Optional
+
+from absl import logging
+
+from compiler_opt.rl import constant
+
+
+# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
+# a \0-separated list of strings to a \n-separated one.
+def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
+  """Tell whether a module's command line passes the optional filter.
+
+  Args:
+    cmdline: the module's command line, with options separated by '\0'.
+    match_regexp: regular expression applied to each option separately, or
+      None to accept every module.
+
+  Returns:
+    True if no filter is given or if at least one option matches it.
+  """
+  if match_regexp is None:
+    return True
+  for option in cmdline.split('\0'):
+    if re.search(match_regexp, option) is not None:
+      return True
+  return False
+
+
+def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
+  """Find the ThinLTO index file named on a module's command line.
+
+  Args:
+    cmdline: the module's command line, with options separated by '\0'.
+    basedir: directory the index path is joined onto.
+
+  Returns:
+    basedir joined with the value of the first '-fthinlto-index=VALUE' option,
+    or None if the command line has no such option.
+  """
+  prefix = '-fthinlto-index='
+  for option in cmdline.split('\0'):
+    if option.startswith(prefix):
+      # Take everything after the first '=' so that paths which themselves
+      # contain '=' are kept intact.
+      return os.path.join(basedir, option[len(prefix):])
+  return None
+
+
+class TrainingIRExtractor:
+  """IR and command line extraction from an object file."""
+
+  def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
+    """Set up a TrainingIRExtractor.
+
+    Args:
+      obj_relative_path: relative path to the input object file. It will be also
+        used to construct the absolute path of the output IR and cmd files, by
+        appending it to output_base_dir.
+      output_base_dir: the directory under which the output will be produced.
+      obj_base_dir: the base directory for all the input object files.
+    """
+    self._obj_relative_path = obj_relative_path
+    self._output_base_dir = output_base_dir
+    self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''
+
+  def obj_base_dir(self):
+    """Base directory of the input object files ('' if none was given)."""
+    return self._obj_base_dir
+
+  def output_base_dir(self):
+    """Directory under which all outputs are produced."""
+    return self._output_base_dir
+
+  def relative_output_path(self):
+    """Path of the module relative to the output base directory."""
+    return self._obj_relative_path
+
+  def input_obj(self):
+    """Path of the input object file."""
+    return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+
+  def lld_src_bc(self):
+    """Path of the post-import bitcode file saved by lld for this object."""
+    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+    # IR bitcode saved by lld. It is hardcoded into lld.
+    return os.path.join(self._obj_base_dir,
+                        self._obj_relative_path + '.3.import.bc')
+
+  def lld_src_thinlto(self):
+    """Path of the ThinLTO index file emitted next to the object."""
+    return os.path.join(self._obj_base_dir,
+                        self._obj_relative_path + '.thinlto.bc')
+
+  def dest_dir(self):
+    """Output directory that receives this module's files."""
+    return os.path.join(self.output_base_dir(),
+                        os.path.dirname(self._obj_relative_path))
+
+  def module_name(self):
+    """File name of the object, without its directory."""
+    return os.path.basename(self._obj_relative_path)
+
+  def cmd_file(self):
+    """Output path of the extracted command line."""
+    return os.path.join(self.dest_dir(), self.module_name() + '.cmd')
+
+  def bc_file(self):
+    """Output path of the extracted bitcode."""
+    return os.path.join(self.dest_dir(), self.module_name() + '.bc')
+
+  def thinlto_index_file(self):
+    """Output path of the copied ThinLTO index."""
+    return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc')
+
+  def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
+                                  cmd_section_name: str):
+    """Get the llvm-objcopy argument list that, when invoked, extracts the
+    cmd section into the self.cmd_file() file.
+    """
+    return [
+        llvm_objcopy_path,
+        '--dump-section=' + cmd_section_name + '=' + self.cmd_file(),
+        self.input_obj(), '/dev/null'
+    ]
+
+  def _get_extraction_bc_command(self, llvm_objcopy_path: str,
+                                 bitcode_section_name: str):
+    """Get the llvm-objcopy argument list that, when invoked, extracts the
+    bitcode section into the self.bc_file() file.
+    """
+    return [
+        llvm_objcopy_path,
+        '--dump-section=' + bitcode_section_name + '=' + self.bc_file(),
+        self.input_obj(), '/dev/null'
+    ]
+
+  def _extract_clang_artifacts(self, llvm_objcopy_path: str,
+                               cmd_filter: Optional[str], is_thinlto: bool,
+                               cmd_section_name: str,
+                               bitcode_section_name: str) -> Optional[str]:
+    """Run llvm-objcopy to extract the .bc and command line.
+
+    Args:
+      llvm_objcopy_path: path of the llvm-objcopy binary to invoke.
+      cmd_filter: regular expression the command line must match for the
+        module to be kept, or None to keep every module.
+      is_thinlto: whether the ThinLTO index named on the command line should
+        be copied next to the extracted files.
+      cmd_section_name: name of the section holding the command line.
+      bitcode_section_name: name of the section holding the bitcode.
+
+    Returns:
+      The module's relative output path, or None if the module was skipped
+      (missing object, filtered out, missing ThinLTO index option, or
+      llvm-objcopy failure).
+    """
+    if not os.path.exists(self.input_obj()):
+      logging.info('%s does not exist.', self.input_obj())
+      return None
+    os.makedirs(self.dest_dir(), exist_ok=True)
+    try:
+      subprocess.check_output(
+          self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+          stderr=subprocess.STDOUT,
+          encoding='utf-8')
+      if cmd_filter is not None or is_thinlto:
+        with open(self.cmd_file(), encoding='utf-8') as f:
+          lines = f.readlines()
+        assert len(lines) == 1
+        cmdline = lines[0]
+        if not should_include_module(cmdline, cmd_filter):
+          logging.info(
+              'Excluding module %s because it does not match the filter',
+              self.input_obj())
+          os.remove(self.cmd_file())
+          return None
+        if is_thinlto:
+          index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+          if index_file is None:
+            # No -fthinlto-index option on the command line: there is nothing
+            # to copy, so skip the module instead of passing None to
+            # shutil.copy, which would raise an uncaught TypeError.
+            logging.warning(
+                '%s was not processed: no -fthinlto-index option found',
+                self.input_obj())
+            return None
+          shutil.copy(index_file, self.thinlto_index_file())
+
+      subprocess.check_output(
+          self._get_extraction_bc_command(llvm_objcopy_path,
+                                          bitcode_section_name),
+          stderr=subprocess.STDOUT,
+          encoding='utf-8')
+    except subprocess.CalledProcessError as e:
+      # This may happen if the .o file was built from asm (.S source).
+      logging.warning('%s was not processed: %s', self.input_obj(), e)
+      logging.info(e.output)
+      return None
+    assert (os.path.exists(self.cmd_file()) and
+            os.path.exists(self.bc_file()) and
+            (not is_thinlto or os.path.exists(self.thinlto_index_file())))
+    return self.relative_output_path()
+
+  def _extract_lld_artifacts(self) -> Optional[str]:
+    """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
+
+    Returns:
+      The module's relative path, or None if either source file is missing.
+    """
+    if not os.path.exists(self.lld_src_bc()):
+      logging.info('%s does not exist.', self.lld_src_bc())
+      return None
+    if not os.path.exists(self.lld_src_thinlto()):
+      logging.info('%s does not exist.', self.lld_src_thinlto())
+      return None
+    os.makedirs(self.dest_dir(), exist_ok=True)
+
+    # Copy over the files
+    shutil.copy(self.lld_src_bc(), self.bc_file())
+    shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
+
+    assert os.path.exists(self.bc_file())
+    assert os.path.exists(self.thinlto_index_file())
+    return self._obj_relative_path
+
+  def extract(self,
+              llvm_objcopy_path: Optional[str] = None,
+              cmd_filter: Optional[str] = None,
+              thinlto_build: Optional[str] = None,
+              cmd_section_name: Optional[str] = '.llvmcmd',
+              bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]:
+    """Extract this module's artifacts into the output directory.
+
+    For thinlto_build == 'local' the files saved by lld are copied and the
+    remaining arguments are unused; otherwise the sections are dumped from the
+    object file with llvm-objcopy.
+
+    Returns:
+      The module's relative output path, or None if nothing was extracted.
+    """
+    if thinlto_build == 'local':
+      return self._extract_lld_artifacts()
+    return self._extract_clang_artifacts(
+        llvm_objcopy_path=llvm_objcopy_path,
+        cmd_filter=cmd_filter,
+        is_thinlto=thinlto_build == 'distributed',
+        cmd_section_name=cmd_section_name,
+        bitcode_section_name=bitcode_section_name)
+
+
+def convert_compile_command_to_objectfile(
+    command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]:
+  """Build an extractor for one compile_commands.json entry.
+
+  Args:
+    command: one entry of the compilation database. 'directory' is required;
+      the command line is taken from 'arguments' if present, otherwise from
+      the 'command' string.
+    output_dir: the directory under which the output will be produced.
+
+  Returns:
+    A TrainingIRExtractor for the file named after '-o', or None if the entry
+    has no command line or no '-o' operand.
+  """
+  import shlex  # pylint: disable=import-outside-toplevel
+
+  obj_base_dir = command['directory']
+  if 'arguments' in command:
+    cmd_parts = command['arguments']
+  elif 'command' in command:
+    # 'command' is a single shell-escaped string, so quoted arguments (e.g.
+    # paths containing spaces) must be split with shell rules.
+    try:
+      cmd_parts = shlex.split(command['command'])
+    except ValueError:
+      # Unbalanced quoting: fall back to plain whitespace splitting.
+      cmd_parts = command['command'].split()
+  else:
+    logging.info('compile_commands element has no command and arguments')
+    return None
+
+  try:
+    obj_index = cmd_parts.index('-o') + 1
+  except ValueError:
+    # This could happen if there are non-clang commands in compile_commands.json
+    logging.info('Command has no -o option: %s', ' '.join(cmd_parts))
+    return None
+  if obj_index >= len(cmd_parts):
+    # '-o' is the last argument, so no output file is named.
+    logging.info('Command has no -o operand: %s', ' '.join(cmd_parts))
+    return None
+  obj_rel_path = cmd_parts[obj_index]
+  # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
+  return TrainingIRExtractor(
+      obj_relative_path=obj_rel_path,
+      output_base_dir=output_dir,
+      obj_base_dir=obj_base_dir)
+
+
+def load_from_compile_commands(json_array: List[Dict[str, str]],
+                               output_dir: str) -> List[TrainingIRExtractor]:
+  """Create extractors for the entries of a compile_commands.json array.
+
+  Args:
+    json_array: the parsed compilation database.
+    output_dir: the directory under which the output will be produced.
+
+  Returns:
+    One extractor per entry that could be converted, in input order.
+  """
+  extractors = []
+  for cmd in json_array:
+    extractor = convert_compile_command_to_objectfile(cmd, output_dir)
+    # Skip None, in case there were non-clang commands in the .json
+    if extractor is not None:
+      extractors.append(extractor)
+  return extractors
+
+
+def load_from_lld_params(params_array: List[str], obj_base_dir: str,
+                         output_dir: str) -> List[TrainingIRExtractor]:
+  """Create an ObjectFile array based on lld's parameters.
+
+  Args:
+    params_array: the linker parameters, one per element. It is not modified.
+    obj_base_dir: the base directory for all the input object files.
+    output_dir: the directory under which the output will be produced.
+
+  Returns:
+    One extractor per object file path found in params_array.
+  """
+  # yank out -o and the output. After that, anything not starting with '-', and
+  # ending in a '.o', is an object file.
+  try:
+    minus_o_idx = params_array.index('-o')
+  except ValueError:
+    logging.info('This params file does not have an explicit -o option.')
+    # NOTE(review): without '-o' every parameter is treated as an object
+    # path, unfiltered - confirm this is intended.
+    just_obj_paths = list(params_array)
+  else:
+    # Work on a copy so the caller's list is left untouched.
+    remaining = params_array[:minus_o_idx] + params_array[minus_o_idx + 2:]
+    just_obj_paths = [
+        o for o in remaining if not o.startswith('-') and o.endswith('.o')
+    ]
+
+  return [
+      TrainingIRExtractor(
+          obj_relative_path=obj_file,
+          output_base_dir=output_dir,
+          obj_base_dir=obj_base_dir) for obj_file in just_obj_paths
+  ]
+
+
+def load_from_directory(obj_base_dir: str,
+                        output_dir: str) -> List[TrainingIRExtractor]:
+  """Create an object file array by globbing an entire directory.
+
+  Args:
+    obj_base_dir: The base build directory that all object files will be
+      written out as being relative to.
+    output_dir: The output directory where extracted .bc and .cmd files should
+      be placed.
+
+  Returns:
+    One extractor for every '.o' file found anywhere below obj_base_dir.
+  """
+  object_files = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.o')]
+
+  extractors = []
+  for object_file in object_files:
+    relative_path = os.path.relpath(object_file, start=obj_base_dir)
+    extractors.append(
+        TrainingIRExtractor(
+            obj_relative_path=relative_path,
+            output_base_dir=output_dir,
+            obj_base_dir=obj_base_dir))
+  return extractors
+
+
+def load_for_lld_thinlto(obj_base_dir: str,
+                         output_dir: str) -> List[TrainingIRExtractor]:
+  """Create an object file array from the bitcode files saved by lld.
+
+  Args:
+    obj_base_dir: The directory that is searched recursively for
+      '.3.import.bc' files; object paths are made relative to it.
+    output_dir: The directory under which the output will be produced.
+
+  Returns:
+    One extractor per '.3.import.bc' file found, with that suffix removed from
+    the relative object path.
+  """
+  # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+  # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
+  # are also emitted next to the postimport bitcode, with the suffix
+  # .thinlto.bc instead
+  suffix = '.3.import.bc'
+  bitcode_files = [
+      str(p) for p in pathlib.Path(obj_base_dir).glob('**/*' + suffix)
+  ]
+
+  extractors = []
+  for bitcode_file in bitcode_files:
+    relative_path = os.path.relpath(bitcode_file, start=obj_base_dir)
+    extractors.append(
+        TrainingIRExtractor(
+            # Cut away .3.import.bc
+            obj_relative_path=relative_path[:-len(suffix)],
+            output_base_dir=output_dir,
+            obj_base_dir=obj_base_dir))
+  return extractors
+
+
+def run_extraction(objs: List[TrainingIRExtractor],
+                   num_workers: Optional[int], llvm_objcopy_path: str,
+                   cmd_filter: Optional[str], thinlto_build: Optional[str],
+                   cmd_section_name: str,
+                   bitcode_section_name: str) -> List[Optional[str]]:
+  """Extracts all specified object files into the corpus directory.
+
+  Args:
+    objs: A list of TrainingIRExtractor Objects that represent the object files
+      to extract bitcode/commands from.
+    num_workers: The number of parallel processes to spawn to run the
+      extraction.
+    llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
+    cmd_filter: A regular expression that is used to select for compilations
+      performed with specific flags. If you want to include all compilations,
+      set this to None.
+    thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
+      Set this to None if the build was not done with ThinLTO.
+    cmd_section_name: The name of the command line section created by the
+      bitcode embedding.
+    bitcode_section_name: The name of the bitcode section created by the
+      bitcode embedding.
+
+  Returns:
+    The results of TrainingIRExtractor.extract for each element of objs, in
+    the same order.
+  """
+  # Bind everything except the extractor itself, so the pool can map the
+  # resulting callable directly over objs.
+  extract_artifacts = functools.partial(
+      TrainingIRExtractor.extract,
+      llvm_objcopy_path=llvm_objcopy_path,
+      cmd_filter=cmd_filter,
+      thinlto_build=thinlto_build,
+      cmd_section_name=cmd_section_name,
+      bitcode_section_name=bitcode_section_name)
+
+  with multiprocessing.Pool(num_workers) as pool:
+    relative_output_paths = pool.map(extract_artifacts, objs)
+    pool.close()
+    pool.join()
+  return relative_output_paths
+
+
+def write_corpus_manifest(thinlto_build: str, relative_output_paths: List[str],
+                          output_dir: str):
+  """Writes a corpus_description.json containing all necessary information
+  about the corpus.
+
+  Args:
+    thinlto_build: Whether or not the build was done with ThinLTO and if so,
+      what kind of ThinLTO. Set this to None if the build was not performed
+      with ThinLTO.
+    relative_output_paths: The relative (to the corpus directory) output paths
+      of all the bitcode files that should be placed in the corpus manifest.
+      None entries are left out.
+    output_dir: The corpus directory where the corpus manifest should be
+      placed.
+  """
+  corpus_description = {}
+  # Inserted first so global_command_override is at the top of the .json
+  # after being written.
+  if thinlto_build == 'local':
+    corpus_description['global_command_override'] = (
+        constant.UNSPECIFIED_OVERRIDE)
+
+  corpus_description['has_thinlto'] = thinlto_build is not None
+  modules = []
+  for path in relative_output_paths:
+    if path is not None:
+      modules.append(path)
+  corpus_description['modules'] = modules
+
+  manifest_path = os.path.join(output_dir, 'corpus_description.json')
+  with open(manifest_path, 'w', encoding='utf-8') as f:
+    json.dump(corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py
new file mode 100644
index 00000000000000..8811134aab4fce
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir_test.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for compiler_opt.tools.extract_ir."""
+
+# pylint: disable=protected-access
+import os.path
+
+from absl.testing import absltest
+
+from compiler_opt.tools import extract_ir_lib
+
+
+class ExtractIrTest(absltest.TestCase):
+
+  def test_one_conversion(self):
+    obj = extract_ir_lib.convert_compile_command_to_objectfile(
+        {
+            'directory': '/output/directory',
+            'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
+            'file': '/some/path/lib/foo/bar.cc'
+        }, '/corpus/destination/path')
+    self.assertIsNotNone(obj)
+    # pytype: disable=attribute-error
+    # Pytype complains about obj being None
+    self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
+    self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
+    self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
+    self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+    self.assertEqual(obj.thinlto_index_file(),
+                     '/corpus/destination/path/lib/bar.o.thinlto.bc')
+    # pytype: enable=attribute-error
+
+  def test_one_conversion_arguments_style(self):
+    obj = extract_ir_lib.convert_compile_command_to_objectfile(
+        {
+            'directory': '/output/directory',
+            'arguments':
+                ['-cc1', '-c', '/some/path/lib/foo/bar.cc', '-o', 'lib/bar.o'],
+            'file': '/some/path/lib/foo/bar.cc'
+        }, '/corpus/destination/path')
+    self.assertIsNotNone(obj)
+    # pytype: disable=attribute-error
+    # Pytype complains about obj being None
+    self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
+    self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
+    self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
+    self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+    self.assertEqual(obj.thinlto_index_file(),
+                     '/corpus/destination/path/lib/bar.o.thinlto.bc')
+    # pytype: enable=attribute-error
+
+  def test_arr_conversion(self):
+    res = extract_ir_lib.load_from_compile_commands([{
+        'directory': '/output/directory',
+        'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
+        'file': '/some/path/lib/foo/bar.cc'
+    }, {
+        'directory': '/output/directory',
+        'command': '-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o',
+        'file': '/some/path/lib/foo/baz.cc'
+    }], '/corpus/destination/path')
+    res = list(res)
+    self.assertLen(res, 2)
+    self.assertEqual(res[0].input_obj(), '/output/directory/lib/bar.o')
+    self.assertEqual(res[0].relative_output_path(), 'lib/bar.o')
+    self.assertEqual(res[0].cmd_file(),
+                     '/corpus/destination/path/lib/bar.o.cmd')
+    self.assertEqual(res[0].bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+    self.assertEqual(res[0].thinlto_index_file(),
+                     '/corpus/destination/path/lib/bar.o.thinlto.bc')
+
+    self.assertEqual(res[1].input_obj(), '/output/directory/lib/other/baz.o')
+    self.assertEqual(res[1].relative_output_path(), 'lib/other/baz.o')
+    self.assertEqual(res[1].cmd_file(),
+                     '/corpus/destination/path/lib/other/baz.o.cmd')
+    self.assertEqual(res[1].bc_file(),
+                     '/corpus/destination/path/lib/other/baz.o.bc')
+    self.assertEqual(res[1].thinlto_index_file(),
+                     '/corpus/destination/path/lib/other/baz.o.thinlto.bc')
+
+  def test_command_extraction(self):
+    obj = extract_ir_lib.TrainingIRExtractor(
+        obj_relative_path='lib/obj_file.o',
+        output_base_dir='/where/corpus/goes',
+        obj_base_dir='/foo/bar')
+    self.assertEqual(
+        obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
+            '/bin/llvm_objcopy_path',
+            '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
+            '/foo/bar/lib/obj_file.o', '/dev/null'
+        ])
+    self.assertEqual(
+        obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
+            '/bin/llvm_objcopy_path',
+            '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
+            '/foo/bar/lib/obj_file.o', '/dev/null'
+        ])
+
+  def test_command_extraction_no_basedir(self):
+    obj = extract_ir_lib.TrainingIRExtractor('lib/obj_file.o',
+                                             '/where/corpus/goes')
+    self.assertEqual(
+        obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
+            '/bin/llvm_objcopy_path',
+            '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
+            'lib/obj_file.o', '/dev/null'
+        ])
+    self.assertEqual(
+        obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
+            '/bin/llvm_objcopy_path',
+            '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
+            'lib/obj_file.o', '/dev/null'
+        ])
+
+  def test_lld_params(self):
+    lld_opts = [
+        '-o', 'output/dir/exe', 'lib/obj1.o', 'somelib.a', '-W,blah',
+        'lib/dir/obj2.o'
+    ]
+    obj = extract_ir_lib.load_from_lld_params(lld_opts, '/some/path',
+                                              '/tmp/out')
+    self.assertLen(obj, 2)
+    self.assertEqual(obj[0].input_obj(), '/some/path/lib/obj1.o')
+    self.assertEqual(obj[0].relative_output_path(), 'lib/obj1.o')
+    self.assertEqual(obj[0].cmd_file(), '/tmp/out/lib/obj1.o.cmd')
+    self.assertEqual(obj[0].thinlto_index_file(),
+                     '/tmp/out/lib/obj1.o.thinlto.bc')
+    self.assertEqual(obj[1].input_obj(), '/some/path/lib/dir/obj2.o')
+
+  def test_load_from_directory(self):
+    tempdir = self.create_tempdir()
+    subdir = tempdir.mkdir(dir_path='subdir')
+    subdir.create_file(file_path='test1.o')
+    subdir.create_file(file_path='test2.o')
+    outdir = self.create_tempdir()
+    objs = extract_ir_lib.load_from_directory(tempdir.full_path,
+                                              outdir.full_path)
+    self.assertLen(objs, 2)
+    for index, obj in enumerate(
+        sorted(objs, key=lambda x: x._obj_relative_path)):
+      self.assertEqual(obj._obj_relative_path, f'subdir/test{index + 1:d}.o')
+      self.assertEqual(obj._obj_base_dir, tempdir.full_path)
+      self.assertEqual(obj._output_base_dir, outdir.full_path)
+
+  def test_lld_thinlto_discovery(self):
+    tempdir = self.create_tempdir()
+    tempdir.create_file(file_path='1.3.import.bc')
+    tempdir.create_file(file_path='2.3.import.bc')
+    tempdir.create_file(file_path='3.3.import.bc')
+    tempdir.create_file(file_path='1.thinlto.bc')
+    tempdir.create_file(file_path='2.thinlto.bc')
+    tempdir.create_file(file_path='3.thinlto.bc')
+    outdir = self.create_tempdir()
+    obj = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path,
+                                              outdir.full_path)
+    self.assertLen(obj, 3)
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+      self.assertEqual(o._obj_relative_path, f'{i + 1:d}')
+      self.assertEqual(o._obj_base_dir, tempdir.full_path)
+      self.assertEqual(o._output_base_dir, outdir.full_path)
+
+  def test_lld_thinlto_discovery_nested(self):
+    outer = self.create_tempdir()
+    tempdir = outer.mkdir(dir_path='nest')
+    tempdir.create_file(file_path='1.3.import.bc')
+    tempdir.create_file(file_path='2.3.import.bc')
+    tempdir.create_file(file_path='3.3.import.bc')
+    tempdir.create_file(file_path='1.thinlto.bc')
+    tempdir.create_file(file_path='2.thinlto.bc')
+    tempdir.create_file(file_path='3.thinlto.bc')
+    outdir = self.create_tempdir()
+    obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+    self.assertLen(obj, 3)
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+      self.assertEqual(o._obj_relative_path, f'nest/{i + 1:d}')
+      self.assertEqual(o._obj_base_dir, outer.full_path)
+      self.assertEqual(o._output_base_dir, outdir.full_path)
+
+  def test_lld_thinlto_extraction(self):
+    outer = self.create_tempdir()
+    tempdir = outer.mkdir(dir_path='nest')
+    tempdir.create_file(file_path='1.3.import.bc')
+    tempdir.create_file(file_path='2.3.import.bc')
+    tempdir.create_file(file_path='3.3.import.bc')
+    tempdir.create_file(file_path='1.thinlto.bc')
+    tempdir.create_file(file_path='2.thinlto.bc')
+    tempdir.create_file(file_path='3.thinlto.bc')
+    outdir = self.create_tempdir()
+    obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+      mod_path = o.extract(thinlto_build='local')
+      self.assertEqual(mod_path, f'nest/{i + 1:d}')
+    self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/1.bc')))
+    self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/2.bc')))
+    self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/3.bc')))
+    self.assertTrue(
+        os.path.exists(os.path.join(outdir.full_path, 'nest/1.thinlto.bc')))
+    self.assertTrue(
+        os.path.exists(os.path.join(outdir.full_path, 'nest/2.thinlto.bc')))
+    self.assertTrue(
+        os.path.exists(os.path.join(outdir.full_path, 'nest/3.thinlto.bc')))
+
+  def test_filtering(self):
+    cmdline = '-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o'
+    self.assertTrue(extract_ir_lib.should_include_module(cmdline, None))
+    self.assertTrue(extract_ir_lib.should_include_module(cmdline, '.*'))
+    self.assertTrue(extract_ir_lib.should_include_module(cmdline, '^-Oz$'))
+    self.assertFalse(extract_ir_lib.should_include_module(cmdline, '^-O3$'))
+
+  def test_thinlto_index_extractor(self):
+    cmdline = ('-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/'
+               'out.o\0-fthinlto-index=foo/bar.thinlto.bc')
+    self.assertEqual(
+        extract_ir_lib.get_thinlto_index(cmdline, '/the/base/dir'),
+        '/the/base/dir/foo/bar.thinlto.bc')
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py
new file mode 100644
index 00000000000000..24493d894be723
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus.py
@@ -0,0 +1,58 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+  --input_dir=<path to input directory> \
+  --output_dir=<path to output directory> \
+  --default_args="<list of space separated flags>"
+"""
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from compiler_opt.tools import make_corpus_lib
+
+flags.DEFINE_string('input_dir', None, 'The input directory.')
+flags.DEFINE_string('output_dir', None, 'The output directory.')
+flags.DEFINE_string(
+    'default_args', '',
+    'The compiler flags to compile with when using downstream tooling.')
+
+flags.mark_flag_as_required('input_dir')
+flags.mark_flag_as_required('output_dir')
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+  logging.warning(
+      'Using this tool does not guarantee that the bitcode is taken at '
+      'the correct stage for consumption during model training. Make '
+      'sure to validate assumptions about where the bitcode is coming '
+      'from before using it in production.')
+  relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
+  make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir,
+                               FLAGS.output_dir)
+  make_corpus_lib.write_corpus_manifest(relative_paths, FLAGS.output_dir,
+                                        FLAGS.default_args.split())
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py
new file mode 100644
index 00000000000000..3598fc12a04d14
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus_lib.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library functions for making a corpus from arbitrary bitcode."""
+
+import pathlib
+import os
+import shutil
+import json
+
+from typing import List, Optional
+
+BITCODE_EXTENSION = '.bc'
+
+
+def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
+  """Finds bitcode files to extract from a given directory.
+
+  Args:
+    bitcode_base_dir: The base directory where the bitcode to be copied
+      is from.
+    output_dir: The directory to place the bitcode in.
+
+  Returns an array of paths representing the relative path to the bitcode
+  file from the base directory.
+  """
+  paths = [
+      str(p)[:-len(BITCODE_EXTENSION)]
+      for p in pathlib.Path(bitcode_base_dir).glob('**/*' + BITCODE_EXTENSION)
+  ]
+
+  return [
+      os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths
+  ]
+
+
+def copy_bitcode(relative_paths: List[str], bitcode_base_dir: str,
+                 output_dir: str) -> None:
+  """Copies bitcode files from the base directory to the output directory.
+
+  Args:
+    relative_paths: An array of relative paths to bitcode files that are copied
+      over to the output directory, preserving relative location.
+    bitcode_base_dir: The base directory where the bitcode is located.
+    output_dir: The output directory to place the bitcode in.
+  """
+  for relative_path in relative_paths:
+    base_path = os.path.join(bitcode_base_dir,
+                             relative_path + BITCODE_EXTENSION)
+    destination_path = os.path.join(output_dir,
+                                    relative_path + BITCODE_EXTENSION)
+    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+    shutil.copy(base_path, destination_path)
+
+
+def write_corpus_manifest(relative_output_paths: List[str],
+                          output_dir: str,
+                          default_args: Optional[List[str]] = None) -> None:
+  """Creates a corpus manifest describing the bitcode that has been found.
+
+  Args:
+    relative_output_paths: A list of paths to each bitcode file relative to the
+      output directory.
+    output_dir: The output directory where the corpus is being created.
+    default_args: An array of compiler flags that should be used to compile
+      the bitcode when using further downstream tooling."""
+  if default_args is None:
+    default_args = []
+  corpus_description = {
+      'global_command_override': default_args,
+      'has_thinlto': False,
+      'modules': [path for path in relative_output_paths if path is not None]
+  }
+
+  with open(
+      os.path.join(output_dir, 'corpus_description.json'),
+      'w',
+      encoding='utf-8') as description_file:
+    json.dump(corpus_description, description_file, indent=2)
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py
new file mode 100644
index 00000000000000..8ed598695d06ee
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus_test.py
@@ -0,0 +1,66 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test for compiler_opt.tools.make_corpus_lib"""
+
+import json
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools import make_corpus_lib
+
+
+class MakeCorpusTest(absltest.TestCase):
+
+  def test_load_bitcode_from_directory(self):
+    outer = self.create_tempdir()
+    tempdir = outer.mkdir(dir_path='nested')
+    tempdir.create_file('test1.bc')
+    tempdir.create_file('test2.bc')
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
+    relative_paths = sorted(relative_paths)
+    self.assertEqual(relative_paths[0], 'nested/test1')
+    self.assertEqual(relative_paths[1], 'nested/test2')
+
+  def test_copy_bitcode(self):
+    build_dir = self.create_tempdir()
+    nested_dir = build_dir.mkdir(dir_path='nested')
+    nested_dir.create_file('test1.bc')
+    nested_dir.create_file('test2.bc')
+    relative_paths = ['nested/test1', 'nested/test2']
+    corpus_dir = self.create_tempdir()
+    make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
+    output_files = sorted(os.listdir(os.path.join(corpus_dir, './nested')))
+    self.assertEqual(output_files[0], 'test1.bc')
+    self.assertEqual(output_files[1], 'test2.bc')
+
+  def test_write_corpus_manifest(self):
+    relative_output_paths = ['test/test1', 'test/test2']
+    output_dir = self.create_tempdir()
+    default_args = ['-O3', '-c']
+    make_corpus_lib.write_corpus_manifest(relative_output_paths, output_dir,
+                                          default_args)
+    with open(
+        os.path.join(output_dir, 'corpus_description.json'),
+        encoding='utf-8') as corpus_description_file:
+      corpus_description = json.load(corpus_description_file)
+    self.assertEqual(corpus_description['global_command_override'],
+                     default_args)
+    self.assertEqual(corpus_description['has_thinlto'], False)
+    self.assertEqual(corpus_description['modules'], relative_output_paths)
+
+
+if __name__ == '__main__':
+  absltest.main()

>From 3f8d1e7052734979806d94cccfde5a8a05f6dece Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 14 Jan 2024 21:14:47 -0800
Subject: [PATCH 2/7] Add proper copyright headers

---
 llvm/py/src/mlgo/combine_training_corpus.py     | 17 +++--------------
 llvm/py/src/mlgo/combine_training_corpus_lib.py | 17 +++--------------
 .../py/src/mlgo/combine_training_corpus_test.py | 17 +++--------------
 llvm/py/src/mlgo/extract_ir.py                  | 17 +++--------------
 llvm/py/src/mlgo/extract_ir_lib.py              | 17 +++--------------
 llvm/py/src/mlgo/extract_ir_test.py             | 17 +++--------------
 llvm/py/src/mlgo/make_corpus.py                 | 17 +++--------------
 llvm/py/src/mlgo/make_corpus_lib.py             | 17 +++--------------
 llvm/py/src/mlgo/make_corpus_test.py            | 17 +++--------------
 9 files changed, 27 insertions(+), 126 deletions(-)

diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py
index 94ee1cbac9cea4..e62bcb61e9d9e1 100644
--- a/llvm/py/src/mlgo/combine_training_corpus.py
+++ b/llvm/py/src/mlgo/combine_training_corpus.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 r"""Combine multiple training corpus into a single training corpus.
 
 Currently only support the case that multiple corpus share the same
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py
index 0359961266a240..1050e5099ae21c 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_lib.py
+++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Library for combining training corpora."""
 
 import os
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py
index 47dd602967b68f..3c793947db139e 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_test.py
+++ b/llvm/py/src/mlgo/combine_training_corpus_test.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Tests for combining training corpora."""
 
 import json
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py
index 2a1ef3978888d6..58e31a0475e124 100644
--- a/llvm/py/src/mlgo/extract_ir.py
+++ b/llvm/py/src/mlgo/extract_ir.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Extract IR for training.
 
 Extract IR for training, either from a compile_commands.json file produced by
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py
index c1d2a54b9a9e7c..83d2b26d1f71ce 100644
--- a/llvm/py/src/mlgo/extract_ir_lib.py
+++ b/llvm/py/src/mlgo/extract_ir_lib.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Library functions for IR extraction."""
 
 import os
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py
index 8811134aab4fce..d7de50530032cc 100644
--- a/llvm/py/src/mlgo/extract_ir_test.py
+++ b/llvm/py/src/mlgo/extract_ir_test.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Tests for compiler_opt.tools.extract_ir."""
 
 # pylint: disable=protected-access
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py
index 24493d894be723..989d9790b5bcd9 100644
--- a/llvm/py/src/mlgo/make_corpus.py
+++ b/llvm/py/src/mlgo/make_corpus.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Tool for making a corpus from arbitrary bitcode.
 
 To create a corpus from a set of bitcode files in an input directory, run
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py
index 3598fc12a04d14..97db20a9859e17 100644
--- a/llvm/py/src/mlgo/make_corpus_lib.py
+++ b/llvm/py/src/mlgo/make_corpus_lib.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Library functions for making a corpus from arbitrary bitcode."""
 
 import pathlib
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py
index 8ed598695d06ee..fcb861ebb91f32 100644
--- a/llvm/py/src/mlgo/make_corpus_test.py
+++ b/llvm/py/src/mlgo/make_corpus_test.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Test for compiler_opt.tools.make_corpus_lib"""
 
 import json

>From 2bc8ac318e02672f4bfe87df8cbe19a1c00205dc Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 05:37:27 +0000
Subject: [PATCH 3/7] Format files using black

---
 llvm/py/src/mlgo/combine_training_corpus.py   |  12 +-
 .../src/mlgo/combine_training_corpus_lib.py   |  48 +-
 .../src/mlgo/combine_training_corpus_test.py  | 154 ++--
 llvm/py/src/mlgo/extract_ir.py                | 194 +++---
 llvm/py/src/mlgo/extract_ir_lib.py            | 655 +++++++++---------
 llvm/py/src/mlgo/extract_ir_test.py           | 447 ++++++------
 llvm/py/src/mlgo/make_corpus.py               |  43 +-
 llvm/py/src/mlgo/make_corpus_lib.py           | 106 ++-
 llvm/py/src/mlgo/make_corpus_test.py          |  81 ++-
 9 files changed, 923 insertions(+), 817 deletions(-)

diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py
index e62bcb61e9d9e1..c14c9381a18a6b 100644
--- a/llvm/py/src/mlgo/combine_training_corpus.py
+++ b/llvm/py/src/mlgo/combine_training_corpus.py
@@ -28,17 +28,17 @@
 
 from compiler_opt.tools import combine_training_corpus_lib
 
-flags.DEFINE_string('root_dir', '', 'root dir of module paths to combine.')
+flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
 
 FLAGS = flags.FLAGS
 
 
 def main(argv):
-  if len(argv) > 1:
-    raise app.UsageError('Too many command-line arguments.')
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
 
-  combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+    combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
 
 
-if __name__ == '__main__':
-  app.run(main)
+if __name__ == "__main__":
+    app.run(main)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py
index 1050e5099ae21c..1de182e4cb80dd 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_lib.py
+++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py
@@ -10,30 +10,30 @@
 
 import tensorflow as tf
 
-_FILE_NAME = 'corpus_description.json'
+_FILE_NAME = "corpus_description.json"
 
 
 def combine_corpus(root_dir: str) -> None:
-  module_names = []
-  output_corpus_description = {}
-
-  corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME)
-  for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
-    logging.info('processing %s', corpus_description_path)
-
-    with tf.io.gfile.GFile(corpus_description_path, 'r') as f:
-      corpus_description = json.load(f)
-      sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
-      module_names.extend([
-          os.path.join(sub_dir, name) for name in corpus_description['modules']
-      ])
-      del corpus_description['modules']
-      if len(output_corpus_description) == 0:
-        output_corpus_description = corpus_description
-      elif corpus_description != output_corpus_description:
-        raise ValueError('Input corpora differ by more than modules.')
-
-  output_corpus_description['modules'] = module_names
-
-  with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), 'w') as f:
-    json.dump(output_corpus_description, f, indent=2)
+    module_names = []
+    output_corpus_description = {}
+
+    corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
+    for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+        logging.info("processing %s", corpus_description_path)
+
+        with tf.io.gfile.GFile(corpus_description_path, "r") as f:
+            corpus_description = json.load(f)
+            sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+            module_names.extend(
+                [os.path.join(sub_dir, name) for name in corpus_description["modules"]]
+            )
+            del corpus_description["modules"]
+            if len(output_corpus_description) == 0:
+                output_corpus_description = corpus_description
+            elif corpus_description != output_corpus_description:
+                raise ValueError("Input corpora differ by more than modules.")
+
+    output_corpus_description["modules"] = module_names
+
+    with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), "w") as f:
+        json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py
index 3c793947db139e..969d8472964971 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_test.py
+++ b/llvm/py/src/mlgo/combine_training_corpus_test.py
@@ -12,82 +12,88 @@
 
 
 class CombineTrainingCorpusTest(absltest.TestCase):
+    def test_combine_corpus(self):
+        corpus_dir = self.create_tempdir()
+        subcorpus1_dir = corpus_dir.mkdir(dir_path="subcorpus1")
+        subcorpus2_dir = corpus_dir.mkdir(dir_path="subcorpus2")
+        subcorpus1_description = {
+            "has_thinlto": False,
+            "modules": ["test1.o", "test2.o"],
+        }
+        subcorpus2_description = {
+            "has_thinlto": False,
+            "modules": ["test3.o", "test4.o"],
+        }
+        subcorpus1_description_file = subcorpus1_dir.create_file(
+            file_path="corpus_description.json"
+        )
+        subcorpus2_description_file = subcorpus2_dir.create_file(
+            file_path="corpus_description.json"
+        )
+        subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+        subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+        combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+        with open(
+            os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+        ) as combined_corpus_description_file:
+            combined_corpus_description = json.load(combined_corpus_description_file)
+        self.assertEqual(combined_corpus_description["has_thinlto"], False)
+        self.assertLen(combined_corpus_description["modules"], 4)
+        self.assertIn("subcorpus1/test1.o", combined_corpus_description["modules"])
+        self.assertIn("subcorpus1/test2.o", combined_corpus_description["modules"])
+        self.assertIn("subcorpus2/test3.o", combined_corpus_description["modules"])
+        self.assertIn("subcorpus2/test4.o", combined_corpus_description["modules"])
 
-  def test_combine_corpus(self):
-    corpus_dir = self.create_tempdir()
-    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
-    subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
-    subcorpus1_description = {
-        'has_thinlto': False,
-        'modules': ['test1.o', 'test2.o']
-    }
-    subcorpus2_description = {
-        'has_thinlto': False,
-        'modules': ['test3.o', 'test4.o']
-    }
-    subcorpus1_description_file = subcorpus1_dir.create_file(
-        file_path='corpus_description.json')
-    subcorpus2_description_file = subcorpus2_dir.create_file(
-        file_path='corpus_description.json')
-    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
-    subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
-    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
-    with open(
-        os.path.join(corpus_dir, 'corpus_description.json'),
-        encoding='utf-8') as combined_corpus_description_file:
-      combined_corpus_description = json.load(combined_corpus_description_file)
-    self.assertEqual(combined_corpus_description['has_thinlto'], False)
-    self.assertLen(combined_corpus_description['modules'], 4)
-    self.assertIn('subcorpus1/test1.o', combined_corpus_description['modules'])
-    self.assertIn('subcorpus1/test2.o', combined_corpus_description['modules'])
-    self.assertIn('subcorpus2/test3.o', combined_corpus_description['modules'])
-    self.assertIn('subcorpus2/test4.o', combined_corpus_description['modules'])
+    def test_empty_folder(self):
+        corpus_dir = self.create_tempdir()
+        subcorpus1_dir = corpus_dir.mkdir(dir_path="subcorpus1")
+        _ = corpus_dir.mkdir(dir_path="empty_dir")
+        subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
+        subcorpus1_description_file = subcorpus1_dir.create_file(
+            file_path="corpus_description.json"
+        )
+        subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+        combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+        with open(
+            os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+        ) as combined_corpus_description_file:
+            combined_corpus_description = json.load(combined_corpus_description_file)
+        self.assertLen(combined_corpus_description["modules"], 2)
 
-  def test_empty_folder(self):
-    corpus_dir = self.create_tempdir()
-    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
-    _ = corpus_dir.mkdir(dir_path='empty_dir')
-    subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
-    subcorpus1_description_file = subcorpus1_dir.create_file(
-        file_path='corpus_description.json')
-    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
-    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
-    with open(
-        os.path.join(corpus_dir, 'corpus_description.json'),
-        encoding='utf-8') as combined_corpus_description_file:
-      combined_corpus_description = json.load(combined_corpus_description_file)
-    self.assertLen(combined_corpus_description['modules'], 2)
+    def test_ignore_extra_file(self):
+        corpus_dir = self.create_tempdir()
+        subcorpus1_dir = corpus_dir.mkdir(dir_path="subcorpus1")
+        _ = corpus_dir.create_file(file_path="empty.log")
+        subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
+        subcorpus1_description_file = subcorpus1_dir.create_file(
+            file_path="corpus_description.json"
+        )
+        subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+        combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+        with open(
+            os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+        ) as combined_corpus_description_file:
+            combined_corpus_description = json.load(combined_corpus_description_file)
+        self.assertLen(combined_corpus_description["modules"], 2)
 
-  def test_ignore_extra_file(self):
-    corpus_dir = self.create_tempdir()
-    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
-    _ = corpus_dir.create_file(file_path='empty.log')
-    subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
-    subcorpus1_description_file = subcorpus1_dir.create_file(
-        file_path='corpus_description.json')
-    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
-    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
-    with open(
-        os.path.join(corpus_dir, 'corpus_description.json'),
-        encoding='utf-8') as combined_corpus_description_file:
-      combined_corpus_description = json.load(combined_corpus_description_file)
-    self.assertLen(combined_corpus_description['modules'], 2)
+    def test_different_corpora(self):
+        corpus_dir = self.create_tempdir()
+        subcorpus1_dir = corpus_dir.mkdir(dir_path="subcorpus1")
+        subcorpus2_dir = corpus_dir.mkdir(dir_path="subcorpus2")
+        subcorpus1_description = {"has_thinlto": False, "modules": ["test1.o"]}
+        subcorpus2_description = {"has_thinlto": True, "modules": ["test2.o"]}
+        subcorpus1_description_file = subcorpus1_dir.create_file(
+            file_path="corpus_description.json"
+        )
+        subcorpus2_description_file = subcorpus2_dir.create_file(
+            file_path="corpus_description.json"
+        )
+        subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+        subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+        self.assertRaises(
+            ValueError, combine_training_corpus_lib.combine_corpus, corpus_dir.full_path
+        )
 
-  def test_different_corpora(self):
-    corpus_dir = self.create_tempdir()
-    subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
-    subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
-    subcorpus1_description = {'has_thinlto': False, 'modules': ['test1.o']}
-    subcorpus2_description = {'has_thinlto': True, 'modules': ['test2.o']}
-    subcorpus1_description_file = subcorpus1_dir.create_file(
-        file_path='corpus_description.json')
-    subcorpus2_description_file = subcorpus2_dir.create_file(
-        file_path='corpus_description.json')
-    subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
-    subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
-    self.assertRaises(ValueError, combine_training_corpus_lib.combine_corpus,
-                      corpus_dir.full_path)
 
-
-if __name__ == '__main__':
-  absltest.main()
+if __name__ == "__main__":
+    absltest.main()
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py
index 58e31a0475e124..395a298ecec81d 100644
--- a/llvm/py/src/mlgo/extract_ir.py
+++ b/llvm/py/src/mlgo/extract_ir.py
@@ -34,98 +34,128 @@
 from compiler_opt.tools import extract_ir_lib
 
 flags.DEFINE_string(
-    'input', None,
-    'Input file or directory - either compile_commands.json, a linker parameter'
-    'list, or a path to a directory containing object files.')
+    "input",
+    None,
+    "Input file or directory - either compile_commands.json, a linker parameter"
+    "list, or a path to a directory containing object files.",
+)
 flags.DEFINE_enum(
-    'input_type', 'json', ['json', 'params', 'directory'],
-    'Input file type - json, params, or directory. params latter refers to lld'
-    'params.')
-flags.DEFINE_string('output_dir', None, 'Output directory')
+    "input_type",
+    "json",
+    ["json", "params", "directory"],
+    "Input file type - json, params, or directory. params latter refers to lld"
+    "params.",
+)
+flags.DEFINE_string("output_dir", None, "Output directory")
 flags.DEFINE_integer(
-    'num_workers', None,
-    'Number of parallel workers for objcopy. `None` for maximum available.')
-flags.DEFINE_string('llvm_objcopy_path', 'llvm-objcopy', 'Path to llvm-objcopy')
+    "num_workers",
+    None,
+    "Number of parallel workers for objcopy. `None` for maximum available.",
+)
+flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
 flags.DEFINE_string(
-    'obj_base_dir', '',
-    'Base directory for object files. Defaults to current working dir.')
+    "obj_base_dir",
+    "",
+    "Base directory for object files. Defaults to current working dir.",
+)
 flags.DEFINE_string(
-    'cmd_filter', None,
-    'Include only those modules with a command line matching this regexp. '
-    'Setting it to None for not filtering. Note that the regexp is applied '
-    'independently for each separate command line option. For example, ^-Oz$ '
-    'will match Oz - built binaries. Does not work with thinlto_build=lld.')
+    "cmd_filter",
+    None,
+    "Include only those modules with a command line matching this regexp. "
+    "Setting it to None for not filtering. Note that the regexp is applied "
+    "independently for each separate command line option. For example, ^-Oz$ "
+    "will match Oz - built binaries. Does not work with thinlto_build=lld.",
+)
 flags.DEFINE_enum(
-    'thinlto_build', None, ['distributed', 'local'],
-    'Set if the build was performed with either \'distributed\' or '
-    '\'local\' ThinLTO. This ensures the thinlto.bc files are also copied. '
-    'The build is assumed to have had '
-    '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed '
-    'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files '
-    'passed in the local case.')
+    "thinlto_build",
+    None,
+    ["distributed", "local"],
+    "Set if the build was performed with either 'distributed' or "
+    "'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
+    "The build is assumed to have had "
+    "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
+    "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
+    "passed in the local case.",
+)
 flags.DEFINE_string(
-    'cmd_section_name', '.llvmcmd',
-    'The section name passed to llvm-objcopy. For ELF object files, the '
-    'default .llvmcmd is correct. For Mach-O object files, one should use '
-    'something like __LLVM,__cmdline')
+    "cmd_section_name",
+    ".llvmcmd",
+    "The section name passed to llvm-objcopy. For ELF object files, the "
+    "default .llvmcmd is correct. For Mach-O object files, one should use "
+    "something like __LLVM,__cmdline",
+)
 flags.DEFINE_string(
-    'bitcode_section_name', '.llvmbc',
-    'The section name passed to llvm-objcopy. For ELF object files, the '
-    'default .llvmbc is correct. For Mach-O object files, one should use '
-    '__LLVM,__bitcode')
+    "bitcode_section_name",
+    ".llvmbc",
+    "The section name passed to llvm-objcopy. For ELF object files, the "
+    "default .llvmbc is correct. For Mach-O object files, one should use "
+    "__LLVM,__bitcode",
+)
 
-flags.mark_flag_as_required('output_dir')
+flags.mark_flag_as_required("output_dir")
 
 FLAGS = flags.FLAGS
 
 
 def main(argv):
-  if len(argv) > 1:
-    raise app.UsageError('Too many command-line arguments.')
-
-  objs = []
-  if FLAGS.input is not None and FLAGS.thinlto_build == 'local':
-    raise ValueError('--thinlto_build=local cannot be run with --input')
-  if FLAGS.input is None:
-    if FLAGS.thinlto_build != 'local':
-      raise ValueError('--input or --thinlto_build=local must be provided')
-    objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir,
-                                               FLAGS.output_dir)
-  elif FLAGS.input_type == 'json':
-    with open(FLAGS.input, encoding='utf-8') as f:
-      objs = extract_ir_lib.load_from_compile_commands(
-          json.load(f), FLAGS.output_dir)
-  elif FLAGS.input_type == 'params':
-    if not FLAGS.obj_base_dir:
-      logging.info(
-          '-obj_base_dir is unspecified, assuming current directory.'
-          'If no objects are found, use this option to specify the root'
-          'directory for the object file paths in the input file.')
-    with open(FLAGS.input, encoding='utf-8') as f:
-      objs = extract_ir_lib.load_from_lld_params(
-          [l.strip() for l in f.readlines()], FLAGS.obj_base_dir,
-          FLAGS.output_dir)
-  elif FLAGS.input_type == 'directory':
-    logging.warning(
-        'Using the directory input is only recommended if the build system'
-        'your project uses does not support any structured output that'
-        'ml-compiler-opt understands. If your build system provides a'
-        'structured compilation database, use that instead')
-    objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
-  else:
-    logging.error('Unknown input type: %s', FLAGS.input_type)
-
-  relative_output_paths = extract_ir_lib.run_extraction(
-      objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
-      FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name)
-
-  extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build,
-                                       relative_output_paths, FLAGS.output_dir)
-
-  logging.info('Converted %d files out of %d',
-               len(objs) - relative_output_paths.count(None), len(objs))
-
-
-if __name__ == '__main__':
-  multiprocessing.set_start_method('fork')
-  app.run(main)
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+
+    objs = []
+    if FLAGS.input is not None and FLAGS.thinlto_build == "local":
+        raise ValueError("--thinlto_build=local cannot be run with --input")
+    if FLAGS.input is None:
+        if FLAGS.thinlto_build != "local":
+            raise ValueError("--input or --thinlto_build=local must be provided")
+        objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
+    elif FLAGS.input_type == "json":
+        with open(FLAGS.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_compile_commands(
+                json.load(f), FLAGS.output_dir
+            )
+    elif FLAGS.input_type == "params":
+        if not FLAGS.obj_base_dir:
+            logging.info(
+                "-obj_base_dir is unspecified, assuming current directory."
+                "If no objects are found, use this option to specify the root"
+                "directory for the object file paths in the input file."
+            )
+        with open(FLAGS.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_lld_params(
+                [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir
+            )
+    elif FLAGS.input_type == "directory":
+        logging.warning(
+            "Using the directory input is only recommended if the build system"
+            "your project uses does not support any structured output that"
+            "ml-compiler-opt understands. If your build system provides a"
+            "structured compilation database, use that instead"
+        )
+        objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+    else:
+        logging.error("Unknown input type: %s", FLAGS.input_type)
+
+    relative_output_paths = extract_ir_lib.run_extraction(
+        objs,
+        FLAGS.num_workers,
+        FLAGS.llvm_objcopy_path,
+        FLAGS.cmd_filter,
+        FLAGS.thinlto_build,
+        FLAGS.cmd_section_name,
+        FLAGS.bitcode_section_name,
+    )
+
+    extract_ir_lib.write_corpus_manifest(
+        FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
+    )
+
+    logging.info(
+        "Converted %d files out of %d",
+        len(objs) - relative_output_paths.count(None),
+        len(objs),
+    )
+
+
+if __name__ == "__main__":
+    multiprocessing.set_start_method("fork")
+    app.run(main)
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py
index 83d2b26d1f71ce..ce6a4a17a8e6ac 100644
--- a/llvm/py/src/mlgo/extract_ir_lib.py
+++ b/llvm/py/src/mlgo/extract_ir_lib.py
@@ -22,341 +22,374 @@
 # TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a
 # \0 - separated list of strings, to a \n one.
 def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
-  """Determine if the module should be included."""
-  if match_regexp is None:
-    return True
-  lines = cmdline.split('\0')
-  return any(len(re.findall(match_regexp, l)) for l in lines)
+    """Determine if the module should be included."""
+    if match_regexp is None:
+        return True
+    lines = cmdline.split("\0")
+    return any(len(re.findall(match_regexp, l)) for l in lines)
 
 
 def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
-  opts = cmdline.split('\0')
-  for option in opts:
-    if option.startswith('-fthinlto-index'):
-      return os.path.join(basedir, option.split('=')[1])
-  return None
+    opts = cmdline.split("\0")
+    for option in opts:
+        if option.startswith("-fthinlto-index"):
+            return os.path.join(basedir, option.split("=")[1])
+    return None
 
 
 class TrainingIRExtractor:
-  """IR and command line extraction from an object file."""
-
-  def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
-    """Set up a TrainingIRExtractor.
-
-    Args:
-      obj_relative_path: relative path to the input object file. It will be also
-        used to construct the absolute path of the output IR and cmd files, by
-        appending it to output_base_dir.
-      output_base_dir: the directory under which the output will be produced.
-      obj_base_dir: the base directory for all the input object files.
-    """
-    self._obj_relative_path = obj_relative_path
-    self._output_base_dir = output_base_dir
-    self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''
-
-  def obj_base_dir(self):
-    return self._obj_base_dir
+    """IR and command line extraction from an object file."""
+
+    def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
+        """Set up a TrainingIRExtractor.
+
+        Args:
+          obj_relative_path: relative path to the input object file. It will also be
+            used to construct the absolute path of the output IR and cmd files, by
+            appending it to output_base_dir.
+          output_base_dir: the directory under which the output will be produced.
+          obj_base_dir: the base directory for all the input object files.
+        """
+        self._obj_relative_path = obj_relative_path
+        self._output_base_dir = output_base_dir
+        self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""
+
+    def obj_base_dir(self):
+        return self._obj_base_dir
+
+    def output_base_dir(self):
+        return self._output_base_dir
+
+    def relative_output_path(self):
+        return self._obj_relative_path
+
+    def input_obj(self):
+        return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+
+    def lld_src_bc(self):
+        # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+        # IR bitcode saved by lld. It is hardcoded into lld.
+        return os.path.join(
+            self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
+        )
+
+    def lld_src_thinlto(self):
+        return os.path.join(self._obj_base_dir, self._obj_relative_path + ".thinlto.bc")
+
+    def dest_dir(self):
+        return os.path.join(
+            self.output_base_dir(), os.path.dirname(self._obj_relative_path)
+        )
+
+    def module_name(self):
+        return os.path.basename(self._obj_relative_path)
+
+    def cmd_file(self):
+        return os.path.join(self.dest_dir(), self.module_name() + ".cmd")
+
+    def bc_file(self):
+        return os.path.join(self.dest_dir(), self.module_name() + ".bc")
+
+    def thinlto_index_file(self):
+        return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")
+
+    def _get_extraction_cmd_command(
+        self, llvm_objcopy_path: str, cmd_section_name: str
+    ):
+        """Get llvm-objcopy and process args to produce a command string that,
+        when invoked, will extract the cmd section into the self.cmd_file() file.
+        """
+        return [
+            llvm_objcopy_path,
+            "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
+            self.input_obj(),
+            "/dev/null",
+        ]
+
+    def _get_extraction_bc_command(
+        self, llvm_objcopy_path: str, bitcode_section_name: str
+    ):
+        """Gets llvm-objcopy and process args to produce a command string that,
+        when invoked, will extract the bitcode section into the self.bc_file()
+        file.
+        """
+        return [
+            llvm_objcopy_path,
+            "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
+            self.input_obj(),
+            "/dev/null",
+        ]
+
+    def _extract_clang_artifacts(
+        self,
+        llvm_objcopy_path: str,
+        cmd_filter: str,
+        is_thinlto: bool,
+        cmd_section_name: str,
+        bitcode_section_name: str,
+    ) -> Optional[str]:
+        """Run llvm-objcopy to extract the .bc and command line."""
+        if not os.path.exists(self.input_obj()):
+            logging.info("%s does not exist.", self.input_obj())
+            return None
+        os.makedirs(self.dest_dir(), exist_ok=True)
+        try:
+            subprocess.check_output(
+                self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+                stderr=subprocess.STDOUT,
+                encoding="utf-8",
+            )
+            if cmd_filter is not None or is_thinlto:
+                with open(self.cmd_file(), encoding="utf-8") as f:
+                    lines = f.readlines()
+                assert len(lines) == 1
+                cmdline = lines[0]
+                if not should_include_module(cmdline, cmd_filter):
+                    logging.info(
+                        "Excluding module %s because it does not match the filter",
+                        self.input_obj(),
+                    )
+                    os.remove(self.cmd_file())
+                    return None
+                if is_thinlto:
+                    index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+                    shutil.copy(index_file, self.thinlto_index_file())
+
+            subprocess.check_output(
+                self._get_extraction_bc_command(
+                    llvm_objcopy_path, bitcode_section_name
+                ),
+                stderr=subprocess.STDOUT,
+                encoding="utf-8",
+            )
+        except subprocess.CalledProcessError as e:
+            # This may happen if the .o file was built from asm (.S source).
+            logging.warning("%s was not processed: %s", self.input_obj(), e)
+            logging.info(e.output)
+            return None
+        assert (
+            os.path.exists(self.cmd_file())
+            and os.path.exists(self.bc_file())
+            and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
+        )
+        return self.relative_output_path()
+
+    def _extract_lld_artifacts(self) -> Optional[str]:
+        """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation."""
+        if not os.path.exists(self.lld_src_bc()):
+            logging.info("%s does not exist.", self.lld_src_bc())
+            return None
+        if not os.path.exists(self.lld_src_thinlto()):
+            logging.info("%s does not exist.", self.lld_src_thinlto())
+            return None
+        os.makedirs(self.dest_dir(), exist_ok=True)
+
+        # Copy over the files
+        shutil.copy(self.lld_src_bc(), self.bc_file())
+        shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
+
+        assert os.path.exists(self.bc_file())
+        assert os.path.exists(self.thinlto_index_file())
+        return self._obj_relative_path
+
+    def extract(
+        self,
+        llvm_objcopy_path: Optional[str] = None,
+        cmd_filter: Optional[str] = None,
+        thinlto_build: Optional[str] = None,
+        cmd_section_name: Optional[str] = ".llvmcmd",
+        bitcode_section_name: Optional[str] = ".llvmbc",
+    ) -> Optional[str]:
+        if thinlto_build == "local":
+            return self._extract_lld_artifacts()
+        return self._extract_clang_artifacts(
+            llvm_objcopy_path=llvm_objcopy_path,
+            cmd_filter=cmd_filter,
+            is_thinlto=thinlto_build == "distributed",
+            cmd_section_name=cmd_section_name,
+            bitcode_section_name=bitcode_section_name,
+        )
 
-  def output_base_dir(self):
-    return self._output_base_dir
 
-  def relative_output_path(self):
-    return self._obj_relative_path
+def convert_compile_command_to_objectfile(
+    command: Dict[str, str], output_dir: str
+) -> Optional[TrainingIRExtractor]:
+    obj_base_dir = command["directory"]
+    if "arguments" in command:
+        cmd_parts = command["arguments"]
+    elif "command" in command:
+        cmd_parts = command["command"].split()
+    else:
+        logging.info("compile_commands element has no command and arguments")
+        return None
 
-  def input_obj(self):
-    return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+    try:
+        obj_index = cmd_parts.index("-o") + 1
+    except ValueError:
+        # This could happen if there are non-clang commands in compile_commands.json
+        logging.info("Command has no -o option: %s", " ".join(cmd_parts))
+        return None
+    obj_rel_path = cmd_parts[obj_index]
+    # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
+    return TrainingIRExtractor(
+        obj_relative_path=obj_rel_path,
+        output_base_dir=output_dir,
+        obj_base_dir=obj_base_dir,
+    )
 
-  def lld_src_bc(self):
-    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
-    # IR bitcode saved by lld. It is hardcoded into lld.
-    return os.path.join(self._obj_base_dir,
-                        self._obj_relative_path + '.3.import.bc')
 
-  def lld_src_thinlto(self):
-    return os.path.join(self._obj_base_dir,
-                        self._obj_relative_path + '.thinlto.bc')
+def load_from_compile_commands(
+    json_array: List[Dict[str, str]], output_dir: str
+) -> List[TrainingIRExtractor]:
+    objs = [
+        convert_compile_command_to_objectfile(cmd, output_dir) for cmd in json_array
+    ]
+    # Filter out None, in case there were non-clang commands in the .json
+    return [obj for obj in objs if obj is not None]
 
-  def dest_dir(self):
-    return os.path.join(self.output_base_dir(),
-                        os.path.dirname(self._obj_relative_path))
 
-  def module_name(self):
-    return os.path.basename(self._obj_relative_path)
+def load_from_lld_params(
+    params_array: List[str], obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+    """Create an ObjectFile array based on lld's parameters."""
+    # yank out -o and the output. After that, anything not starting with '-', and
+    # ending in a '.o', is an object file.
+    try:
+        minus_o_idx = params_array.index("-o")
+        del params_array[minus_o_idx : minus_o_idx + 2]
+        just_obj_paths = [
+            o for o in params_array if not o.startswith("-") and o.endswith(".o")
+        ]
+    except ValueError:
+        logging.info("This params file does not have an explicit -o option.")
+        just_obj_paths = params_array
+
+    def make_obj(obj_file: str) -> TrainingIRExtractor:
+        return TrainingIRExtractor(
+            obj_relative_path=obj_file,
+            output_base_dir=output_dir,
+            obj_base_dir=obj_base_dir,
+        )
+
+    return [make_obj(obj_file) for obj_file in just_obj_paths]
+
+
+def load_from_directory(
+    obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+    """Create an object file array by globbing an entire directory.
 
-  def cmd_file(self):
-    return os.path.join(self.dest_dir(), self.module_name() + '.cmd')
+    Args:
+      obj_base_dir: The base build directory that all object files will be
+        written out as being relative to.
+      output_dir: The output directory where extracted .bc and .cmd files should
+        be placed.
+    """
+    paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.o")]
 
-  def bc_file(self):
-    return os.path.join(self.dest_dir(), self.module_name() + '.bc')
+    def make_spec(obj_file: str):
+        return TrainingIRExtractor(
+            obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir),
+            output_base_dir=output_dir,
+            obj_base_dir=obj_base_dir,
+        )
 
-  def thinlto_index_file(self):
-    return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc')
+    return [make_spec(path) for path in paths]
 
-  def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
-                                  cmd_section_name: str):
-    """Get llvm-objcopy and process args to a produce a command string that,
-    when invoked, will extract the cmd section info ths self.cmd_file() file.
-    """
-    return [
-        llvm_objcopy_path,
-        '--dump-section=' + cmd_section_name + '=' + self.cmd_file(),
-        self.input_obj(), '/dev/null'
-    ]
 
-  def _get_extraction_bc_command(self, llvm_objcopy_path: str,
-                                 bitcode_section_name: str):
-    """Gets llvm-objcopy and process args to produce a command string that,
-    when invoked, will extract the bitcode section into the self.bc_file()
-    file.
-    """
-    return [
-        llvm_objcopy_path,
-        '--dump-section=' + bitcode_section_name + '=' + self.bc_file(),
-        self.input_obj(), '/dev/null'
-    ]
+def load_for_lld_thinlto(
+    obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+    # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
+    # are also emitted next to the postimport bitcode, with the suffix
+    # .thinlto.bc instead
+    paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.3.import.bc")]
+
+    def make_spec(obj_file: str):
+        return TrainingIRExtractor(
+            # Cut away .3.import.bc
+            obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
+            output_base_dir=output_dir,
+            obj_base_dir=obj_base_dir,
+        )
+
+    return [make_spec(path) for path in paths]
+
+
+def run_extraction(
+    objs: List[TrainingIRExtractor],
+    num_workers: int,
+    llvm_objcopy_path: str,
+    cmd_filter: str,
+    thinlto_build: str,
+    cmd_section_name: str,
+    bitcode_section_name: str,
+):
+    """Extracts all specified object files into the corpus directory.
 
-  def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str,
-                               is_thinlto: bool, cmd_section_name: str,
-                               bitcode_section_name: str) -> Optional[str]:
-    """Run llvm-objcopy to extract the .bc and command line."""
-    if not os.path.exists(self.input_obj()):
-      logging.info('%s does not exist.', self.input_obj())
-      return None
-    os.makedirs(self.dest_dir(), exist_ok=True)
-    try:
-      subprocess.check_output(
-          self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
-          stderr=subprocess.STDOUT,
-          encoding='utf-8')
-      if cmd_filter is not None or is_thinlto:
-        with open(self.cmd_file(), encoding='utf-8') as f:
-          lines = f.readlines()
-        assert len(lines) == 1
-        cmdline = lines[0]
-        if not should_include_module(cmdline, cmd_filter):
-          logging.info(
-              'Excluding module %s because it does not match the filter',
-              self.input_obj())
-          os.remove(self.cmd_file())
-          return None
-        if is_thinlto:
-          index_file = get_thinlto_index(cmdline, self.obj_base_dir())
-          shutil.copy(index_file, self.thinlto_index_file())
-
-      subprocess.check_output(
-          self._get_extraction_bc_command(llvm_objcopy_path,
-                                          bitcode_section_name),
-          stderr=subprocess.STDOUT,
-          encoding='utf-8')
-    except subprocess.CalledProcessError as e:
-      # This may happen if  .o file was build from asm (.S source).
-      logging.warning('%s was not processed: %s', self.input_obj(), e)
-      logging.info(e.output)
-      return None
-    assert (os.path.exists(self.cmd_file()) and
-            os.path.exists(self.bc_file()) and
-            (not is_thinlto or os.path.exists(self.thinlto_index_file())))
-    return self.relative_output_path()
-
-  def _extract_lld_artifacts(self) -> Optional[str]:
-    """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
+    Args:
+      objs: A list of TrainingIRExtractor Objects that represent the object files
+        to extract bitcode/commands from.
+      num_workers: The number of parallel processes to spawn to run the
+        extraction.
+      llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
+      cmd_filter: A regular expression that is used to select for compilations
+        performed with specific flags. If you want to include all compilations,
+        set this to None.
+      thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
+        Set this to None if the build was not done with ThinLTO.
+      cmd_section_name: The name of the command line section created by the
+        bitcode embedding.
+      bitcode_section_name: The name of the bitcode section created by the
+        bitcode embedding.
     """
-    if not os.path.exists(self.lld_src_bc()):
-      logging.info('%s does not exist.', self.lld_src_bc())
-      return None
-    if not os.path.exists(self.lld_src_thinlto()):
-      logging.info('%s does not exist.', self.lld_src_thinlto())
-      return None
-    os.makedirs(self.dest_dir(), exist_ok=True)
-
-    # Copy over the files
-    shutil.copy(self.lld_src_bc(), self.bc_file())
-    shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
-
-    assert os.path.exists(self.bc_file())
-    assert os.path.exists(self.thinlto_index_file())
-    return self._obj_relative_path
-
-  def extract(self,
-              llvm_objcopy_path: Optional[str] = None,
-              cmd_filter: Optional[str] = None,
-              thinlto_build: Optional[str] = None,
-              cmd_section_name: Optional[str] = '.llvmcmd',
-              bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]:
-    if thinlto_build == 'local':
-      return self._extract_lld_artifacts()
-    return self._extract_clang_artifacts(
+    extract_artifacts = functools.partial(
+        TrainingIRExtractor.extract,
         llvm_objcopy_path=llvm_objcopy_path,
         cmd_filter=cmd_filter,
-        is_thinlto=thinlto_build == 'distributed',
+        thinlto_build=thinlto_build,
         cmd_section_name=cmd_section_name,
-        bitcode_section_name=bitcode_section_name)
+        bitcode_section_name=bitcode_section_name,
+    )
 
+    with multiprocessing.Pool(num_workers) as pool:
+        relative_output_paths = pool.map(extract_artifacts, objs)
+        pool.close()
+        pool.join()
+    return relative_output_paths
 
-def convert_compile_command_to_objectfile(
-    command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]:
-  obj_base_dir = command['directory']
-  if 'arguments' in command:
-    cmd_parts = command['arguments']
-  elif 'command' in command:
-    cmd_parts = command['command'].split()
-  else:
-    logging.info('compile_commands element has no command and arguments')
-    return None
-
-  try:
-    obj_index = cmd_parts.index('-o') + 1
-  except ValueError:
-    # This could happen if there are non-clang commands in compile_commands.json
-    logging.info('Command has no -o option: %s', ' '.join(cmd_parts))
-    return None
-  obj_rel_path = cmd_parts[obj_index]
-  # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
-  return TrainingIRExtractor(
-      obj_relative_path=obj_rel_path,
-      output_base_dir=output_dir,
-      obj_base_dir=obj_base_dir)
-
-
-def load_from_compile_commands(json_array: List[Dict[str, str]],
-                               output_dir: str) -> List[TrainingIRExtractor]:
-  objs = [
-      convert_compile_command_to_objectfile(cmd, output_dir)
-      for cmd in json_array
-  ]
-  # Filter out None, in case there were non-clang commands in the .json
-  return [obj for obj in objs if obj is not None]
-
-
-def load_from_lld_params(params_array: List[str], obj_base_dir: str,
-                         output_dir: str) -> List[TrainingIRExtractor]:
-  """Create an ObjectFile array based on lld's parameters."""
-  # yank out -o and the output. After that, anything not starting with '-', and
-  # ending in a '.o', is an object file.
-  try:
-    minus_o_idx = params_array.index('-o')
-    del params_array[minus_o_idx:minus_o_idx + 2]
-    just_obj_paths = [
-        o for o in params_array if not o.startswith('-') and o.endswith('.o')
-    ]
-  except ValueError:
-    logging.info('This params file does not have an explicit -o option.')
-    just_obj_paths = params_array
-
-  def make_obj(obj_file: str) -> TrainingIRExtractor:
-    return TrainingIRExtractor(
-        obj_relative_path=obj_file,
-        output_base_dir=output_dir,
-        obj_base_dir=obj_base_dir)
-
-  return [make_obj(obj_file) for obj_file in just_obj_paths]
-
-
-def load_from_directory(obj_base_dir: str,
-                        output_dir: str) -> List[TrainingIRExtractor]:
-  """Create an object file array by globbing an entire drectory.
 
-  Args:
-    obj_base_dir: The base build directory that all object files will be
-      written out as being relative to.
-    output_dir: The output directory where extracted .bc and .cmd files should
-      be placed.
-  """
-  paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.o')]
+def write_corpus_manifest(
+    thinlto_build: str, relative_output_paths: List[str], output_dir: str
+):
+    """Writes a corpus_description.json containing all necessary information about
+    the corpus.
 
-  def make_spec(obj_file: str):
-    return TrainingIRExtractor(
-        obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir),
-        output_base_dir=output_dir,
-        obj_base_dir=obj_base_dir)
-
-  return [make_spec(path) for path in paths]
-
-
-def load_for_lld_thinlto(obj_base_dir: str,
-                         output_dir: str) -> List[TrainingIRExtractor]:
-  # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
-  # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
-  # are also emitted next to the postimport bitcode, with the suffix
-  # .thinlto.bc instead
-  paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc')]
-
-  def make_spec(obj_file: str):
-    return TrainingIRExtractor(
-        # Cut away .3.import.bc
-        obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
-        output_base_dir=output_dir,
-        obj_base_dir=obj_base_dir)
-
-  return [make_spec(path) for path in paths]
-
-
-def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
-                   llvm_objcopy_path: str, cmd_filter: str, thinlto_build: str,
-                   cmd_section_name: str, bitcode_section_name: str):
-  """Extracts all specified object files into the corpus directory.
-
-  Args:
-    objs: A list of TrainingIRExtractor Objects that represent the object files
-      to extract bitcode/commands from.
-    num_workers: The number of parallel processes to spawn to run the
-      extraction.
-    llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
-    cmd_filter: A regular expression that is used to select for compilations
-      performed with specific flags. If you want to include all compilations,
-      set this to None.
-    thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
-      Set this to None if the build was not done with ThinLTO.
-    cmd_section_name: The name of the command line section created by the
-      bitcode embedding.
-    bitcode_section_name: The name of the bitcode section created by the
-      bitcode embedding.
-  """
-  extract_artifacts = functools.partial(
-      TrainingIRExtractor.extract,
-      llvm_objcopy_path=llvm_objcopy_path,
-      cmd_filter=cmd_filter,
-      thinlto_build=thinlto_build,
-      cmd_section_name=cmd_section_name,
-      bitcode_section_name=bitcode_section_name)
-
-  with multiprocessing.Pool(num_workers) as pool:
-    relative_output_paths = pool.map(extract_artifacts, objs)
-    pool.close()
-    pool.join()
-  return relative_output_paths
-
-
-def write_corpus_manifest(thinlto_build: str, relative_output_paths: List[str],
-                          output_dir: str):
-  """Writes a corpus_manifest.json containing all necessary information about
-  the corpus.
-
-  Args:
-    thinlto_build: Whether or not the build was done with ThinLTO and if so,
-      what kind of ThinLTO. Set this to none if the build was not performed with
-      ThinLTO.
-    relative_output_paths: The relative (to the corpus directory) output paths
-      of all the bitcode files that should be placed in the corpus manifest
-    output_dir: The corpus directory where the corpus manifest should be
-      placed.
-  """
-  # This comes first rather than later so global_command_override is at the top
-  # of the .json after being written
-  if thinlto_build == 'local':
-    corpus_description = {
-        'global_command_override': constant.UNSPECIFIED_OVERRIDE
-    }
-  else:
-    corpus_description = {}
-
-  corpus_description.update({
-      'has_thinlto': thinlto_build is not None,
-      'modules': [path for path in relative_output_paths if path is not None]
-  })
-
-  with open(
-      os.path.join(output_dir, 'corpus_description.json'),
-      'w',
-      encoding='utf-8') as f:
-    json.dump(corpus_description, f, indent=2)
+    Args:
+      thinlto_build: Whether or not the build was done with ThinLTO and if so,
+        what kind of ThinLTO. Set this to None if the build was not performed with
+        ThinLTO.
+      relative_output_paths: The relative (to the corpus directory) output paths
+        of all the bitcode files that should be placed in the corpus manifest
+      output_dir: The corpus directory where the corpus manifest should be
+        placed.
+    """
+    # This comes first rather than later so global_command_override is at the top
+    # of the .json after being written
+    if thinlto_build == "local":
+        corpus_description = {"global_command_override": constant.UNSPECIFIED_OVERRIDE}
+    else:
+        corpus_description = {}
+
+    corpus_description.update(
+        {
+            "has_thinlto": thinlto_build is not None,
+            "modules": [path for path in relative_output_paths if path is not None],
+        }
+    )
+
+    with open(
+        os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8"
+    ) as f:
+        json.dump(corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py
index d7de50530032cc..ae9b3b30f9a5c9 100644
--- a/llvm/py/src/mlgo/extract_ir_test.py
+++ b/llvm/py/src/mlgo/extract_ir_test.py
@@ -12,209 +12,246 @@
 
 
 class ExtractIrTest(absltest.TestCase):
+    def test_one_conversion(self):
+        obj = extract_ir_lib.convert_compile_command_to_objectfile(
+            {
+                "directory": "/output/directory",
+                "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
+                "file": "/some/path/lib/foo/bar.cc",
+            },
+            "/corpus/destination/path",
+        )
+        self.assertIsNotNone(obj)
+        # pytype: disable=attribute-error
+        # Pytype complains about obj being None
+        self.assertEqual(obj.input_obj(), "/output/directory/lib/bar.o")
+        self.assertEqual(obj.relative_output_path(), "lib/bar.o")
+        self.assertEqual(obj.cmd_file(), "/corpus/destination/path/lib/bar.o.cmd")
+        self.assertEqual(obj.bc_file(), "/corpus/destination/path/lib/bar.o.bc")
+        self.assertEqual(
+            obj.thinlto_index_file(), "/corpus/destination/path/lib/bar.o.thinlto.bc"
+        )
+        # pytype: enable=attribute-error
 
-  def test_one_conversion(self):
-    obj = extract_ir_lib.convert_compile_command_to_objectfile(
-        {
-            'directory': '/output/directory',
-            'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
-            'file': '/some/path/lib/foo/bar.cc'
-        }, '/corpus/destination/path')
-    self.assertIsNotNone(obj)
-    # pytype: disable=attribute-error
-    # Pytype complains about obj being None
-    self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
-    self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
-    self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
-    self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
-    self.assertEqual(obj.thinlto_index_file(),
-                     '/corpus/destination/path/lib/bar.o.thinlto.bc')
-    # pytype: enable=attribute-error
-
-  def test_one_conversion_arguments_style(self):
-    obj = extract_ir_lib.convert_compile_command_to_objectfile(
-        {
-            'directory': '/output/directory',
-            'arguments':
-                ['-cc1', '-c', '/some/path/lib/foo/bar.cc', '-o', 'lib/bar.o'],
-            'file': '/some/path/lib/foo/bar.cc'
-        }, '/corpus/destination/path')
-    self.assertIsNotNone(obj)
-    # pytype: disable=attribute-error
-    # Pytype complains about obj being None
-    self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
-    self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
-    self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
-    self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
-    self.assertEqual(obj.thinlto_index_file(),
-                     '/corpus/destination/path/lib/bar.o.thinlto.bc')
-    # pytype: enable=attribute-error
-
-  def test_arr_conversion(self):
-    res = extract_ir_lib.load_from_compile_commands([{
-        'directory': '/output/directory',
-        'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
-        'file': '/some/path/lib/foo/bar.cc'
-    }, {
-        'directory': '/output/directory',
-        'command': '-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o',
-        'file': '/some/path/lib/foo/baz.cc'
-    }], '/corpus/destination/path')
-    res = list(res)
-    self.assertLen(res, 2)
-    self.assertEqual(res[0].input_obj(), '/output/directory/lib/bar.o')
-    self.assertEqual(res[0].relative_output_path(), 'lib/bar.o')
-    self.assertEqual(res[0].cmd_file(),
-                     '/corpus/destination/path/lib/bar.o.cmd')
-    self.assertEqual(res[0].bc_file(), '/corpus/destination/path/lib/bar.o.bc')
-    self.assertEqual(res[0].thinlto_index_file(),
-                     '/corpus/destination/path/lib/bar.o.thinlto.bc')
-
-    self.assertEqual(res[1].input_obj(), '/output/directory/lib/other/baz.o')
-    self.assertEqual(res[1].relative_output_path(), 'lib/other/baz.o')
-    self.assertEqual(res[1].cmd_file(),
-                     '/corpus/destination/path/lib/other/baz.o.cmd')
-    self.assertEqual(res[1].bc_file(),
-                     '/corpus/destination/path/lib/other/baz.o.bc')
-    self.assertEqual(res[1].thinlto_index_file(),
-                     '/corpus/destination/path/lib/other/baz.o.thinlto.bc')
-
-  def test_command_extraction(self):
-    obj = extract_ir_lib.TrainingIRExtractor(
-        obj_relative_path='lib/obj_file.o',
-        output_base_dir='/where/corpus/goes',
-        obj_base_dir='/foo/bar')
-    self.assertEqual(
-        obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
-            '/bin/llvm_objcopy_path',
-            '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
-            '/foo/bar/lib/obj_file.o', '/dev/null'
-        ])
-    self.assertEqual(
-        obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
-            '/bin/llvm_objcopy_path',
-            '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
-            '/foo/bar/lib/obj_file.o', '/dev/null'
-        ])
-
-  def test_command_extraction_no_basedir(self):
-    obj = extract_ir_lib.TrainingIRExtractor('lib/obj_file.o',
-                                             '/where/corpus/goes')
-    self.assertEqual(
-        obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
-            '/bin/llvm_objcopy_path',
-            '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
-            'lib/obj_file.o', '/dev/null'
-        ])
-    self.assertEqual(
-        obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
-            '/bin/llvm_objcopy_path',
-            '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
-            'lib/obj_file.o', '/dev/null'
-        ])
-
-  def test_lld_params(self):
-    lld_opts = [
-        '-o', 'output/dir/exe', 'lib/obj1.o', 'somelib.a', '-W,blah',
-        'lib/dir/obj2.o'
-    ]
-    obj = extract_ir_lib.load_from_lld_params(lld_opts, '/some/path',
-                                              '/tmp/out')
-    self.assertLen(obj, 2)
-    self.assertEqual(obj[0].input_obj(), '/some/path/lib/obj1.o')
-    self.assertEqual(obj[0].relative_output_path(), 'lib/obj1.o')
-    self.assertEqual(obj[0].cmd_file(), '/tmp/out/lib/obj1.o.cmd')
-    self.assertEqual(obj[0].thinlto_index_file(),
-                     '/tmp/out/lib/obj1.o.thinlto.bc')
-    self.assertEqual(obj[1].input_obj(), '/some/path/lib/dir/obj2.o')
-
-  def test_load_from_directory(self):
-    tempdir = self.create_tempdir()
-    subdir = tempdir.mkdir(dir_path='subdir')
-    subdir.create_file(file_path='test1.o')
-    subdir.create_file(file_path='test2.o')
-    outdir = self.create_tempdir()
-    objs = extract_ir_lib.load_from_directory(tempdir.full_path,
-                                              outdir.full_path)
-    self.assertLen(objs, 2)
-    for index, obj in enumerate(
-        sorted(objs, key=lambda x: x._obj_relative_path)):
-      self.assertEqual(obj._obj_relative_path, f'subdir/test{index + 1:d}.o')
-      self.assertEqual(obj._obj_base_dir, tempdir.full_path)
-      self.assertEqual(obj._output_base_dir, outdir.full_path)
-
-  def test_lld_thinlto_discovery(self):
-    tempdir = self.create_tempdir()
-    tempdir.create_file(file_path='1.3.import.bc')
-    tempdir.create_file(file_path='2.3.import.bc')
-    tempdir.create_file(file_path='3.3.import.bc')
-    tempdir.create_file(file_path='1.thinlto.bc')
-    tempdir.create_file(file_path='2.thinlto.bc')
-    tempdir.create_file(file_path='3.thinlto.bc')
-    outdir = self.create_tempdir()
-    obj = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path,
-                                              outdir.full_path)
-    self.assertLen(obj, 3)
-    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
-      self.assertEqual(o._obj_relative_path, f'{i + 1:d}')
-      self.assertEqual(o._obj_base_dir, tempdir.full_path)
-      self.assertEqual(o._output_base_dir, outdir.full_path)
-
-  def test_lld_thinlto_discovery_nested(self):
-    outer = self.create_tempdir()
-    tempdir = outer.mkdir(dir_path='nest')
-    tempdir.create_file(file_path='1.3.import.bc')
-    tempdir.create_file(file_path='2.3.import.bc')
-    tempdir.create_file(file_path='3.3.import.bc')
-    tempdir.create_file(file_path='1.thinlto.bc')
-    tempdir.create_file(file_path='2.thinlto.bc')
-    tempdir.create_file(file_path='3.thinlto.bc')
-    outdir = self.create_tempdir()
-    obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
-    self.assertLen(obj, 3)
-    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
-      self.assertEqual(o._obj_relative_path, f'nest/{i + 1:d}')
-      self.assertEqual(o._obj_base_dir, outer.full_path)
-      self.assertEqual(o._output_base_dir, outdir.full_path)
-
-  def test_lld_thinlto_extraction(self):
-    outer = self.create_tempdir()
-    tempdir = outer.mkdir(dir_path='nest')
-    tempdir.create_file(file_path='1.3.import.bc')
-    tempdir.create_file(file_path='2.3.import.bc')
-    tempdir.create_file(file_path='3.3.import.bc')
-    tempdir.create_file(file_path='1.thinlto.bc')
-    tempdir.create_file(file_path='2.thinlto.bc')
-    tempdir.create_file(file_path='3.thinlto.bc')
-    outdir = self.create_tempdir()
-    obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
-    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
-      mod_path = o.extract(thinlto_build='local')
-      self.assertEqual(mod_path, f'nest/{i + 1:d}')
-    self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/1.bc')))
-    self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/2.bc')))
-    self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/3.bc')))
-    self.assertTrue(
-        os.path.exists(os.path.join(outdir.full_path, 'nest/1.thinlto.bc')))
-    self.assertTrue(
-        os.path.exists(os.path.join(outdir.full_path, 'nest/2.thinlto.bc')))
-    self.assertTrue(
-        os.path.exists(os.path.join(outdir.full_path, 'nest/3.thinlto.bc')))
-
-  def test_filtering(self):
-    cmdline = '-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o'
-    self.assertTrue(extract_ir_lib.should_include_module(cmdline, None))
-    self.assertTrue(extract_ir_lib.should_include_module(cmdline, '.*'))
-    self.assertTrue(extract_ir_lib.should_include_module(cmdline, '^-Oz$'))
-    self.assertFalse(extract_ir_lib.should_include_module(cmdline, '^-O3$'))
-
-  def test_thinlto_index_extractor(self):
-    cmdline = ('-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/'
-               'out.o\0-fthinlto-index=foo/bar.thinlto.bc')
-    self.assertEqual(
-        extract_ir_lib.get_thinlto_index(cmdline, '/the/base/dir'),
-        '/the/base/dir/foo/bar.thinlto.bc')
-
-
-if __name__ == '__main__':
-  absltest.main()
+    def test_one_conversion_arguments_style(self):
+        obj = extract_ir_lib.convert_compile_command_to_objectfile(
+            {
+                "directory": "/output/directory",
+                "arguments": [
+                    "-cc1",
+                    "-c",
+                    "/some/path/lib/foo/bar.cc",
+                    "-o",
+                    "lib/bar.o",
+                ],
+                "file": "/some/path/lib/foo/bar.cc",
+            },
+            "/corpus/destination/path",
+        )
+        self.assertIsNotNone(obj)
+        # pytype: disable=attribute-error
+        # Pytype complains about obj being None
+        self.assertEqual(obj.input_obj(), "/output/directory/lib/bar.o")
+        self.assertEqual(obj.relative_output_path(), "lib/bar.o")
+        self.assertEqual(obj.cmd_file(), "/corpus/destination/path/lib/bar.o.cmd")
+        self.assertEqual(obj.bc_file(), "/corpus/destination/path/lib/bar.o.bc")
+        self.assertEqual(
+            obj.thinlto_index_file(), "/corpus/destination/path/lib/bar.o.thinlto.bc"
+        )
+        # pytype: enable=attribute-error
+
+    def test_arr_conversion(self):
+        res = extract_ir_lib.load_from_compile_commands(
+            [
+                {
+                    "directory": "/output/directory",
+                    "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
+                    "file": "/some/path/lib/foo/bar.cc",
+                },
+                {
+                    "directory": "/output/directory",
+                    "command": "-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o",
+                    "file": "/some/path/lib/foo/baz.cc",
+                },
+            ],
+            "/corpus/destination/path",
+        )
+        res = list(res)
+        self.assertLen(res, 2)
+        self.assertEqual(res[0].input_obj(), "/output/directory/lib/bar.o")
+        self.assertEqual(res[0].relative_output_path(), "lib/bar.o")
+        self.assertEqual(res[0].cmd_file(), "/corpus/destination/path/lib/bar.o.cmd")
+        self.assertEqual(res[0].bc_file(), "/corpus/destination/path/lib/bar.o.bc")
+        self.assertEqual(
+            res[0].thinlto_index_file(), "/corpus/destination/path/lib/bar.o.thinlto.bc"
+        )
+
+        self.assertEqual(res[1].input_obj(), "/output/directory/lib/other/baz.o")
+        self.assertEqual(res[1].relative_output_path(), "lib/other/baz.o")
+        self.assertEqual(
+            res[1].cmd_file(), "/corpus/destination/path/lib/other/baz.o.cmd"
+        )
+        self.assertEqual(
+            res[1].bc_file(), "/corpus/destination/path/lib/other/baz.o.bc"
+        )
+        self.assertEqual(
+            res[1].thinlto_index_file(),
+            "/corpus/destination/path/lib/other/baz.o.thinlto.bc",
+        )
+
+    def test_command_extraction(self):
+        obj = extract_ir_lib.TrainingIRExtractor(
+            obj_relative_path="lib/obj_file.o",
+            output_base_dir="/where/corpus/goes",
+            obj_base_dir="/foo/bar",
+        )
+        self.assertEqual(
+            obj._get_extraction_cmd_command("/bin/llvm_objcopy_path", ".llvmcmd"),
+            [
+                "/bin/llvm_objcopy_path",
+                "--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd",
+                "/foo/bar/lib/obj_file.o",
+                "/dev/null",
+            ],
+        )
+        self.assertEqual(
+            obj._get_extraction_bc_command("/bin/llvm_objcopy_path", ".llvmbc"),
+            [
+                "/bin/llvm_objcopy_path",
+                "--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc",
+                "/foo/bar/lib/obj_file.o",
+                "/dev/null",
+            ],
+        )
+
+    def test_command_extraction_no_basedir(self):
+        obj = extract_ir_lib.TrainingIRExtractor("lib/obj_file.o", "/where/corpus/goes")
+        self.assertEqual(
+            obj._get_extraction_cmd_command("/bin/llvm_objcopy_path", ".llvmcmd"),
+            [
+                "/bin/llvm_objcopy_path",
+                "--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd",
+                "lib/obj_file.o",
+                "/dev/null",
+            ],
+        )
+        self.assertEqual(
+            obj._get_extraction_bc_command("/bin/llvm_objcopy_path", ".llvmbc"),
+            [
+                "/bin/llvm_objcopy_path",
+                "--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc",
+                "lib/obj_file.o",
+                "/dev/null",
+            ],
+        )
+
+    def test_lld_params(self):
+        lld_opts = [
+            "-o",
+            "output/dir/exe",
+            "lib/obj1.o",
+            "somelib.a",
+            "-W,blah",
+            "lib/dir/obj2.o",
+        ]
+        obj = extract_ir_lib.load_from_lld_params(lld_opts, "/some/path", "/tmp/out")
+        self.assertLen(obj, 2)
+        self.assertEqual(obj[0].input_obj(), "/some/path/lib/obj1.o")
+        self.assertEqual(obj[0].relative_output_path(), "lib/obj1.o")
+        self.assertEqual(obj[0].cmd_file(), "/tmp/out/lib/obj1.o.cmd")
+        self.assertEqual(obj[0].thinlto_index_file(), "/tmp/out/lib/obj1.o.thinlto.bc")
+        self.assertEqual(obj[1].input_obj(), "/some/path/lib/dir/obj2.o")
+
+    def test_load_from_directory(self):
+        tempdir = self.create_tempdir()
+        subdir = tempdir.mkdir(dir_path="subdir")
+        subdir.create_file(file_path="test1.o")
+        subdir.create_file(file_path="test2.o")
+        outdir = self.create_tempdir()
+        objs = extract_ir_lib.load_from_directory(tempdir.full_path, outdir.full_path)
+        self.assertLen(objs, 2)
+        for index, obj in enumerate(sorted(objs, key=lambda x: x._obj_relative_path)):
+            self.assertEqual(obj._obj_relative_path, f"subdir/test{index + 1:d}.o")
+            self.assertEqual(obj._obj_base_dir, tempdir.full_path)
+            self.assertEqual(obj._output_base_dir, outdir.full_path)
+
+    def test_lld_thinlto_discovery(self):
+        tempdir = self.create_tempdir()
+        tempdir.create_file(file_path="1.3.import.bc")
+        tempdir.create_file(file_path="2.3.import.bc")
+        tempdir.create_file(file_path="3.3.import.bc")
+        tempdir.create_file(file_path="1.thinlto.bc")
+        tempdir.create_file(file_path="2.thinlto.bc")
+        tempdir.create_file(file_path="3.thinlto.bc")
+        outdir = self.create_tempdir()
+        obj = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path, outdir.full_path)
+        self.assertLen(obj, 3)
+        for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+            self.assertEqual(o._obj_relative_path, f"{i + 1:d}")
+            self.assertEqual(o._obj_base_dir, tempdir.full_path)
+            self.assertEqual(o._output_base_dir, outdir.full_path)
+
+    def test_lld_thinlto_discovery_nested(self):
+        outer = self.create_tempdir()
+        tempdir = outer.mkdir(dir_path="nest")
+        tempdir.create_file(file_path="1.3.import.bc")
+        tempdir.create_file(file_path="2.3.import.bc")
+        tempdir.create_file(file_path="3.3.import.bc")
+        tempdir.create_file(file_path="1.thinlto.bc")
+        tempdir.create_file(file_path="2.thinlto.bc")
+        tempdir.create_file(file_path="3.thinlto.bc")
+        outdir = self.create_tempdir()
+        obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+        self.assertLen(obj, 3)
+        for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+            self.assertEqual(o._obj_relative_path, f"nest/{i + 1:d}")
+            self.assertEqual(o._obj_base_dir, outer.full_path)
+            self.assertEqual(o._output_base_dir, outdir.full_path)
+
+    def test_lld_thinlto_extraction(self):
+        outer = self.create_tempdir()
+        tempdir = outer.mkdir(dir_path="nest")
+        tempdir.create_file(file_path="1.3.import.bc")
+        tempdir.create_file(file_path="2.3.import.bc")
+        tempdir.create_file(file_path="3.3.import.bc")
+        tempdir.create_file(file_path="1.thinlto.bc")
+        tempdir.create_file(file_path="2.thinlto.bc")
+        tempdir.create_file(file_path="3.thinlto.bc")
+        outdir = self.create_tempdir()
+        obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+        for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+            mod_path = o.extract(thinlto_build="local")
+            self.assertEqual(mod_path, f"nest/{i + 1:d}")
+        self.assertTrue(os.path.exists(os.path.join(outdir.full_path, "nest/1.bc")))
+        self.assertTrue(os.path.exists(os.path.join(outdir.full_path, "nest/2.bc")))
+        self.assertTrue(os.path.exists(os.path.join(outdir.full_path, "nest/3.bc")))
+        self.assertTrue(
+            os.path.exists(os.path.join(outdir.full_path, "nest/1.thinlto.bc"))
+        )
+        self.assertTrue(
+            os.path.exists(os.path.join(outdir.full_path, "nest/2.thinlto.bc"))
+        )
+        self.assertTrue(
+            os.path.exists(os.path.join(outdir.full_path, "nest/3.thinlto.bc"))
+        )
+
+    def test_filtering(self):
+        cmdline = "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o"
+        self.assertTrue(extract_ir_lib.should_include_module(cmdline, None))
+        self.assertTrue(extract_ir_lib.should_include_module(cmdline, ".*"))
+        self.assertTrue(extract_ir_lib.should_include_module(cmdline, "^-Oz$"))
+        self.assertFalse(extract_ir_lib.should_include_module(cmdline, "^-O3$"))
+
+    def test_thinlto_index_extractor(self):
+        cmdline = (
+            "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/"
+            "out.o\0-fthinlto-index=foo/bar.thinlto.bc"
+        )
+        self.assertEqual(
+            extract_ir_lib.get_thinlto_index(cmdline, "/the/base/dir"),
+            "/the/base/dir/foo/bar.thinlto.bc",
+        )
+
+
+if __name__ == "__main__":
+    absltest.main()
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py
index 989d9790b5bcd9..e6ba013019829e 100644
--- a/llvm/py/src/mlgo/make_corpus.py
+++ b/llvm/py/src/mlgo/make_corpus.py
@@ -18,30 +18,33 @@
 
 from compiler_opt.tools import make_corpus_lib
 
-flags.DEFINE_string('input_dir', None, 'The input directory.')
-flags.DEFINE_string('output_dir', None, 'The output directory.')
+flags.DEFINE_string("input_dir", None, "The input directory.")
+flags.DEFINE_string("output_dir", None, "The output directory.")
 flags.DEFINE_string(
-    'default_args', '',
-    'The compiler flags to compile with when using downstream tooling.')
+    "default_args",
+    "",
+    "The compiler flags to compile with when using downstream tooling.",
+)
 
-flags.mark_flag_as_required('input_dir')
-flags.mark_flag_as_required('output_dir')
+flags.mark_flag_as_required("input_dir")
+flags.mark_flag_as_required("output_dir")
 
 FLAGS = flags.FLAGS
 
 
 def main(_):
-  logging.warning(
-      'Using this tool does not guarantee that the bitcode is taken at '
-      'the correct stage for consumption during model training. Make '
-      'sure to validate assumptions about where the bitcode is coming '
-      'from before using it in production.')
-  relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
-  make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir,
-                               FLAGS.output_dir)
-  make_corpus_lib.write_corpus_manifest(relative_paths, FLAGS.output_dir,
-                                        FLAGS.default_args.split())
-
-
-if __name__ == '__main__':
-  app.run(main)
+    logging.warning(
+        "Using this tool does not guarantee that the bitcode is taken at "
+        "the correct stage for consumption during model training. Make "
+        "sure to validate assumptions about where the bitcode is coming "
+        "from before using it in production."
+    )
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
+    make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir)
+    make_corpus_lib.write_corpus_manifest(
+        relative_paths, FLAGS.output_dir, FLAGS.default_args.split()
+    )
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py
index 97db20a9859e17..697c97ebf6ee29 100644
--- a/llvm/py/src/mlgo/make_corpus_lib.py
+++ b/llvm/py/src/mlgo/make_corpus_lib.py
@@ -10,70 +10,68 @@
 
 from typing import List, Optional
 
-BITCODE_EXTENSION = '.bc'
+BITCODE_EXTENSION = ".bc"
 
 
 def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
-  """Finds bitcode files to extract from a given directory.
+    """Finds bitcode files to extract from a given directory.
 
-  Args:
-    bitcode_base_dir: The base directory where the bitcode to be copied
-      is from.
-    output_dir: The directory to place the bitcode in.
+    Args:
+      bitcode_base_dir: The base directory where the bitcode to be copied
+        is from.
+      output_dir: The directory to place the bitcode in.
 
-  Returns an array of paths representing the relative path to the bitcode
-  file from the base direcotry.
-  """
-  paths = [
-      str(p)[:-len(BITCODE_EXTENSION)]
-      for p in pathlib.Path(bitcode_base_dir).glob('**/*' + BITCODE_EXTENSION)
-  ]
+    Returns an array of paths representing the relative path to the bitcode
+    file from the base directory.
+    """
+    paths = [
+        str(p)[: -len(BITCODE_EXTENSION)]
+        for p in pathlib.Path(bitcode_base_dir).glob("**/*" + BITCODE_EXTENSION)
+    ]
 
-  return [
-      os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths
-  ]
+    return [os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths]
 
 
-def copy_bitcode(relative_paths: List[str], bitcode_base_dir: str,
-                 output_dir: str) -> None:
-  """Copies bitcode files from the base directory to the output directory.
+def copy_bitcode(
+    relative_paths: List[str], bitcode_base_dir: str, output_dir: str
+) -> None:
+    """Copies bitcode files from the base directory to the output directory.
 
-  Args:
-    relative_paths: An array of relative paths to bitcode files that are copied
-      over to the output directory, preserving relative location.
-    bitcode_base_dir: The base directory where the bitcode is located.
-    output_dir: The output directory to place the bitcode in.
-  """
-  for relative_path in relative_paths:
-    base_path = os.path.join(bitcode_base_dir,
-                             relative_path + BITCODE_EXTENSION)
-    destination_path = os.path.join(output_dir,
-                                    relative_path + BITCODE_EXTENSION)
-    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
-    shutil.copy(base_path, destination_path)
+    Args:
+      relative_paths: An array of relative paths to bitcode files that are copied
+        over to the output directory, preserving relative location.
+      bitcode_base_dir: The base directory where the bitcode is located.
+      output_dir: The output directory to place the bitcode in.
+    """
+    for relative_path in relative_paths:
+        base_path = os.path.join(bitcode_base_dir, relative_path + BITCODE_EXTENSION)
+        destination_path = os.path.join(output_dir, relative_path + BITCODE_EXTENSION)
+        os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+        shutil.copy(base_path, destination_path)
 
 
-def write_corpus_manifest(relative_output_paths: List[str],
-                          output_dir: str,
-                          default_args: Optional[List[str]] = None) -> None:
-  """Creates a corpus manifest describing the bitcode that has been found.
+def write_corpus_manifest(
+    relative_output_paths: List[str],
+    output_dir: str,
+    default_args: Optional[List[str]] = None,
+) -> None:
+    """Creates a corpus manifest describing the bitcode that has been found.
 
-  Args:
-    relative_output_paths: A list of paths to each bitcode file relative to the
-      output directory.
-    outout_dir: The output directory where the corpus is being created.
-    default_args: An array of compiler flags that should be used to compile
-      the bitcode when using further downstream tooling."""
-  if default_args is None:
-    default_args = []
-  corpus_description = {
-      'global_command_override': default_args,
-      'has_thinlto': False,
-      'modules': [path for path in relative_output_paths if path is not None]
-  }
+    Args:
+      relative_output_paths: A list of paths to each bitcode file relative to the
+        output directory.
+      output_dir: The output directory where the corpus is being created.
+      default_args: An array of compiler flags that should be used to compile
+        the bitcode when using further downstream tooling."""
+    if default_args is None:
+        default_args = []
+    corpus_description = {
+        "global_command_override": default_args,
+        "has_thinlto": False,
+        "modules": [path for path in relative_output_paths if path is not None],
+    }
 
-  with open(
-      os.path.join(output_dir, 'corpus_description.json'),
-      'w',
-      encoding='utf-8') as description_file:
-    json.dump(corpus_description, description_file, indent=2)
+    with open(
+        os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8"
+    ) as description_file:
+        json.dump(corpus_description, description_file, indent=2)
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py
index fcb861ebb91f32..7b5cc954b6d172 100644
--- a/llvm/py/src/mlgo/make_corpus_test.py
+++ b/llvm/py/src/mlgo/make_corpus_test.py
@@ -12,44 +12,43 @@
 
 
 class MakeCorpusTest(absltest.TestCase):
-
-  def test_load_bitcode_from_directory(self):
-    outer = self.create_tempdir()
-    tempdir = outer.mkdir(dir_path='nested')
-    tempdir.create_file('test1.bc')
-    tempdir.create_file('test2.bc')
-    relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
-    relative_paths = sorted(relative_paths)
-    self.assertEqual(relative_paths[0], 'nested/test1')
-    self.assertEqual(relative_paths[1], 'nested/test2')
-
-  def test_copy_bitcode(self):
-    build_dir = self.create_tempdir()
-    nested_dir = build_dir.mkdir(dir_path='nested')
-    nested_dir.create_file('test1.bc')
-    nested_dir.create_file('test2.bc')
-    relative_paths = ['nested/test1', 'nested/test2']
-    corpus_dir = self.create_tempdir()
-    make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
-    output_files = sorted(os.listdir(os.path.join(corpus_dir, './nested')))
-    self.assertEqual(output_files[0], 'test1.bc')
-    self.assertEqual(output_files[1], 'test2.bc')
-
-  def test_write_corpus_manifest(self):
-    relative_output_paths = ['test/test1', 'test/test2']
-    output_dir = self.create_tempdir()
-    default_args = ['-O3', '-c']
-    make_corpus_lib.write_corpus_manifest(relative_output_paths, output_dir,
-                                          default_args)
-    with open(
-        os.path.join(output_dir, 'corpus_description.json'),
-        encoding='utf-8') as corpus_description_file:
-      corpus_description = json.load(corpus_description_file)
-    self.assertEqual(corpus_description['global_command_override'],
-                     default_args)
-    self.assertEqual(corpus_description['has_thinlto'], False)
-    self.assertEqual(corpus_description['modules'], relative_output_paths)
-
-
-if __name__ == '__main__':
-  absltest.main()
+    def test_load_bitcode_from_directory(self):
+        outer = self.create_tempdir()
+        tempdir = outer.mkdir(dir_path="nested")
+        tempdir.create_file("test1.bc")
+        tempdir.create_file("test2.bc")
+        relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
+        relative_paths = sorted(relative_paths)
+        self.assertEqual(relative_paths[0], "nested/test1")
+        self.assertEqual(relative_paths[1], "nested/test2")
+
+    def test_copy_bitcode(self):
+        build_dir = self.create_tempdir()
+        nested_dir = build_dir.mkdir(dir_path="nested")
+        nested_dir.create_file("test1.bc")
+        nested_dir.create_file("test2.bc")
+        relative_paths = ["nested/test1", "nested/test2"]
+        corpus_dir = self.create_tempdir()
+        make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
+        output_files = sorted(os.listdir(os.path.join(corpus_dir, "./nested")))
+        self.assertEqual(output_files[0], "test1.bc")
+        self.assertEqual(output_files[1], "test2.bc")
+
+    def test_write_corpus_manifest(self):
+        relative_output_paths = ["test/test1", "test/test2"]
+        output_dir = self.create_tempdir()
+        default_args = ["-O3", "-c"]
+        make_corpus_lib.write_corpus_manifest(
+            relative_output_paths, output_dir, default_args
+        )
+        with open(
+            os.path.join(output_dir, "corpus_description.json"), encoding="utf-8"
+        ) as corpus_description_file:
+            corpus_description = json.load(corpus_description_file)
+        self.assertEqual(corpus_description["global_command_override"], default_args)
+        self.assertEqual(corpus_description["has_thinlto"], False)
+        self.assertEqual(corpus_description["modules"], relative_output_paths)
+
+
+if __name__ == "__main__":
+    absltest.main()

>From f7c712cf6710c419ac2f98d2dc846995e10c9df5 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 06:03:56 +0000
Subject: [PATCH 4/7] Restructure upstreaming

---
 .../mlgo}/mlgo/combine_training_corpus.py        |  2 +-
 .../mlgo}/mlgo/combine_training_corpus_lib.py    |  9 ++++-----
 llvm/{py/src => utils/mlgo}/mlgo/extract_ir.py   |  2 +-
 .../src => utils/mlgo}/mlgo/extract_ir_lib.py    |  5 ++---
 llvm/{py/src => utils/mlgo}/mlgo/make_corpus.py  |  2 +-
 .../src => utils/mlgo}/mlgo/make_corpus_lib.py   |  0
 llvm/utils/mlgo/pyproject.toml                   | 10 ++++++++++
 llvm/utils/mlgo/tests/__init__.py                | 16 ++++++++++++++++
 .../mlgo/tests}/combine_training_corpus_test.py  |  2 +-
 .../mlgo => utils/mlgo/tests}/extract_ir_test.py |  2 +-
 .../mlgo/tests}/make_corpus_test.py              |  2 +-
 11 files changed, 38 insertions(+), 14 deletions(-)
 rename llvm/{py/src => utils/mlgo}/mlgo/combine_training_corpus.py (95%)
 rename llvm/{py/src => utils/mlgo}/mlgo/combine_training_corpus_lib.py (83%)
 rename llvm/{py/src => utils/mlgo}/mlgo/extract_ir.py (99%)
 rename llvm/{py/src => utils/mlgo}/mlgo/extract_ir_lib.py (99%)
 rename llvm/{py/src => utils/mlgo}/mlgo/make_corpus.py (97%)
 rename llvm/{py/src => utils/mlgo}/mlgo/make_corpus_lib.py (100%)
 create mode 100644 llvm/utils/mlgo/pyproject.toml
 create mode 100644 llvm/utils/mlgo/tests/__init__.py
 rename llvm/{py/src/mlgo => utils/mlgo/tests}/combine_training_corpus_test.py (98%)
 rename llvm/{py/src/mlgo => utils/mlgo/tests}/extract_ir_test.py (99%)
 rename llvm/{py/src/mlgo => utils/mlgo/tests}/make_corpus_test.py (97%)

diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/utils/mlgo/mlgo/combine_training_corpus.py
similarity index 95%
rename from llvm/py/src/mlgo/combine_training_corpus.py
rename to llvm/utils/mlgo/mlgo/combine_training_corpus.py
index c14c9381a18a6b..20684b55332d00 100644
--- a/llvm/py/src/mlgo/combine_training_corpus.py
+++ b/llvm/utils/mlgo/mlgo/combine_training_corpus.py
@@ -26,7 +26,7 @@
 from absl import app
 from absl import flags
 
-from compiler_opt.tools import combine_training_corpus_lib
+from mlgo import combine_training_corpus_lib
 
 flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
 
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/utils/mlgo/mlgo/combine_training_corpus_lib.py
similarity index 83%
rename from llvm/py/src/mlgo/combine_training_corpus_lib.py
rename to llvm/utils/mlgo/mlgo/combine_training_corpus_lib.py
index 1de182e4cb80dd..e2ae8699ec3180 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_lib.py
+++ b/llvm/utils/mlgo/mlgo/combine_training_corpus_lib.py
@@ -5,11 +5,10 @@
 
 import os
 import json
+import glob
 
 from absl import logging
 
-import tensorflow as tf
-
 _FILE_NAME = "corpus_description.json"
 
 
@@ -18,10 +17,10 @@ def combine_corpus(root_dir: str) -> None:
     output_corpus_description = {}
 
     corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
-    for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+    for corpus_description_path in glob.glob(corpus_description_glob):
         logging.info("processing %s", corpus_description_path)
 
-        with tf.io.gfile.GFile(corpus_description_path, "r") as f:
+        with open(corpus_description_path, encoding="utf-8") as f:
             corpus_description = json.load(f)
             sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
             module_names.extend(
@@ -35,5 +34,5 @@ def combine_corpus(root_dir: str) -> None:
 
     output_corpus_description["modules"] = module_names
 
-    with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), "w") as f:
+    with open(os.path.join(root_dir, _FILE_NAME), "w") as f:
         json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/utils/mlgo/mlgo/extract_ir.py
similarity index 99%
rename from llvm/py/src/mlgo/extract_ir.py
rename to llvm/utils/mlgo/mlgo/extract_ir.py
index 395a298ecec81d..ed580dbeefdc45 100644
--- a/llvm/py/src/mlgo/extract_ir.py
+++ b/llvm/utils/mlgo/mlgo/extract_ir.py
@@ -31,7 +31,7 @@
 from absl import flags
 from absl import logging
 
-from compiler_opt.tools import extract_ir_lib
+from mlgo import extract_ir_lib
 
 flags.DEFINE_string(
     "input",
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/utils/mlgo/mlgo/extract_ir_lib.py
similarity index 99%
rename from llvm/py/src/mlgo/extract_ir_lib.py
rename to llvm/utils/mlgo/mlgo/extract_ir_lib.py
index ce6a4a17a8e6ac..c662d684f603cd 100644
--- a/llvm/py/src/mlgo/extract_ir_lib.py
+++ b/llvm/utils/mlgo/mlgo/extract_ir_lib.py
@@ -16,8 +16,7 @@
 
 from absl import logging
 
-from compiler_opt.rl import constant
-
+_UNSPECIFIED_OVERRIDE = ['<UNSPECIFIED>']
 
 # TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a
 # \0 - separated list of strings, to a \n one.
@@ -378,7 +377,7 @@ def write_corpus_manifest(
     # This comes first rather than later so global_command_override is at the top
     # of the .json after being written
     if thinlto_build == "local":
-        corpus_description = {"global_command_override": constant.UNSPECIFIED_OVERRIDE}
+        corpus_description = {"global_command_override": _UNSPECIFIED_OVERRIDE}
     else:
         corpus_description = {}
 
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/utils/mlgo/mlgo/make_corpus.py
similarity index 97%
rename from llvm/py/src/mlgo/make_corpus.py
rename to llvm/utils/mlgo/mlgo/make_corpus.py
index e6ba013019829e..7b3d85ff8423b3 100644
--- a/llvm/py/src/mlgo/make_corpus.py
+++ b/llvm/utils/mlgo/mlgo/make_corpus.py
@@ -16,7 +16,7 @@
 from absl import flags
 from absl import logging
 
-from compiler_opt.tools import make_corpus_lib
+from mlgo import make_corpus_lib
 
 flags.DEFINE_string("input_dir", None, "The input directory.")
 flags.DEFINE_string("output_dir", None, "The output directory.")
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/utils/mlgo/mlgo/make_corpus_lib.py
similarity index 100%
rename from llvm/py/src/mlgo/make_corpus_lib.py
rename to llvm/utils/mlgo/mlgo/make_corpus_lib.py
diff --git a/llvm/utils/mlgo/pyproject.toml b/llvm/utils/mlgo/pyproject.toml
new file mode 100644
index 00000000000000..22d3a560aa3c44
--- /dev/null
+++ b/llvm/utils/mlgo/pyproject.toml
@@ -0,0 +1,10 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "mlgo"
+version = "0.0.1"
+description = "A small example package"
+readme = "README.md"
+requires-python = ">=3.8"
diff --git a/llvm/utils/mlgo/tests/__init__.py b/llvm/utils/mlgo/tests/__init__.py
new file mode 100644
index 00000000000000..4e8e26ce2d7fc2
--- /dev/null
+++ b/llvm/utils/mlgo/tests/__init__.py
@@ -0,0 +1,16 @@
+"""Ensure flags are initialized for e.g. pytest harness case."""
+
+import sys
+
+from absl import flags
+
+# When this module is loaded in an app, flags would have been parsed already
+# (assuming the app's main uses directly or indirectly absl.app.main). However,
+# when loaded in a test harness like pytest or unittest (e.g. python -m pytest)
+# that won't happen.
+# While tests shouldn't use the flags directly, some flags - like compilation
+# timeout - have default values that need to be accessible.
+# This makes sure flags are initialized, for this purpose.
+if not flags.FLAGS.is_parsed():
+  flags.FLAGS(sys.argv, known_only=True)
+assert flags.FLAGS.is_parsed()
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/utils/mlgo/tests/combine_training_corpus_test.py
similarity index 98%
rename from llvm/py/src/mlgo/combine_training_corpus_test.py
rename to llvm/utils/mlgo/tests/combine_training_corpus_test.py
index 969d8472964971..0457ae1823db6a 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_test.py
+++ b/llvm/utils/mlgo/tests/combine_training_corpus_test.py
@@ -8,7 +8,7 @@
 
 from absl.testing import absltest
 
-from compiler_opt.tools import combine_training_corpus_lib
+from mlgo import combine_training_corpus_lib
 
 
 class CombineTrainingCorpusTest(absltest.TestCase):
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/utils/mlgo/tests/extract_ir_test.py
similarity index 99%
rename from llvm/py/src/mlgo/extract_ir_test.py
rename to llvm/utils/mlgo/tests/extract_ir_test.py
index ae9b3b30f9a5c9..9eecb33b99b10f 100644
--- a/llvm/py/src/mlgo/extract_ir_test.py
+++ b/llvm/utils/mlgo/tests/extract_ir_test.py
@@ -8,7 +8,7 @@
 
 from absl.testing import absltest
 
-from compiler_opt.tools import extract_ir_lib
+from mlgo import extract_ir_lib
 
 
 class ExtractIrTest(absltest.TestCase):
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/utils/mlgo/tests/make_corpus_test.py
similarity index 97%
rename from llvm/py/src/mlgo/make_corpus_test.py
rename to llvm/utils/mlgo/tests/make_corpus_test.py
index 7b5cc954b6d172..6ad09eb74571c9 100644
--- a/llvm/py/src/mlgo/make_corpus_test.py
+++ b/llvm/utils/mlgo/tests/make_corpus_test.py
@@ -8,7 +8,7 @@
 
 from absl.testing import absltest
 
-from compiler_opt.tools import make_corpus_lib
+from mlgo import make_corpus_lib
 
 
 class MakeCorpusTest(absltest.TestCase):

>From f99e11f67400bf5b03c289703dad5d140922d400 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 06:04:48 +0000
Subject: [PATCH 5/7] Fix formatting/copyright

---
 llvm/utils/mlgo/mlgo/extract_ir_lib.py | 3 ++-
 llvm/utils/mlgo/tests/__init__.py      | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/mlgo/mlgo/extract_ir_lib.py b/llvm/utils/mlgo/mlgo/extract_ir_lib.py
index c662d684f603cd..9c828ce1eb631f 100644
--- a/llvm/utils/mlgo/mlgo/extract_ir_lib.py
+++ b/llvm/utils/mlgo/mlgo/extract_ir_lib.py
@@ -16,7 +16,8 @@
 
 from absl import logging
 
-_UNSPECIFIED_OVERRIDE = ['<UNSPECIFIED>']
+_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]
+
 
 # TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a
 # \0 - separated list of strings, to a \n one.
diff --git a/llvm/utils/mlgo/tests/__init__.py b/llvm/utils/mlgo/tests/__init__.py
index 4e8e26ce2d7fc2..9e97ceb6bfef6b 100644
--- a/llvm/utils/mlgo/tests/__init__.py
+++ b/llvm/utils/mlgo/tests/__init__.py
@@ -1,3 +1,6 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """Ensure flags are initialized for e.g. pytest harness case."""
 
 import sys
@@ -12,5 +15,5 @@
 # timeout - have default values that need to be accessible.
 # This makes sure flags are initialized, for this purpose.
 if not flags.FLAGS.is_parsed():
-  flags.FLAGS(sys.argv, known_only=True)
+    flags.FLAGS(sys.argv, known_only=True)
 assert flags.FLAGS.is_parsed()

>From 14286955cc98ffc7a04ed189a57b1b53df4e96d6 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 06:19:19 +0000
Subject: [PATCH 6/7] Get pyproject working

---
 llvm/utils/mlgo/mlgo/__init__.py |  6 ++++++
 llvm/utils/mlgo/pyproject.toml   | 12 +++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)
 create mode 100644 llvm/utils/mlgo/mlgo/__init__.py

diff --git a/llvm/utils/mlgo/mlgo/__init__.py b/llvm/utils/mlgo/mlgo/__init__.py
new file mode 100644
index 00000000000000..bcb5de2ff4d575
--- /dev/null
+++ b/llvm/utils/mlgo/mlgo/__init__.py
@@ -0,0 +1,6 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+__versioninfo__ = (18, 0, 0)
+__version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
diff --git a/llvm/utils/mlgo/pyproject.toml b/llvm/utils/mlgo/pyproject.toml
index 22d3a560aa3c44..6bcbfe64e362e4 100644
--- a/llvm/utils/mlgo/pyproject.toml
+++ b/llvm/utils/mlgo/pyproject.toml
@@ -4,7 +4,13 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mlgo"
-version = "0.0.1"
-description = "A small example package"
+description = "Tooling for ML in LLVM"
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.8,<3.11"
+dependencies = [
+  "absl-py>=1.0.0"
+]
+dynamic = ["version"]
+
+[tool.setuptools.dynamic]
+version = {attr = "mlgo.__version__"}

>From 4b8b452161cf029dc14f6070eb07da51d2eae729 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 06:23:34 +0000
Subject: [PATCH 7/7] Add README

---
 llvm/utils/mlgo/README.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 llvm/utils/mlgo/README.md

diff --git a/llvm/utils/mlgo/README.md b/llvm/utils/mlgo/README.md
new file mode 100644
index 00000000000000..53e616d8c6640e
--- /dev/null
+++ b/llvm/utils/mlgo/README.md
@@ -0,0 +1,12 @@
+# MLGO Python Library
+
+This folder contains the MLGO Python library. This library consists of tooling
+to help enable ML applications within LLVM, particularly tooling to extract
+corpora that can be used in downstream projects to train ML models and perform
+other tasks that benefit from having a large amount of data.
+
+### Python Versioning
+
+Due to type annotations, the MLGO tooling currently only supports Python
+3.8 or newer, deviating from the current LLVM project-wide
+minimum supported version of Python 3.6.



More information about the cfe-commits mailing list