[llvm] a387bce - [MLGO] Upstream the corpus extraction tooling (#72319)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 19 17:23:55 PST 2024
Author: Aiden Grossman
Date: 2024-01-19T17:23:51-08:00
New Revision: a387bce4bcbaeb28bf4510817ce54602e2f7a21d
URL: https://github.com/llvm/llvm-project/commit/a387bce4bcbaeb28bf4510817ce54602e2f7a21d
DIFF: https://github.com/llvm/llvm-project/commit/a387bce4bcbaeb28bf4510817ce54602e2f7a21d.diff
LOG: [MLGO] Upstream the corpus extraction tooling (#72319)
This patch upstreams some of the MLGO utilities, particularly the corpus
extraction tooling, into LLVM proper. The motivation for this patch is
available in the RFC.
https://discourse.llvm.org/t/rfc-upstreaming-elements-of-the-mlgo-tooling/74939
Added:
llvm/utils/mlgo-utils/CMakeLists.txt
llvm/utils/mlgo-utils/README.md
llvm/utils/mlgo-utils/mlgo/__init__.py
llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py
llvm/utils/mlgo-utils/pyproject.toml
llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py
llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py
llvm/utils/mlgo-utils/tests/lit.cfg
llvm/utils/mlgo-utils/tests/lit.local.cfg
llvm/utils/mlgo-utils/tests/lit.site.cfg.in
Modified:
llvm/CMakeLists.txt
Removed:
################################################################################
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 61ab69d237470f2..1d230004e6c34ec 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -1197,6 +1197,7 @@ if( LLVM_INCLUDE_UTILS )
add_subdirectory(utils/UnicodeData)
add_subdirectory(utils/yaml-bench)
add_subdirectory(utils/split-file)
+ add_subdirectory(utils/mlgo-utils)
if( LLVM_INCLUDE_TESTS )
add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
endif()
diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt
new file mode 100644
index 000000000000000..7b303c7639401ae
--- /dev/null
+++ b/llvm/utils/mlgo-utils/CMakeLists.txt
@@ -0,0 +1,11 @@
+configure_lit_site_cfg(
+ "${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in"
+ "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg"
+)
+
+add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
+ ${CMAKE_CURRENT_BINARY_DIR}
+ DEPENDS "FileCheck" "not" "count"
+)
+
+set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests")
diff --git a/llvm/utils/mlgo-utils/README.md b/llvm/utils/mlgo-utils/README.md
new file mode 100644
index 000000000000000..12e9375f23edac2
--- /dev/null
+++ b/llvm/utils/mlgo-utils/README.md
@@ -0,0 +1,12 @@
+# MLGO Python Utilities
+
+This folder contains MLGO Python utilities, particularly infrastructure
+to help enable ML applications within LLVM, especially tooling to extract
+corpora that can be used in downstream projects to train ML models and perform
+other tasks that benefit from having a large amount of data.
+
+### Python Versioning
+
+Due to type annotations, the MLGO tooling currently only supports Python
+3.8 or newer, deviating from the current LLVM project-wide
+minimum supported version of Python 3.6.
diff --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py
new file mode 100644
index 000000000000000..bcb5de2ff4d5752
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/__init__.py
@@ -0,0 +1,6 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+__versioninfo__ = (18, 0, 0)
+__version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
new file mode 100644
index 000000000000000..9aabd87b4688e00
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1,48 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpora into a single training corpus.
+
+Currently only supports the case where multiple corpora share the same
+configurables except the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+ --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+from absl import app
+from absl import flags
+
+from mlgo.corpus import combine_training_corpus_lib
+
+flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+ if len(argv) > 1:
+ raise app.UsageError("Too many command-line arguments.")
+
+ combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+
+
+def entrypoint():
+ app.run(main)
+
+
+if __name__ == "__main__":
+ entrypoint()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
new file mode 100644
index 000000000000000..e2ae8699ec31801
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
@@ -0,0 +1,38 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Library for combining training corpora."""
+
+import os
+import json
+import glob
+
+from absl import logging
+
+_FILE_NAME = "corpus_description.json"
+
+
+def combine_corpus(root_dir: str) -> None:
+ module_names = []
+ output_corpus_description = {}
+
+ corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
+ for corpus_description_path in glob.glob(corpus_description_glob):
+ logging.info("processing %s", corpus_description_path)
+
+ with open(corpus_description_path, encoding="utf-8") as f:
+ corpus_description = json.load(f)
+ sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+ module_names.extend(
+ [os.path.join(sub_dir, name) for name in corpus_description["modules"]]
+ )
+ del corpus_description["modules"]
+ if len(output_corpus_description) == 0:
+ output_corpus_description = corpus_description
+ elif corpus_description != output_corpus_description:
+ raise ValueError("Input corpora differ by more than modules.")
+
+ output_corpus_description["modules"] = module_names
+
+ with open(os.path.join(root_dir, _FILE_NAME), "w") as f:
+ json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
new file mode 100644
index 000000000000000..9463e61dc534fed
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1,165 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, pass an integer representing the desired
+verbosity to the --verbosity flag. Use 0 for all logs, status information,
+and detailed debug information, -1 for solely warnings, and -2 to not produce
+any output.
+"""
+
+import json
+import multiprocessing
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from mlgo.corpus import extract_ir_lib
+
+flags.DEFINE_string(
+ "input",
+ None,
+ "Input file or directory - either compile_commands.json, a linker parameter"
+ "list, or a path to a directory containing object files.",
+)
+flags.DEFINE_enum(
+ "input_type",
+ "json",
+ ["json", "params", "directory"],
+ "Input file type - json, params, or directory. params latter refers to lld"
+ "params.",
+)
+flags.DEFINE_string("output_dir", None, "Output directory")
+flags.DEFINE_integer(
+ "num_workers",
+ None,
+ "Number of parallel workers for objcopy. `None` for maximum available.",
+)
+flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
+flags.DEFINE_string(
+ "obj_base_dir",
+ "",
+ "Base directory for object files. Defaults to current working dir.",
+)
+flags.DEFINE_string(
+ "cmd_filter",
+ None,
+ "Include only those modules with a command line matching this regexp. "
+ "Setting it to None for not filtering. Note that the regexp is applied "
+ "independently for each separate command line option. For example, ^-Oz$ "
+ "will match Oz - built binaries. Does not work with thinlto_build=lld.",
+)
+flags.DEFINE_enum(
+ "thinlto_build",
+ None,
+ ["distributed", "local"],
+ "Set if the build was performed with either 'distributed' or "
+ "'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
+ "The build is assumed to have had "
+ "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
+ "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
+ "passed in the local case.",
+)
+flags.DEFINE_string(
+ "cmd_section_name",
+ ".llvmcmd",
+ "The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmcmd is correct. For Mach-O object files, one should use "
+ "something like __LLVM,__cmdline",
+)
+flags.DEFINE_string(
+ "bitcode_section_name",
+ ".llvmbc",
+ "The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmbc is correct. For Mach-O object files, one should use "
+ "__LLVM,__bitcode",
+)
+
+flags.mark_flag_as_required("output_dir")
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+ if len(argv) > 1:
+ raise app.UsageError("Too many command-line arguments.")
+
+ objs = []
+ if FLAGS.input is not None and FLAGS.thinlto_build == "local":
+ raise ValueError("--thinlto_build=local cannot be run with --input")
+ if FLAGS.input is None:
+ if FLAGS.thinlto_build != "local":
+ raise ValueError("--input or --thinlto_build=local must be provided")
+ objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
+ elif FLAGS.input_type == "json":
+ with open(FLAGS.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_from_compile_commands(
+ json.load(f), FLAGS.output_dir
+ )
+ elif FLAGS.input_type == "params":
+ if not FLAGS.obj_base_dir:
+ logging.info(
+ "-obj_base_dir is unspecified, assuming current directory."
+ "If no objects are found, use this option to specify the root"
+ "directory for the object file paths in the input file."
+ )
+ with open(FLAGS.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_from_lld_params(
+ [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir
+ )
+ elif FLAGS.input_type == "directory":
+ logging.warning(
+ "Using the directory input is only recommended if the build system"
+ "your project uses does not support any structured output that"
+ "ml-compiler-opt understands. If your build system provides a"
+ "structured compilation database, use that instead"
+ )
+ objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+ else:
+ logging.error("Unknown input type: %s", FLAGS.input_type)
+
+ relative_output_paths = extract_ir_lib.run_extraction(
+ objs,
+ FLAGS.num_workers,
+ FLAGS.llvm_objcopy_path,
+ FLAGS.cmd_filter,
+ FLAGS.thinlto_build,
+ FLAGS.cmd_section_name,
+ FLAGS.bitcode_section_name,
+ )
+
+ extract_ir_lib.write_corpus_manifest(
+ FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
+ )
+
+ logging.info(
+ "Converted %d files out of %d",
+ len(objs) - relative_output_paths.count(None),
+ len(objs),
+ )
+
+
+def entrypoint():
+ multiprocessing.set_start_method("fork")
+ app.run(main)
+
+
+if __name__ == "__main__":
+ entrypoint()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
new file mode 100644
index 000000000000000..9c828ce1eb631fe
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
@@ -0,0 +1,395 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Library functions for IR extraction."""
+
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import multiprocessing
+import functools
+import json
+
+from typing import Dict, List, Optional
+
+from absl import logging
+
+_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]
+
+
+# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a
+# \0 - separated list of strings, to a \n one.
+def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
+ """Determine if the module should be included."""
+ if match_regexp is None:
+ return True
+ lines = cmdline.split("\0")
+ return any(len(re.findall(match_regexp, l)) for l in lines)
+
+
+def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
+ opts = cmdline.split("\0")
+ for option in opts:
+ if option.startswith("-fthinlto-index"):
+ return os.path.join(basedir, option.split("=")[1])
+ return None
+
+
+class TrainingIRExtractor:
+ """IR and command line extraction from an object file."""
+
+ def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
+ """Set up a TrainingIRExtractor.
+
+ Args:
+ obj_relative_path: relative path to the input object file. It will be also
+ used to construct the absolute path of the output IR and cmd files, by
+ appending it to output_base_dir.
+ output_base_dir: the directory under which the output will be produced.
+ obj_base_dir: the base directory for all the input object files.
+ """
+ self._obj_relative_path = obj_relative_path
+ self._output_base_dir = output_base_dir
+ self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""
+
+ def obj_base_dir(self):
+ return self._obj_base_dir
+
+ def output_base_dir(self):
+ return self._output_base_dir
+
+ def relative_output_path(self):
+ return self._obj_relative_path
+
+ def input_obj(self):
+ return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+
+ def lld_src_bc(self):
+ # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+ # IR bitcode saved by lld. It is hardcoded into lld.
+ return os.path.join(
+ self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
+ )
+
+ def lld_src_thinlto(self):
+ return os.path.join(self._obj_base_dir, self._obj_relative_path + ".thinlto.bc")
+
+ def dest_dir(self):
+ return os.path.join(
+ self.output_base_dir(), os.path.dirname(self._obj_relative_path)
+ )
+
+ def module_name(self):
+ return os.path.basename(self._obj_relative_path)
+
+ def cmd_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + ".cmd")
+
+ def bc_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + ".bc")
+
+ def thinlto_index_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")
+
+ def _get_extraction_cmd_command(
+ self, llvm_objcopy_path: str, cmd_section_name: str
+ ):
+ """Get llvm-objcopy and process args to produce a command string that,
+ when invoked, will extract the cmd section into the self.cmd_file() file.
+ """
+ return [
+ llvm_objcopy_path,
+ "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
+ self.input_obj(),
+ "/dev/null",
+ ]
+
+ def _get_extraction_bc_command(
+ self, llvm_objcopy_path: str, bitcode_section_name: str
+ ):
+ """Gets llvm-objcopy and process args to produce a command string that,
+ when invoked, will extract the bitcode section into the self.bc_file()
+ file.
+ """
+ return [
+ llvm_objcopy_path,
+ "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
+ self.input_obj(),
+ "/dev/null",
+ ]
+
+ def _extract_clang_artifacts(
+ self,
+ llvm_objcopy_path: str,
+ cmd_filter: str,
+ is_thinlto: bool,
+ cmd_section_name: str,
+ bitcode_section_name: str,
+ ) -> Optional[str]:
+ """Run llvm-objcopy to extract the .bc and command line."""
+ if not os.path.exists(self.input_obj()):
+ logging.info("%s does not exist.", self.input_obj())
+ return None
+ os.makedirs(self.dest_dir(), exist_ok=True)
+ try:
+ subprocess.check_output(
+ self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+ stderr=subprocess.STDOUT,
+ encoding="utf-8",
+ )
+ if cmd_filter is not None or is_thinlto:
+ with open(self.cmd_file(), encoding="utf-8") as f:
+ lines = f.readlines()
+ assert len(lines) == 1
+ cmdline = lines[0]
+ if not should_include_module(cmdline, cmd_filter):
+ logging.info(
+ "Excluding module %s because it does not match the filter",
+ self.input_obj(),
+ )
+ os.remove(self.cmd_file())
+ return None
+ if is_thinlto:
+ index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+ shutil.copy(index_file, self.thinlto_index_file())
+
+ subprocess.check_output(
+ self._get_extraction_bc_command(
+ llvm_objcopy_path, bitcode_section_name
+ ),
+ stderr=subprocess.STDOUT,
+ encoding="utf-8",
+ )
+ except subprocess.CalledProcessError as e:
+ # This may happen if .o file was built from asm (.S source).
+ logging.warning("%s was not processed: %s", self.input_obj(), e)
+ logging.info(e.output)
+ return None
+ assert (
+ os.path.exists(self.cmd_file())
+ and os.path.exists(self.bc_file())
+ and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
+ )
+ return self.relative_output_path()
+
+ def _extract_lld_artifacts(self) -> Optional[str]:
+ """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation."""
+ if not os.path.exists(self.lld_src_bc()):
+ logging.info("%s does not exist.", self.lld_src_bc())
+ return None
+ if not os.path.exists(self.lld_src_thinlto()):
+ logging.info("%s does not exist.", self.lld_src_thinlto())
+ return None
+ os.makedirs(self.dest_dir(), exist_ok=True)
+
+ # Copy over the files
+ shutil.copy(self.lld_src_bc(), self.bc_file())
+ shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
+
+ assert os.path.exists(self.bc_file())
+ assert os.path.exists(self.thinlto_index_file())
+ return self._obj_relative_path
+
+ def extract(
+ self,
+ llvm_objcopy_path: Optional[str] = None,
+ cmd_filter: Optional[str] = None,
+ thinlto_build: Optional[str] = None,
+ cmd_section_name: Optional[str] = ".llvmcmd",
+ bitcode_section_name: Optional[str] = ".llvmbc",
+ ) -> Optional[str]:
+ if thinlto_build == "local":
+ return self._extract_lld_artifacts()
+ return self._extract_clang_artifacts(
+ llvm_objcopy_path=llvm_objcopy_path,
+ cmd_filter=cmd_filter,
+ is_thinlto=thinlto_build == "distributed",
+ cmd_section_name=cmd_section_name,
+ bitcode_section_name=bitcode_section_name,
+ )
+
+
+def convert_compile_command_to_objectfile(
+ command: Dict[str, str], output_dir: str
+) -> Optional[TrainingIRExtractor]:
+ obj_base_dir = command["directory"]
+ if "arguments" in command:
+ cmd_parts = command["arguments"]
+ elif "command" in command:
+ cmd_parts = command["command"].split()
+ else:
+ logging.info("compile_commands element has no command and arguments")
+ return None
+
+ try:
+ obj_index = cmd_parts.index("-o") + 1
+ except ValueError:
+ # This could happen if there are non-clang commands in compile_commands.json
+ logging.info("Command has no -o option: %s", " ".join(cmd_parts))
+ return None
+ obj_rel_path = cmd_parts[obj_index]
+ # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
+ return TrainingIRExtractor(
+ obj_relative_path=obj_rel_path,
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir,
+ )
+
+
+def load_from_compile_commands(
+ json_array: List[Dict[str, str]], output_dir: str
+) -> List[TrainingIRExtractor]:
+ objs = [
+ convert_compile_command_to_objectfile(cmd, output_dir) for cmd in json_array
+ ]
+ # Filter out None, in case there were non-clang commands in the .json
+ return [obj for obj in objs if obj is not None]
+
+
+def load_from_lld_params(
+ params_array: List[str], obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+ """Create an ObjectFile array based on lld's parameters."""
+ # yank out -o and the output. After that, anything not starting with '-', and
+ # ending in a '.o', is an object file.
+ try:
+ minus_o_idx = params_array.index("-o")
+ del params_array[minus_o_idx : minus_o_idx + 2]
+ just_obj_paths = [
+ o for o in params_array if not o.startswith("-") and o.endswith(".o")
+ ]
+ except ValueError:
+ logging.info("This params file does not have an explicit -o option.")
+ just_obj_paths = params_array
+
+ def make_obj(obj_file: str) -> TrainingIRExtractor:
+ return TrainingIRExtractor(
+ obj_relative_path=obj_file,
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir,
+ )
+
+ return [make_obj(obj_file) for obj_file in just_obj_paths]
+
+
+def load_from_directory(
+ obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+ """Create an object file array by globbing an entire directory.
+
+ Args:
+ obj_base_dir: The base build directory that all object files will be
+ written out as being relative to.
+ output_dir: The output directory where extracted .bc and .cmd files should
+ be placed.
+ """
+ paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.o")]
+
+ def make_spec(obj_file: str):
+ return TrainingIRExtractor(
+ obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir),
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir,
+ )
+
+ return [make_spec(path) for path in paths]
+
+
+def load_for_lld_thinlto(
+ obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+ # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+ # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
+ # are also emitted next to the postimport bitcode, with the suffix
+ # .thinlto.bc instead
+ paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.3.import.bc")]
+
+ def make_spec(obj_file: str):
+ return TrainingIRExtractor(
+ # Cut away .3.import.bc
+ obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir,
+ )
+
+ return [make_spec(path) for path in paths]
+
+
+def run_extraction(
+ objs: List[TrainingIRExtractor],
+ num_workers: int,
+ llvm_objcopy_path: str,
+ cmd_filter: str,
+ thinlto_build: str,
+ cmd_section_name: str,
+ bitcode_section_name: str,
+):
+ """Extracts all specified object files into the corpus directory.
+
+ Args:
+ objs: A list of TrainingIRExtractor Objects that represent the object files
+ to extract bitcode/commands from.
+ num_workers: The number of parallel processes to spawn to run the
+ extraction.
+ llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
+ cmd_filter: A regular expression that is used to select for compilations
+ performed with specific flags. If you want to include all compilations,
+ set this to None.
+ thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
+ Set this to None if the build was not done with ThinLTO.
+ cmd_section_name: The name of the command line section created by the
+ bitcode embedding.
+ bitcode_section_name: The name of the bitcode section created by the
+ bitcode embedding.
+ """
+ extract_artifacts = functools.partial(
+ TrainingIRExtractor.extract,
+ llvm_objcopy_path=llvm_objcopy_path,
+ cmd_filter=cmd_filter,
+ thinlto_build=thinlto_build,
+ cmd_section_name=cmd_section_name,
+ bitcode_section_name=bitcode_section_name,
+ )
+
+ with multiprocessing.Pool(num_workers) as pool:
+ relative_output_paths = pool.map(extract_artifacts, objs)
+ pool.close()
+ pool.join()
+ return relative_output_paths
+
+
+def write_corpus_manifest(
+ thinlto_build: str, relative_output_paths: List[str], output_dir: str
+):
+ """Writes a corpus_manifest.json containing all necessary information about
+ the corpus.
+
+ Args:
+ thinlto_build: Whether or not the build was done with ThinLTO and if so,
+ what kind of ThinLTO. Set this to none if the build was not performed with
+ ThinLTO.
+ relative_output_paths: The relative (to the corpus directory) output paths
+ of all the bitcode files that should be placed in the corpus manifest
+ output_dir: The corpus directory where the corpus manifest should be
+ placed.
+ """
+ # This comes first rather than later so global_command_override is at the top
+ # of the .json after being written
+ if thinlto_build == "local":
+ corpus_description = {"global_command_override": _UNSPECIFIED_OVERRIDE}
+ else:
+ corpus_description = {}
+
+ corpus_description.update(
+ {
+ "has_thinlto": thinlto_build is not None,
+ "modules": [path for path in relative_output_paths if path is not None],
+ }
+ )
+
+ with open(
+ os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8"
+ ) as f:
+ json.dump(corpus_description, f, indent=2)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
new file mode 100644
index 000000000000000..edb0ecd853de246
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -0,0 +1,54 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+ --input_dir=<path to input directory> \
+ --output_dir=<path to output directory> \
+ --default_args="<list of space separated flags>"
+"""
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from mlgo.corpus import make_corpus_lib
+
+flags.DEFINE_string("input_dir", None, "The input directory.")
+flags.DEFINE_string("output_dir", None, "The output directory.")
+flags.DEFINE_string(
+ "default_args",
+ "",
+ "The compiler flags to compile with when using downstream tooling.",
+)
+
+flags.mark_flag_as_required("input_dir")
+flags.mark_flag_as_required("output_dir")
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+ logging.warning(
+ "Using this tool does not guarantee that the bitcode is taken at "
+ "the correct stage for consumption during model training. Make "
+ "sure to validate assumptions about where the bitcode is coming "
+ "from before using it in production."
+ )
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
+ make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir)
+ make_corpus_lib.write_corpus_manifest(
+ relative_paths, FLAGS.output_dir, FLAGS.default_args.split()
+ )
+
+
+def entrypoint():
+ app.run(main)
+
+
+if __name__ == "__main__":
+ entrypoint()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py
new file mode 100644
index 000000000000000..697c97ebf6ee29b
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py
@@ -0,0 +1,77 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Library functions for making a corpus from arbitrary bitcode."""
+
+import pathlib
+import os
+import shutil
+import json
+
+from typing import List, Optional
+
+BITCODE_EXTENSION = ".bc"
+
+
+def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
+ """Finds bitcode files to extract from a given directory.
+
+ Args:
+ bitcode_base_dir: The base directory where the bitcode to be copied
+ is from.
+ output_dir: The directory to place the bitcode in.
+
+ Returns an array of paths representing the relative path to the bitcode
+ file from the base directory.
+ """
+ paths = [
+ str(p)[: -len(BITCODE_EXTENSION)]
+ for p in pathlib.Path(bitcode_base_dir).glob("**/*" + BITCODE_EXTENSION)
+ ]
+
+ return [os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths]
+
+
+def copy_bitcode(
+ relative_paths: List[str], bitcode_base_dir: str, output_dir: str
+) -> None:
+ """Copies bitcode files from the base directory to the output directory.
+
+ Args:
+ relative_paths: An array of relative paths to bitcode files that are copied
+ over to the output directory, preserving relative location.
+ bitcode_base_dir: The base directory where the bitcode is located.
+ output_dir: The output directory to place the bitcode in.
+ """
+ for relative_path in relative_paths:
+ base_path = os.path.join(bitcode_base_dir, relative_path + BITCODE_EXTENSION)
+ destination_path = os.path.join(output_dir, relative_path + BITCODE_EXTENSION)
+ os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+ shutil.copy(base_path, destination_path)
+
+
+def write_corpus_manifest(
+ relative_output_paths: List[str],
+ output_dir: str,
+ default_args: Optional[List[str]] = None,
+) -> None:
+ """Creates a corpus manifest describing the bitcode that has been found.
+
+ Args:
+ relative_output_paths: A list of paths to each bitcode file relative to the
+ output directory.
+ output_dir: The output directory where the corpus is being created.
+ default_args: An array of compiler flags that should be used to compile
+ the bitcode when using further downstream tooling."""
+ if default_args is None:
+ default_args = []
+ corpus_description = {
+ "global_command_override": default_args,
+ "has_thinlto": False,
+ "modules": [path for path in relative_output_paths if path is not None],
+ }
+
+ with open(
+ os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8"
+ ) as description_file:
+ json.dump(corpus_description, description_file, indent=2)
diff --git a/llvm/utils/mlgo-utils/pyproject.toml b/llvm/utils/mlgo-utils/pyproject.toml
new file mode 100644
index 000000000000000..be2af86cd05df30
--- /dev/null
+++ b/llvm/utils/mlgo-utils/pyproject.toml
@@ -0,0 +1,25 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "mlgo"
+description = "Tooling for ML in LLVM"
+readme = "README.md"
+requires-python = ">=3.8,<3.11"
+dependencies = [
+ "absl-py>=1.0.0"
+]
+dynamic = ["version"]
+license = {text = "Apache-2.0 WITH LLVM-exception"}
+classifiers = [
+ "License :: OSI Approved :: Apache Software License"
+]
+
+[tool.setuptools.dynamic]
+version = {attr = "mlgo.__version__"}
+
+[project.scripts]
+combine_training_corpus = "mlgo.combine_training_corpus:entrypoint"
+extract_ir = "mlgo.extract_ir:entrypoint"
+make_corpus = "mlgo.make_corpus:entrypoint"
diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py
new file mode 100644
index 000000000000000..dbfdc78a87533b5
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py
@@ -0,0 +1,132 @@
+# REQUIRES: python-38, absl
+
+## Test the functionality of combine_training_corpus_lib
+
+import json
+import os
+import sys
+
+from mlgo.corpus import combine_training_corpus_lib
+
+## Test that combining two training corpora works as expected
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subcorpus1
+# RUN: mkdir %t.dir/subcorpus2
+# RUN: %python %s test_combine_corpus %t.dir | FileCheck %s --check-prefix CHECK-COMBINE-CORPUS
+
+
+def test_combine_corpus(corpus_dir):
+ subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1")
+ subcorpus2_dir = os.path.join(corpus_dir, "subcorpus2")
+ subcorpus1_description = {
+ "has_thinlto": False,
+ "modules": ["test1.o", "test2.o"],
+ }
+ subcorpus2_description = {
+ "has_thinlto": False,
+ "modules": ["test3.o", "test4.o"],
+ }
+ with open(
+ os.path.join(subcorpus1_dir, "corpus_description.json"), "w"
+ ) as corpus1_description_handle:
+ json.dump(subcorpus1_description, corpus1_description_handle)
+ with open(
+ os.path.join(subcorpus2_dir, "corpus_description.json"), "w"
+ ) as corpus2_description_handle:
+ json.dump(subcorpus2_description, corpus2_description_handle)
+ combine_training_corpus_lib.combine_corpus(corpus_dir)
+ with open(
+ os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+ ) as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ print(combined_corpus_description["has_thinlto"])
+ # CHECK-COMBINE-CORPUS: False
+ for module in sorted(combined_corpus_description["modules"]):
+ print(module)
+ # CHECK-COMBINE-CORPUS: subcorpus1/test1.o
+ # CHECK-COMBINE-CORPUS: subcorpus1/test2.o
+ # CHECK-COMBINE-CORPUS: subcorpus2/test3.o
+ # CHECK-COMBINE-CORPUS: subcorpus2/test4.o
+
+
+## Test that we handle the empty folder case gracefully
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subcorpus1
+# RUN: mkdir %t.dir/empty_dir
+# RUN: %python %s test_empty_folder %t.dir | FileCheck %s --check-prefix CHECK-EMPTY-DIR
+
+
+def test_empty_folder(corpus_dir):
+ subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1")
+ subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
+ with open(
+ os.path.join(subcorpus1_dir, "corpus_description.json"), "w"
+ ) as subcorpus1_description_handle:
+ json.dump(subcorpus1_description, subcorpus1_description_handle)
+ combine_training_corpus_lib.combine_corpus(corpus_dir)
+ with open(
+ os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+ ) as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ print(len(combined_corpus_description["modules"]))
+ # CHECK-EMPTY-DIR: 2
+
+
+## Test that we ignore extra files that will not end up contributing to the
+## corpus.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subcorpus1
+# RUN: touch %t.dir/empty.log
+# RUN: %python %s test_ignore_extra_file %t.dir | FileCheck %s --check-prefix CHECK-IGNORE-EXTRA-FILE
+
+
+def test_ignore_extra_file(corpus_dir):
+ subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1")
+ subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
+ with open(
+ os.path.join(subcorpus1_dir, "corpus_description.json"), "w"
+ ) as subcorpus1_description_handle:
+ json.dump(subcorpus1_description, subcorpus1_description_handle)
+ combine_training_corpus_lib.combine_corpus(corpus_dir)
+ with open(
+ os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+ ) as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ print(len(combined_corpus_description["modules"]))
+ # CHECK-IGNORE-EXTRA-FILE: 2
+
+
+## Test that we raise an error in the case where the corpora differ in a
+## substantial way.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subcorpus1
+# RUN: mkdir %t.dir/subcorpus2
+# RUN: %python %s test_different_corpora %t.dir | FileCheck %s --check-prefix CHECK-DIFFERENT-CORPORA
+
+
+def test_different_corpora(corpus_dir):
+ subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1")
+ subcorpus2_dir = os.path.join(corpus_dir, "subcorpus2")
+ subcorpus1_description = {"has_thinlto": False, "modules": ["test1.o"]}
+ subcorpus2_description = {"has_thinlto": True, "modules": ["test2.o"]}
+ with open(
+ os.path.join(subcorpus1_dir, "corpus_description.json"), "w"
+ ) as subcorpus1_description_handle:
+ json.dump(subcorpus1_description, subcorpus1_description_handle)
+ with open(
+ os.path.join(subcorpus2_dir, "corpus_description.json"), "w"
+ ) as subcorpus2_description_handle:
+ json.dump(subcorpus2_description, subcorpus2_description_handle)
+ try:
+ combine_training_corpus_lib.combine_corpus(corpus_dir)
+ except ValueError:
+ print("ValueError")
+ # CHECK-DIFFERENT-CORPORA: ValueError
+
+
+if __name__ == "__main__":
+ globals()[sys.argv[1]](*sys.argv[2:])
diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
new file mode 100644
index 000000000000000..3ed52ec21de953c
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
@@ -0,0 +1,339 @@
+# REQUIRES: python-38, absl
+
+## Test the functionality of extract_ir_lib
+
+import os.path
+import sys
+
+from mlgo.corpus import extract_ir_lib
+
+## Test that we can convert a compilation database with a single compilation
+## command in it.
+
+# RUN: %python %s test_one_conversion | FileCheck %s --check-prefix CHECK-ONE-CONVERSION
+
+
+def test_one_conversion():
+ obj = extract_ir_lib.convert_compile_command_to_objectfile(
+ {
+ "directory": "/output/directory",
+ "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
+ "file": "/some/path/lib/foo/bar.cc",
+ },
+ "/corpus/destination/path",
+ )
+ print(obj.input_obj())
+ # CHECK-ONE-CONVERSION: /output/directory/lib/bar.o
+ print(obj.relative_output_path())
+ # CHECK-ONE-CONVERSION: lib/bar.o
+ print(obj.cmd_file())
+ # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.cmd
+ print(obj.bc_file())
+ # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.bc
+ print(obj.thinlto_index_file())
+ # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.thinlto.bc
+
+
+## Test that we can convert an arguments style compilation database
+
+# RUN: %python %s test_one_conversion_arguments_style | FileCheck %s --check-prefix CHECK-ARGUMENTS-STYLE
+
+
+def test_one_conversion_arguments_style():
+ obj = extract_ir_lib.convert_compile_command_to_objectfile(
+ {
+ "directory": "/output/directory",
+ "arguments": [
+ "-cc1",
+ "-c",
+ "/some/path/lib/foo/bar.cc",
+ "-o",
+ "lib/bar.o",
+ ],
+ "file": "/some/path/lib/foo/bar.cc",
+ },
+ "/corpus/destination/path",
+ )
+ print(obj.input_obj())
+ # CHECK-ARGUMENTS-STYLE: /output/directory/lib/bar.o
+ print(obj.relative_output_path())
+ # CHECK-ARGUMENTS-STYLE: lib/bar.o
+ print(obj.cmd_file())
+ # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.cmd
+ print(obj.bc_file())
+ # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.bc
+ print(obj.thinlto_index_file())
+ # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.thinlto.bc
+
+
+## Test that converting multiple files works as well
+
+# RUN: %python %s test_multiple_conversion | FileCheck %s --check-prefix CHECK-MULTIPLE-CONVERSION
+
+
+def test_multiple_conversion():
+ res = extract_ir_lib.load_from_compile_commands(
+ [
+ {
+ "directory": "/output/directory",
+ "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
+ "file": "/some/path/lib/foo/bar.cc",
+ },
+ {
+ "directory": "/output/directory",
+ "command": "-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o",
+ "file": "/some/path/lib/foo/baz.cc",
+ },
+ ],
+ "/corpus/destination/path",
+ )
+ res = list(res)
+ print(res[0].input_obj())
+ # CHECK-MULTIPLE-CONVERSION: /output/directory/lib/bar.o
+ print(res[0].relative_output_path())
+ # CHECK-MULTIPLE-CONVERSION: lib/bar.o
+ print(res[0].cmd_file())
+ # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.cmd
+ print(res[0].bc_file())
+ # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.bc
+ print(res[0].thinlto_index_file())
+ # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.thinlto.bc
+
+ print(res[1].input_obj(), "/output/directory/lib/other/baz.o")
+ # CHECK-MULTIPLE-CONVERSION: /output/directory/lib/other/baz.o
+ print(res[1].relative_output_path(), "lib/other/baz.o")
+ # CHECK-MULTIPLE-CONVERSION: lib/other/baz.o
+ print(res[1].cmd_file())
+ # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.cmd
+ print(res[1].bc_file())
+ # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.bc
+ print(res[1].thinlto_index_file())
+ # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.thinlto.bc
+
+
+## Test that we generate the correct objcopy commands for extracting commands
+
+# RUN: %python %s test_command_extraction | FileCheck %s --check-prefix CHECK-COMMAND-EXTRACT
+
+
+def test_command_extraction():
+ obj = extract_ir_lib.TrainingIRExtractor(
+ obj_relative_path="lib/obj_file.o",
+ output_base_dir="/where/corpus/goes",
+ obj_base_dir="/foo/bar",
+ )
+ extraction_cmd1 = obj._get_extraction_cmd_command(
+ "/bin/llvm_objcopy_path", ".llvmcmd"
+ )
+ for part in extraction_cmd1:
+ print(part)
+ # CHECK-COMMAND-EXTRACT: /bin/llvm_objcopy_path
+ # CHECK-COMMAND-EXTRACT: --dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd
+ # CHECK-COMMAND-EXTRACT: /foo/bar/lib/obj_file.o
+ # CHECK-COMMAND-EXTRACT: /dev/null
+
+ extraction_cmd2 = obj._get_extraction_bc_command(
+ "/bin/llvm_objcopy_path", ".llvmbc"
+ )
+ for part in extraction_cmd2:
+ print(part)
+ # CHECK-COMMAND-EXTRACT: /bin/llvm_objcopy_path
+ # CHECK-COMMAND-EXTRACT: --dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc
+ # CHECK-COMMAND-EXTRACT: /foo/bar/lib/obj_file.o
+ # CHECK-COMMAND-EXTRACT: /dev/null
+
+
+## Test that we generate the correct extraction commands without specifying
+## an output base directory.
+
+# RUN: %python %s test_command_extraction_no_basedir | FileCheck %s --check-prefix CHECK-COMMAND-EXTRACT-NOBASEDIR
+
+
+def test_command_extraction_no_basedir():
+ obj = extract_ir_lib.TrainingIRExtractor("lib/obj_file.o", "/where/corpus/goes")
+ extraction_cmd1 = obj._get_extraction_cmd_command(
+ "/bin/llvm_objcopy_path", ".llvmcmd"
+ )
+ for part in extraction_cmd1:
+ print(part)
+ # CHECK-COMMAND-EXTRACT-NOBASEDIR: /bin/llvm_objcopy_path
+ # CHECK-COMMAND-EXTRACT-NOBASEDIR: --dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd
+ # CHECK-COMMAND-EXTRACT-NOBASEDIR: lib/obj_file.o
+ # CHECK-COMMAND-EXTRACT-NOBASEDIR: /dev/null
+
+ extraction_cmd2 = obj._get_extraction_bc_command(
+ "/bin/llvm_objcopy_path", ".llvmbc"
+ )
+ for part in extraction_cmd2:
+ print(part)
+ # CHECK-COMMAND-EXTRACT-NOBASEDIR: /bin/llvm_objcopy_path
+ # CHECK-COMMAND-EXTRACT-NOBASEDIR: --dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc
+ # CHECK-COMMAND-EXTRACT-NOBASEDIR: lib/obj_file.o
+ # CHECK-COMMAND-EXTRACT-NOBASEDIR: /dev/null
+
+
+## Test that we can extract a corpus from lld parameters
+
+# RUN: %python %s test_lld_params | FileCheck %s --check-prefix CHECK-LLD-PARAMS
+
+
+def test_lld_params():
+ lld_opts = [
+ "-o",
+ "output/dir/exe",
+ "lib/obj1.o",
+ "somelib.a",
+ "-W,blah",
+ "lib/dir/obj2.o",
+ ]
+ obj = extract_ir_lib.load_from_lld_params(lld_opts, "/some/path", "/tmp/out")
+ print(obj[0].input_obj())
+ # CHECK-LLD-PARAMS: /some/path/lib/obj1.o
+ print(obj[0].relative_output_path())
+ # CHECK-LLD-PARAMS: lib/obj1.o
+ print(obj[0].cmd_file())
+ # CHECK-LLD-PARAMS: /tmp/out/lib/obj1.o.cmd
+ print(obj[0].thinlto_index_file())
+ # CHECK-LLD-PARAMS: /tmp/out/lib/obj1.o.thinlto.bc
+ print(obj[1].input_obj())
+ # CHECK-LLD-PARAMS: /some/path/lib/dir/obj2.o
+
+
+## Test that we can load a corpus from a directory containing object files
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/subdir
+# RUN: touch %t.dir/subdir/test1.o
+# RUN: touch %t.dir/subdir/test2.o
+# RUN: %python %s test_load_from_directory %t.dir | FileCheck %s --check-prefix CHECK-LOAD-DIR
+
+
+def test_load_from_directory(tempdir):
+ objs = extract_ir_lib.load_from_directory(tempdir, "/output")
+ for index, obj in enumerate(sorted(objs, key=lambda x: x._obj_relative_path)):
+ print(obj._obj_relative_path, f"subdir/test{index + 1:d}.o")
+ # CHECK-LOAD-DIR: subdir/test1.o
+ # Explicitly check for equality here as we can not check within
+ # FileCheck the exact value as lit substitutions do not work in
+ # FileCheck lines.
+ print(obj._obj_base_dir == tempdir)
+ # CHECK-LOAD-DIR: True
+ print(obj._output_base_dir)
+ # CHECK-LOAD-DIR: /output
+
+
+## Test that we can load a corpus in the lld thinLTO case
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: touch %t.dir/1.3.import.bc
+# RUN: touch %t.dir/2.3.import.bc
+# RUN: touch %t.dir/3.3.import.bc
+# RUN: touch %t.dir/1.thinlto.bc
+# RUN: touch %t.dir/2.thinlto.bc
+# RUN: touch %t.dir/3.thinlto.bc
+# RUN: %python %s test_lld_thinlto_discovery %t.dir | FileCheck %s --check-prefix CHECK-LLD-THINLTO-DISCOVERY
+
+
+def test_lld_thinlto_discovery(tempdir):
+ obj = extract_ir_lib.load_for_lld_thinlto(tempdir, "/output")
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ print(o._obj_relative_path)
+ # Explicitly check for equality as we can not check within FileCheck
+ # using the lit substitution for the temp dir
+ print(o._obj_base_dir == tempdir)
+ print(o._output_base_dir) # outdir
+ # CHECK-LLD-THINLTO-DISCOVERY: 1
+ # CHECK-LLD-THINLTO-DISCOVERY: True
+ # CHECK-LLD-THINLTO-DISCOVERY: /output
+ # CHECK-LLD-THINLTO-DISCOVERY: 2
+ # CHECK-LLD-THINLTO-DISCOVERY: True
+ # CHECK-LLD-THINLTO-DISCOVERY: /output
+ # CHECK-LLD-THINLTO-DISCOVERY: 3
+ # CHECK-LLD-THINLTO-DISCOVERY: True
+ # CHECK-LLD-THINLTO-DISCOVERY: /output
+
+
+## Test that we can load a corpus in the nested lld thinLTO case
+
+# RUN: mkdir %t.dir/nest
+# RUN: mv %t.dir/*.bc %t.dir/nest
+# RUN: %python %s test_lld_thinlto_discovery_nested %t.dir | FileCheck %s --check-prefix CHECK-LLD-THINLTO-DISCOVERY-NESTED
+
+
+def test_lld_thinlto_discovery_nested(outer):
+ obj = extract_ir_lib.load_for_lld_thinlto(outer, "/output")
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ print(o._obj_relative_path)
+ print(o._obj_base_dir == outer)
+ print(o._output_base_dir)
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/1
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/2
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/3
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+ # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+
+
+## Test the lld extraction works as expected
+
+# RUN: rm -rf %t.dir.out && mkdir %t.dir.out
+# RUN: %python %s test_lld_thinlto_extraction %t.dir %t.dir.out | FileCheck %s --check-prefix CHECK-LLD-THINLTO-EXTRACTION-PY
+# ls %t.dir.out/nest | FileCheck %s --check-prefix CHECK-LLD-THINLTO-EXTRACTION
+
+# CHECK-LLD-THINLTO-EXTRACTION: 1
+# CHECK-LLD-THINLTO-EXTRACTION: 2
+# CHECK-LLD-THINLTO-EXTRACTION: 3
+# CHECK-LLD-THINLTO-EXTRACTION: 1.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 2.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 3.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 1.thinlto.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 2.thinlto.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 3.thinlto.bc
+
+
+def test_lld_thinlto_extraction(outer, outdir):
+ obj = extract_ir_lib.load_for_lld_thinlto(outer, outdir)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ mod_path = o.extract(thinlto_build="local")
+ print(mod_path)
+ # CHECK-LLD-THINLTO-EXTRACTION-PY: 1
+ # CHECK-LLD-THINLTO-EXTRACTION-PY: 2
+ # CHECK-LLD-THINLTO-EXTRACTION-PY: 3
+
+
+## Test that filtering works correctly
+
+# RUN: %python %s test_filtering | FileCheck %s --check-prefix CHECK-TEST-FILTERING
+
+
+def test_filtering():
+ cmdline = "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o"
+ print(extract_ir_lib.should_include_module(cmdline, None))
+ # CHECK-TEST-FILTERING: True
+ print(extract_ir_lib.should_include_module(cmdline, ".*"))
+ # CHECK-TEST-FILTERING: True
+ print(extract_ir_lib.should_include_module(cmdline, "^-Oz$"))
+ # CHECK-TEST-FILTERING: True
+ print(extract_ir_lib.should_include_module(cmdline, "^-O3$"))
+ # CHECK-TEST-FILTERING: False
+
+
+## Test that we extract the thinLTO index correctly
+
+# RUN: %python %s test_thinlto_index_extractor | FileCheck %s --check-prefix CHECK-THINLTO-INDEX-EXTRACTOR
+
+
+def test_thinlto_index_extractor():
+ cmdline = (
+ "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/"
+ "out.o\0-fthinlto-index=foo/bar.thinlto.bc"
+ )
+ print(extract_ir_lib.get_thinlto_index(cmdline, "/the/base/dir"))
+ # CHECK-THINLTO-INDEX-EXTRACTOR: /the/base/dir/foo/bar.thinlto.bc
+
+
+if __name__ == "__main__":
+ globals()[sys.argv[1]](*sys.argv[2:])
diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py
new file mode 100644
index 000000000000000..0f970414b1aecde
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py
@@ -0,0 +1,71 @@
+# REQUIRES: python-38, absl
+
+## Test the functionality of make_corpus_lib
+
+import json
+import os
+import sys
+
+from mlgo.corpus import make_corpus_lib
+
+## Test that when we load the bitcode from a directory using the
+## load_bitcode_from_directory function, we get the expected results.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/nested
+# RUN: touch %t.dir/nested/test1.bc
+# RUN: touch %t.dir/nested/test2.bc
+# RUN: %python %s test_load_bitcode_from_directory %t.dir | FileCheck %s --check-prefix CHECK-LOAD
+
+
+def test_load_bitcode_from_directory(work_dir):
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(work_dir)
+ relative_paths = sorted(relative_paths)
+ for relative_path in relative_paths:
+ print(relative_path)
+ # CHECK-LOAD: nested/test1
+ # CHECK-LOAD: nested/test2
+
+
+## Test that when we copy the bitcode given a list of relative paths, the
+## appropriate files are copied over.
+
+# RUN: rm -rf %t.dir1 && mkdir %t.dir1
+# RUN: %python %s test_copy_bitcode %t.dir %t.dir1
+# RUN: ls %t.dir1/nested | FileCheck %s --check-prefix CHECK-COPY
+
+# CHECK-COPY: test1.bc
+# CHECK-COPY: test2.bc
+
+
+def test_copy_bitcode(directory, output_dir):
+ relative_paths = ["nested/test1", "nested/test2"]
+ make_corpus_lib.copy_bitcode(relative_paths, directory, output_dir)
+
+
+## Test that we get the expected corpus manifest when writing a corpus
+## manifest to the specified directory.
+
+# RUN: %python %s test_write_corpus_manifest %t.dir1 | FileCheck %s --check-prefix CHECK-MANIFEST
+
+
+def test_write_corpus_manifest(output_dir):
+ relative_output_paths = ["test/test1", "test/test2"]
+ default_args = ["-O3", "-c"]
+ make_corpus_lib.write_corpus_manifest(
+ relative_output_paths, output_dir, default_args
+ )
+ with open(
+ os.path.join(output_dir, "corpus_description.json"), encoding="utf-8"
+ ) as corpus_description_file:
+ corpus_description = json.load(corpus_description_file)
+ print(corpus_description["global_command_override"])
+ # CHECK-MANIFEST: ['-O3', '-c']
+ print(corpus_description["has_thinlto"])
+ # CHECK-MANIFEST: False
+ print(corpus_description["modules"])
+ # CHECK-MANIFEST: ['test/test1', 'test/test2']
+
+
+if __name__ == "__main__":
+ globals()[sys.argv[1]](*sys.argv[2:])
diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg
new file mode 100644
index 000000000000000..055f0945942fc1c
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/lit.cfg
@@ -0,0 +1,15 @@
+import lit.formats
+
+from lit.llvm import llvm_config
+
+config.name = "mlgo-utils"
+config.test_format = lit.formats.ShTest(execute_external=False)
+
+config.suffixes = [".py"]
+
+config.test_source_root = os.path.dirname(__file__)
+config.test_exec_root = config.obj_root
+
+config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-utils")
+
+llvm_config.use_default_substitutions()
diff --git a/llvm/utils/mlgo-utils/tests/lit.local.cfg b/llvm/utils/mlgo-utils/tests/lit.local.cfg
new file mode 100644
index 000000000000000..90cdf8ba618ed8f
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/lit.local.cfg
@@ -0,0 +1,14 @@
+import sys
+
+# TODO(boomanaiden154): Remove this flag once the minimum Python version for
+# the entire project has been bumped to 3.8.
+if sys.version_info > (3,8):
+ config.available_features.add("python-38")
+
+# TODO(boomanaiden154): Remove this flag once the scripts are converted to
+# not use absl anymore.
+try:
+ import absl
+ config.available_features.add("absl")
+except:
+ pass
diff --git a/llvm/utils/mlgo-utils/tests/lit.site.cfg.in b/llvm/utils/mlgo-utils/tests/lit.site.cfg.in
new file mode 100644
index 000000000000000..22e1524e6a8fd20
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/lit.site.cfg.in
@@ -0,0 +1,10 @@
+@LIT_SITE_CFG_IN_HEADER@
+
+config.src_root = "@LLVM_SOURCE_DIR@"
+config.obj_root = "@LLVM_BINARY_DIR@"
+config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
+
+import lit.llvm
+lit.llvm.initialize(lit_config, config)
+
+lit_config.load_config(config, "@LLVM_SOURCE_DIR@/utils/mlgo-utils/tests/lit.cfg")
More information about the llvm-commits
mailing list