[llvm] [mlgo-utils] Hoist entrypoint scripts to mlgo-utils directory (PR #146981)
Vincent Lee via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 9 02:11:20 PDT 2025
https://github.com/thevinster updated https://github.com/llvm/llvm-project/pull/146981
>From 497d8f76d2df3dbd396c8b734b96ce81b12dcea7 Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince at fb.com>
Date: Thu, 3 Jul 2025 18:22:55 -0700
Subject: [PATCH 1/6] [mlgo-utils] Hoist entry script out to the correct
directory
---
.../mlgo-utils/{mlgo/corpus => }/combine_training_corpus.py | 0
llvm/utils/mlgo-utils/{mlgo/corpus => }/extract_ir.py | 0
llvm/utils/mlgo-utils/{mlgo/corpus => }/make_corpus.py | 0
utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 6 +++---
4 files changed, 3 insertions(+), 3 deletions(-)
rename llvm/utils/mlgo-utils/{mlgo/corpus => }/combine_training_corpus.py (100%)
rename llvm/utils/mlgo-utils/{mlgo/corpus => }/extract_ir.py (100%)
rename llvm/utils/mlgo-utils/{mlgo/corpus => }/make_corpus.py (100%)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
similarity index 100%
rename from llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
rename to llvm/utils/mlgo-utils/combine_training_corpus.py
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
similarity index 100%
rename from llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
rename to llvm/utils/mlgo-utils/extract_ir.py
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
similarity index 100%
rename from llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
rename to llvm/utils/mlgo-utils/make_corpus.py
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index b618c74c19da1..db8a92fd25de6 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -5210,8 +5210,8 @@ py_binary(
py_binary(
name = "extract_ir",
srcs = [
+ "utils/mlgo-utils/extract_ir.py",
"utils/mlgo-utils/mlgo/__init__.py",
- "utils/mlgo-utils/mlgo/corpus/extract_ir.py",
"utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py",
"utils/mlgo-utils/mlgo/corpus/flags.py",
],
@@ -5221,8 +5221,8 @@ py_binary(
py_binary(
name = "combine_training_corpus",
srcs = [
+ "utils/mlgo-utils/combine_training_corpus.py",
"utils/mlgo-utils/mlgo/__init__.py",
- "utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py",
"utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py",
"utils/mlgo-utils/mlgo/corpus/flags.py",
],
@@ -5232,8 +5232,8 @@ py_binary(
py_binary(
name = "make_corpus",
srcs = [
+ "utils/mlgo-utils/make_corpus.py",
"utils/mlgo-utils/mlgo/__init__.py",
- "utils/mlgo-utils/mlgo/corpus/make_corpus.py",
"utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py",
],
imports = ["utils/mlgo-utils"],
>From ee72352a0ec409fe3f88bb09136261923c24d8ba Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince at fb.com>
Date: Fri, 4 Jul 2025 11:13:02 -0700
Subject: [PATCH 2/6] Add wrapper instead of moving
---
.../mlgo-utils/combine_training_corpus.py | 57 +-----
llvm/utils/mlgo-utils/extract_ir.py | 189 +-----------------
llvm/utils/mlgo-utils/make_corpus.py | 58 +-----
.../mlgo/corpus/combine_training_corpus.py | 52 +++++
.../mlgo-utils/mlgo/corpus/extract_ir.py | 184 +++++++++++++++++
.../mlgo-utils/mlgo/corpus/make_corpus.py | 53 +++++
6 files changed, 310 insertions(+), 283 deletions(-)
mode change 100644 => 100755 llvm/utils/mlgo-utils/combine_training_corpus.py
mode change 100644 => 100755 llvm/utils/mlgo-utils/extract_ir.py
mode change 100644 => 100755 llvm/utils/mlgo-utils/make_corpus.py
create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
old mode 100644
new mode 100755
index 9884d6696a43f..7a1d870ad7e38
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -1,52 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-r"""Combine multiple training corpus into a single training corpus.
+#!/usr/bin/env python3
-Currently only support the case that multiple corpus share the same
-configurables except the "modules" field.
+import re
+import sys
+from mlgo.corpus.combine_training_corpus import parse_args_and_run
+if __name__ == '__main__':
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+ sys.exit(parse_args_and_run())
-Usage: we'd like to combine training corpus corpus1 and corpus2 into
-combinedcorpus; we first structure the files as follows:
-
-combinedcorpus
-combinedcorpus/corpus1
-combinedcorpus/corpus2
-
-Running this script with
-
-python3 \
-compiler_opt/tools/combine_training_corpus.py \
- --root_dir=$PATH_TO_combinedcorpus
-
-generates combinedcorpus/corpus_description.json file. In this way corpus1
-and corpus2 are combined into combinedcorpus.
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import combine_training_corpus_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
- parser = argparse.ArgumentParser(
- description="A tool for combining multiple training corpora"
- )
- parser.add_argument(
- "--root_dir", type=str, help="The root dir of module paths to combine."
- )
- flags.add_verbosity_arguments(parser)
- args = parser.parse_args()
- main(args)
-
-
-def main(args):
- logging.basicConfig(level=args.verbosity)
-
- combine_training_corpus_lib.combine_corpus(args.root_dir)
-
-
-if __name__ == "__main__":
- parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
old mode 100644
new mode 100755
index 3101cef196b4a..589a5c50af726
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -1,184 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Extract IR for training.
+#!/usr/bin/env python3
-Extract IR for training, either from a compile_commands.json file produced by
-cmake, or a linker parameter list file.
+import re
+import sys
+from mlgo.corpus.extract_ir import parse_args_and_run
+if __name__ == '__main__':
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+ sys.exit(parse_args_and_run())
-Only run with
-'python compiler_opt/tools/extract_ir.py ...'
-
-The compilation is assumed to have been performed with clang, using
--fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
-
-In a distributed ThinLTO case, the compilation is assumed to have been performed
-specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
-
-In a local ThinLTO case, the compilation is assumedto have been performed
-specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
-
-To change the logging verbosity, set the --verbosity flag to the desired level.
-Setting it to a specific level will enable all messages at that level and
-higher. Exact values can be found by invoking the script with --help.
-"""
-
-import argparse
-import json
-import logging
-
-from mlgo.corpus import extract_ir_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
- parser = argparse.ArgumentParser(
- description="A tool for making a corpus from build artifacts"
- )
- parser.add_argument(
- "--input",
- type=str,
- help="Input file or directory - either compile_commands.json, a linker "
- "parameter list, or a path to a directory containing object files.",
- )
- parser.add_argument(
- "--input_type",
- type=str,
- help="Input file type - JSON, LLD params, directory, or bazel aquery.",
- choices=["json", "params", "directory", "bazel_aquery"],
- default="json",
- nargs="?",
- )
- parser.add_argument("--output_dir", type=str, help="Output directory")
- parser.add_argument(
- "--num_workers",
- type=int,
- help="Number of parallel works for objcopy. `None` for maximum available.",
- default=None,
- nargs="?",
- )
- parser.add_argument(
- "--llvm_objcopy_path",
- type=str,
- help="Path to llvm-objcopy",
- default="llvm-objcopy",
- nargs="?",
- )
- parser.add_argument(
- "--obj_base_dir",
- type=str,
- help="Base directory for object files. Defaults to current working dir.",
- default="",
- nargs="?",
- )
- parser.add_argument(
- "--cmd_filter",
- type=str,
- help="Include only those modules with a command line matching this regular "
- "expression. Set it to None to not perform any filtering. Note that the "
- "regular expression is applied independently for each separate command line "
- "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
- "with thinlto_build=lld.",
- default=None,
- nargs="?",
- )
- parser.add_argument(
- "--thinlto_build",
- type=str,
- help="Set if the build was performed with either 'distributed' or 'local' "
- "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
- "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
- "the distributed case or -Wl,--save-temps=import and "
- "-Wl,--thinlto-emit-index-files passed in the local case",
- choices=["distributed", "local"],
- default=None,
- nargs="?",
- )
- parser.add_argument(
- "--cmd_section_name",
- type=str,
- help="The section name passed to llvm-objcopy. For ELF object files, the "
- "default .llvmcmd is correct. For Mach-O object files, one should use "
- "something like __LLVM,__cmdline",
- default=".llvmcmd",
- nargs="?",
- )
- parser.add_argument(
- "--bitcode_section_name",
- type=str,
- help="The section name passed to llvm-objcopy. For ELF object files, the "
- "default .llvmbc is correct. For Mach-O object files, one should use "
- "__LLVM,__bitcode",
- default=".llvmbc",
- nargs="?",
- )
- flags.add_verbosity_arguments(parser)
- args = parser.parse_args()
- main(args)
-
-
-def main(args):
- logging.basicConfig(level=args.verbosity)
-
- objs = []
- if args.input is not None and args.thinlto_build == "local":
- raise ValueError("--thinlto_build=local cannot be run with --input")
- if args.input is None:
- if args.thinlto_build != "local":
- raise ValueError("--input or --thinlto_build=local must be provided")
- objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
- elif args.input_type == "json":
- with open(args.input, encoding="utf-8") as f:
- objs = extract_ir_lib.load_from_compile_commands(
- json.load(f), args.output_dir
- )
- elif args.input_type == "params":
- if not args.obj_base_dir:
- logging.info(
- "-obj_base_dir is unspecified, assuming current directory. "
- "If no objects are found, use this option to specify the root "
- "directory for the object file paths in the input file."
- )
- with open(args.input, encoding="utf-8") as f:
- objs = extract_ir_lib.load_from_lld_params(
- [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
- )
- elif args.input_type == "directory":
- logging.warning(
- "Using the directory input is only recommended if the build system "
- "your project uses does not support any structured output that "
- "ml-compiler-opt understands. If your build system provides a "
- "structured compilation database, use that instead"
- )
- objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
- elif args.input_type == "bazel_aquery":
- with open(args.input, encoding="utf-8") as aquery_json_handle:
- objs = extract_ir_lib.load_bazel_aquery(
- json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
- )
- else:
- logging.error("Unknown input type: %s", args.input_type)
-
- relative_output_paths = extract_ir_lib.run_extraction(
- objs,
- args.num_workers,
- args.llvm_objcopy_path,
- args.cmd_filter,
- args.thinlto_build,
- args.cmd_section_name,
- args.bitcode_section_name,
- )
-
- extract_ir_lib.write_corpus_manifest(
- args.thinlto_build, relative_output_paths, args.output_dir
- )
-
- logging.info(
- "Converted %d files out of %d",
- len(objs) - relative_output_paths.count(None),
- len(objs),
- )
-
-
-if __name__ == "__main__":
- parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
old mode 100644
new mode 100755
index 221486e16c6e0..5b4a9bef486ff
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -1,53 +1,9 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Tool for making a corpus from arbitrary bitcode.
+#!/usr/bin/env python3
-To create a corpus from a set of bitcode files in an input directory, run
-the following command:
+import re
+import sys
+from mlgo.corpus.make_corpus import parse_args_and_run
+if __name__ == '__main__':
+ sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+ sys.exit(parse_args_and_run())
-PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
- --input_dir=<path to input directory> \
- --output_dir=<path to output directory> \
- --default_args="<list of space separated flags>"
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import make_corpus_lib
-
-
-def parse_args_and_run():
- parser = argparse.ArgumentParser(
- description="A tool for making a corpus from arbitrary bitcode"
- )
- parser.add_argument("--input_dir", type=str, help="The input directory.")
- parser.add_argument("--output_dir", type=str, help="The output directory.")
- parser.add_argument(
- "--default_args",
- type=str,
- help="The compiler flags to compile with when using downstream tooling.",
- default="",
- nargs="?",
- )
- args = parser.parse_args()
- main(args)
-
-
-def main(args):
- logging.warning(
- "Using this tool does not guarantee that the bitcode is taken at "
- "the correct stage for consumption during model training. Make "
- "sure to validate assumptions about where the bitcode is coming "
- "from before using it in production."
- )
- relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
- make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
- make_corpus_lib.write_corpus_manifest(
- relative_paths, args.output_dir, args.default_args.split()
- )
-
-
-if __name__ == "__main__":
- parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
new file mode 100644
index 0000000000000..9884d6696a43f
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1,52 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpus into a single training corpus.
+
+Currently only support the case that multiple corpus share the same
+configurables except the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+ --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import combine_training_corpus_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+ parser = argparse.ArgumentParser(
+ description="A tool for combining multiple training corpora"
+ )
+ parser.add_argument(
+ "--root_dir", type=str, help="The root dir of module paths to combine."
+ )
+ flags.add_verbosity_arguments(parser)
+ args = parser.parse_args()
+ main(args)
+
+
+def main(args):
+ logging.basicConfig(level=args.verbosity)
+
+ combine_training_corpus_lib.combine_corpus(args.root_dir)
+
+
+if __name__ == "__main__":
+ parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
new file mode 100644
index 0000000000000..3101cef196b4a
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1,184 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumedto have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, set the --verbosity flag to the desired level.
+Setting it to a specific level will enable all messages at that level and
+higher. Exact values can be found by invoking the script with --help.
+"""
+
+import argparse
+import json
+import logging
+
+from mlgo.corpus import extract_ir_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+ parser = argparse.ArgumentParser(
+ description="A tool for making a corpus from build artifacts"
+ )
+ parser.add_argument(
+ "--input",
+ type=str,
+ help="Input file or directory - either compile_commands.json, a linker "
+ "parameter list, or a path to a directory containing object files.",
+ )
+ parser.add_argument(
+ "--input_type",
+ type=str,
+ help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+ choices=["json", "params", "directory", "bazel_aquery"],
+ default="json",
+ nargs="?",
+ )
+ parser.add_argument("--output_dir", type=str, help="Output directory")
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ help="Number of parallel works for objcopy. `None` for maximum available.",
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--llvm_objcopy_path",
+ type=str,
+ help="Path to llvm-objcopy",
+ default="llvm-objcopy",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--obj_base_dir",
+ type=str,
+ help="Base directory for object files. Defaults to current working dir.",
+ default="",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--cmd_filter",
+ type=str,
+ help="Include only those modules with a command line matching this regular "
+ "expression. Set it to None to not perform any filtering. Note that the "
+ "regular expression is applied independently for each separate command line "
+ "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+ "with thinlto_build=lld.",
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--thinlto_build",
+ type=str,
+ help="Set if the build was performed with either 'distributed' or 'local' "
+ "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+ "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+ "the distributed case or -Wl,--save-temps=import and "
+ "-Wl,--thinlto-emit-index-files passed in the local case",
+ choices=["distributed", "local"],
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--cmd_section_name",
+ type=str,
+ help="The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmcmd is correct. For Mach-O object files, one should use "
+ "something like __LLVM,__cmdline",
+ default=".llvmcmd",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--bitcode_section_name",
+ type=str,
+ help="The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmbc is correct. For Mach-O object files, one should use "
+ "__LLVM,__bitcode",
+ default=".llvmbc",
+ nargs="?",
+ )
+ flags.add_verbosity_arguments(parser)
+ args = parser.parse_args()
+ main(args)
+
+
+def main(args):
+ logging.basicConfig(level=args.verbosity)
+
+ objs = []
+ if args.input is not None and args.thinlto_build == "local":
+ raise ValueError("--thinlto_build=local cannot be run with --input")
+ if args.input is None:
+ if args.thinlto_build != "local":
+ raise ValueError("--input or --thinlto_build=local must be provided")
+ objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+ elif args.input_type == "json":
+ with open(args.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_from_compile_commands(
+ json.load(f), args.output_dir
+ )
+ elif args.input_type == "params":
+ if not args.obj_base_dir:
+ logging.info(
+ "-obj_base_dir is unspecified, assuming current directory. "
+ "If no objects are found, use this option to specify the root "
+ "directory for the object file paths in the input file."
+ )
+ with open(args.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_from_lld_params(
+ [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
+ )
+ elif args.input_type == "directory":
+ logging.warning(
+ "Using the directory input is only recommended if the build system "
+ "your project uses does not support any structured output that "
+ "ml-compiler-opt understands. If your build system provides a "
+ "structured compilation database, use that instead"
+ )
+ objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+ elif args.input_type == "bazel_aquery":
+ with open(args.input, encoding="utf-8") as aquery_json_handle:
+ objs = extract_ir_lib.load_bazel_aquery(
+ json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+ )
+ else:
+ logging.error("Unknown input type: %s", args.input_type)
+
+ relative_output_paths = extract_ir_lib.run_extraction(
+ objs,
+ args.num_workers,
+ args.llvm_objcopy_path,
+ args.cmd_filter,
+ args.thinlto_build,
+ args.cmd_section_name,
+ args.bitcode_section_name,
+ )
+
+ extract_ir_lib.write_corpus_manifest(
+ args.thinlto_build, relative_output_paths, args.output_dir
+ )
+
+ logging.info(
+ "Converted %d files out of %d",
+ len(objs) - relative_output_paths.count(None),
+ len(objs),
+ )
+
+
+if __name__ == "__main__":
+ parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
new file mode 100644
index 0000000000000..221486e16c6e0
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -0,0 +1,53 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+ --input_dir=<path to input directory> \
+ --output_dir=<path to output directory> \
+ --default_args="<list of space separated flags>"
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import make_corpus_lib
+
+
+def parse_args_and_run():
+ parser = argparse.ArgumentParser(
+ description="A tool for making a corpus from arbitrary bitcode"
+ )
+ parser.add_argument("--input_dir", type=str, help="The input directory.")
+ parser.add_argument("--output_dir", type=str, help="The output directory.")
+ parser.add_argument(
+ "--default_args",
+ type=str,
+ help="The compiler flags to compile with when using downstream tooling.",
+ default="",
+ nargs="?",
+ )
+ args = parser.parse_args()
+ main(args)
+
+
+def main(args):
+ logging.warning(
+ "Using this tool does not guarantee that the bitcode is taken at "
+ "the correct stage for consumption during model training. Make "
+ "sure to validate assumptions about where the bitcode is coming "
+ "from before using it in production."
+ )
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+ make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
+ make_corpus_lib.write_corpus_manifest(
+ relative_paths, args.output_dir, args.default_args.split()
+ )
+
+
+if __name__ == "__main__":
+ parse_args_and_run()
>From d51bd453fe228504f5cf06db2836798860399880 Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince at fb.com>
Date: Fri, 4 Jul 2025 11:20:34 -0700
Subject: [PATCH 3/6] format
---
llvm/utils/mlgo-utils/combine_training_corpus.py | 2 +-
llvm/utils/mlgo-utils/extract_ir.py | 2 +-
llvm/utils/mlgo-utils/make_corpus.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
index 7a1d870ad7e38..563801091f2d2 100755
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -3,7 +3,7 @@
import re
import sys
from mlgo.corpus.combine_training_corpus import parse_args_and_run
+
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(parse_args_and_run())
-
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
index 589a5c50af726..1ed7d2a13f43b 100755
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -3,7 +3,7 @@
import re
import sys
from mlgo.corpus.extract_ir import parse_args_and_run
+
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(parse_args_and_run())
-
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
index 5b4a9bef486ff..3e1a4fcca8cb6 100755
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -3,7 +3,7 @@
import re
import sys
from mlgo.corpus.make_corpus import parse_args_and_run
+
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(parse_args_and_run())
-
>From 370f7ac40c13a84bafea103fd6b92357078e9c49 Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince at fb.com>
Date: Mon, 7 Jul 2025 15:16:12 -0700
Subject: [PATCH 4/6] Use double quotes
---
llvm/utils/mlgo-utils/combine_training_corpus.py | 4 ++--
llvm/utils/mlgo-utils/extract_ir.py | 4 ++--
llvm/utils/mlgo-utils/make_corpus.py | 4 ++--
3 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
index 563801091f2d2..b8c247ecb181c 100755
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -4,6 +4,6 @@
import sys
from mlgo.corpus.combine_training_corpus import parse_args_and_run
-if __name__ == '__main__':
- sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+if __name__ == "__main__":
+ sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
sys.exit(parse_args_and_run())
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
index 1ed7d2a13f43b..85f05b9a72ce8 100755
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -4,6 +4,6 @@
import sys
from mlgo.corpus.extract_ir import parse_args_and_run
-if __name__ == '__main__':
- sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+if __name__ == "__main__":
+ sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
sys.exit(parse_args_and_run())
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
index 3e1a4fcca8cb6..725ac7f3461a0 100755
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -4,6 +4,6 @@
import sys
from mlgo.corpus.make_corpus import parse_args_and_run
-if __name__ == '__main__':
- sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
+if __name__ == "__main__":
+ sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
sys.exit(parse_args_and_run())
>From 7479f5a3bbd1569270742c32f79b45528033a7ca Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince at fb.com>
Date: Wed, 9 Jul 2025 02:03:37 -0700
Subject: [PATCH 5/6] Use symlinks
---
.../mlgo-utils/combine_training_corpus.py | 55 +++++-
llvm/utils/mlgo-utils/extract_ir.py | 187 +++++++++++++++++-
llvm/utils/mlgo-utils/make_corpus.py | 59 +++++-
.../mlgo/corpus/combine_training_corpus.py | 53 +----
.../mlgo-utils/mlgo/corpus/extract_ir.py | 185 +----------------
.../mlgo-utils/mlgo/corpus/make_corpus.py | 54 +----
6 files changed, 287 insertions(+), 306 deletions(-)
mode change 100755 => 100644 llvm/utils/mlgo-utils/combine_training_corpus.py
mode change 100755 => 100644 llvm/utils/mlgo-utils/extract_ir.py
mode change 100755 => 100644 llvm/utils/mlgo-utils/make_corpus.py
mode change 100644 => 120000 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
mode change 100644 => 120000 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
mode change 100644 => 120000 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
diff --git a/llvm/utils/mlgo-utils/combine_training_corpus.py b/llvm/utils/mlgo-utils/combine_training_corpus.py
old mode 100755
new mode 100644
index b8c247ecb181c..9884d6696a43f
--- a/llvm/utils/mlgo-utils/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/combine_training_corpus.py
@@ -1,9 +1,52 @@
-#!/usr/bin/env python3
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpus into a single training corpus.
+
+Currently only support the case that multiple corpus share the same
+configurables except the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+ --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+import argparse
+import logging
+
+from mlgo.corpus import combine_training_corpus_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+ parser = argparse.ArgumentParser(
+ description="A tool for combining multiple training corpora"
+ )
+ parser.add_argument(
+ "--root_dir", type=str, help="The root dir of module paths to combine."
+ )
+ flags.add_verbosity_arguments(parser)
+ args = parser.parse_args()
+ main(args)
+
+
+def main(args):
+ logging.basicConfig(level=args.verbosity)
+
+ combine_training_corpus_lib.combine_corpus(args.root_dir)
-import re
-import sys
-from mlgo.corpus.combine_training_corpus import parse_args_and_run
if __name__ == "__main__":
- sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
- sys.exit(parse_args_and_run())
+ parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/extract_ir.py b/llvm/utils/mlgo-utils/extract_ir.py
old mode 100755
new mode 100644
index 85f05b9a72ce8..3101cef196b4a
--- a/llvm/utils/mlgo-utils/extract_ir.py
+++ b/llvm/utils/mlgo-utils/extract_ir.py
@@ -1,9 +1,184 @@
-#!/usr/bin/env python3
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, set the --verbosity flag to the desired level.
+Setting it to a specific level will enable all messages at that level and
+higher. Exact values can be found by invoking the script with --help.
+"""
+
+import argparse
+import json
+import logging
+
+from mlgo.corpus import extract_ir_lib
+from mlgo.corpus import flags
+
+
+def parse_args_and_run():
+ parser = argparse.ArgumentParser(
+ description="A tool for making a corpus from build artifacts"
+ )
+ parser.add_argument(
+ "--input",
+ type=str,
+ help="Input file or directory - either compile_commands.json, a linker "
+ "parameter list, or a path to a directory containing object files.",
+ )
+ parser.add_argument(
+ "--input_type",
+ type=str,
+ help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+ choices=["json", "params", "directory", "bazel_aquery"],
+ default="json",
+ nargs="?",
+ )
+ parser.add_argument("--output_dir", type=str, help="Output directory")
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ help="Number of parallel works for objcopy. `None` for maximum available.",
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--llvm_objcopy_path",
+ type=str,
+ help="Path to llvm-objcopy",
+ default="llvm-objcopy",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--obj_base_dir",
+ type=str,
+ help="Base directory for object files. Defaults to current working dir.",
+ default="",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--cmd_filter",
+ type=str,
+ help="Include only those modules with a command line matching this regular "
+ "expression. Set it to None to not perform any filtering. Note that the "
+ "regular expression is applied independently for each separate command line "
+ "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+ "with thinlto_build=lld.",
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--thinlto_build",
+ type=str,
+ help="Set if the build was performed with either 'distributed' or 'local' "
+ "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+ "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+ "the distributed case or -Wl,--save-temps=import and "
+ "-Wl,--thinlto-emit-index-files passed in the local case",
+ choices=["distributed", "local"],
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--cmd_section_name",
+ type=str,
+ help="The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmcmd is correct. For Mach-O object files, one should use "
+ "something like __LLVM,__cmdline",
+ default=".llvmcmd",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--bitcode_section_name",
+ type=str,
+ help="The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmbc is correct. For Mach-O object files, one should use "
+ "__LLVM,__bitcode",
+ default=".llvmbc",
+ nargs="?",
+ )
+ flags.add_verbosity_arguments(parser)
+ args = parser.parse_args()
+ main(args)
+
+
+def main(args):
+ logging.basicConfig(level=args.verbosity)
+
+ objs = []
+ if args.input is not None and args.thinlto_build == "local":
+ raise ValueError("--thinlto_build=local cannot be run with --input")
+ if args.input is None:
+ if args.thinlto_build != "local":
+ raise ValueError("--input or --thinlto_build=local must be provided")
+ objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+ elif args.input_type == "json":
+ with open(args.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_from_compile_commands(
+ json.load(f), args.output_dir
+ )
+ elif args.input_type == "params":
+ if not args.obj_base_dir:
+ logging.info(
+ "-obj_base_dir is unspecified, assuming current directory. "
+ "If no objects are found, use this option to specify the root "
+ "directory for the object file paths in the input file."
+ )
+ with open(args.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_from_lld_params(
+ [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
+ )
+ elif args.input_type == "directory":
+ logging.warning(
+ "Using the directory input is only recommended if the build system "
+ "your project uses does not support any structured output that "
+ "ml-compiler-opt understands. If your build system provides a "
+ "structured compilation database, use that instead"
+ )
+ objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+ elif args.input_type == "bazel_aquery":
+ with open(args.input, encoding="utf-8") as aquery_json_handle:
+ objs = extract_ir_lib.load_bazel_aquery(
+ json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+ )
+ else:
+ logging.error("Unknown input type: %s", args.input_type)
+
+ relative_output_paths = extract_ir_lib.run_extraction(
+ objs,
+ args.num_workers,
+ args.llvm_objcopy_path,
+ args.cmd_filter,
+ args.thinlto_build,
+ args.cmd_section_name,
+ args.bitcode_section_name,
+ )
+
+ extract_ir_lib.write_corpus_manifest(
+ args.thinlto_build, relative_output_paths, args.output_dir
+ )
+
+ logging.info(
+ "Converted %d files out of %d",
+ len(objs) - relative_output_paths.count(None),
+ len(objs),
+ )
-import re
-import sys
-from mlgo.corpus.extract_ir import parse_args_and_run
if __name__ == "__main__":
- sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
- sys.exit(parse_args_and_run())
+ parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
old mode 100755
new mode 100644
index 725ac7f3461a0..92aab4d969d4d
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -1,9 +1,58 @@
-#!/usr/bin/env python3
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tool for making a corpus from arbitrary bitcode.
-import re
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+ --input_dir=<path to input directory> \
+ --output_dir=<path to output directory> \
+ --default_args="<list of space separated flags>"
+"""
+
+import argparse
+import logging
import sys
-from mlgo.corpus.make_corpus import parse_args_and_run
+import pathlib
+print(pathlib.Path(__file__).parent.parent.parent)
+
+sys.path.insert(0, pathlib.Path(__file__).parent.parent.parent)
+
+from mlgo.corpus import make_corpus_lib
+
+
+def parse_args_and_run():
+ parser = argparse.ArgumentParser(
+ description="A tool for making a corpus from arbitrary bitcode"
+ )
+ parser.add_argument("--input_dir", type=str, help="The input directory.")
+ parser.add_argument("--output_dir", type=str, help="The output directory.")
+ parser.add_argument(
+ "--default_args",
+ type=str,
+ help="The compiler flags to compile with when using downstream tooling.",
+ default="",
+ nargs="?",
+ )
+ args = parser.parse_args()
+ main(args)
+
+
+def main(args):
+ logging.warning(
+ "Using this tool does not guarantee that the bitcode is taken at "
+ "the correct stage for consumption during model training. Make "
+ "sure to validate assumptions about where the bitcode is coming "
+ "from before using it in production."
+ )
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+ make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
+ make_corpus_lib.write_corpus_manifest(
+ relative_paths, args.output_dir, args.default_args.split()
+ )
+
if __name__ == "__main__":
- sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0])
- sys.exit(parse_args_and_run())
+ parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
deleted file mode 100644
index 9884d6696a43f..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-r"""Combine multiple training corpus into a single training corpus.
-
-Currently only support the case that multiple corpus share the same
-configurables except the "modules" field.
-
-Usage: we'd like to combine training corpus corpus1 and corpus2 into
-combinedcorpus; we first structure the files as follows:
-
-combinedcorpus
-combinedcorpus/corpus1
-combinedcorpus/corpus2
-
-Running this script with
-
-python3 \
-compiler_opt/tools/combine_training_corpus.py \
- --root_dir=$PATH_TO_combinedcorpus
-
-generates combinedcorpus/corpus_description.json file. In this way corpus1
-and corpus2 are combined into combinedcorpus.
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import combine_training_corpus_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
- parser = argparse.ArgumentParser(
- description="A tool for combining multiple training corpora"
- )
- parser.add_argument(
- "--root_dir", type=str, help="The root dir of module paths to combine."
- )
- flags.add_verbosity_arguments(parser)
- args = parser.parse_args()
- main(args)
-
-
-def main(args):
- logging.basicConfig(level=args.verbosity)
-
- combine_training_corpus_lib.combine_corpus(args.root_dir)
-
-
-if __name__ == "__main__":
- parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
new file mode 120000
index 0000000000000..5a6885a6d1fa2
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1 @@
+../../combine_training_corpus.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
deleted file mode 100644
index 3101cef196b4a..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Extract IR for training.
-
-Extract IR for training, either from a compile_commands.json file produced by
-cmake, or a linker parameter list file.
-
-Only run with
-'python compiler_opt/tools/extract_ir.py ...'
-
-The compilation is assumed to have been performed with clang, using
--fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
-
-In a distributed ThinLTO case, the compilation is assumed to have been performed
-specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
-
-In a local ThinLTO case, the compilation is assumedto have been performed
-specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
-
-To change the logging verbosity, set the --verbosity flag to the desired level.
-Setting it to a specific level will enable all messages at that level and
-higher. Exact values can be found by invoking the script with --help.
-"""
-
-import argparse
-import json
-import logging
-
-from mlgo.corpus import extract_ir_lib
-from mlgo.corpus import flags
-
-
-def parse_args_and_run():
- parser = argparse.ArgumentParser(
- description="A tool for making a corpus from build artifacts"
- )
- parser.add_argument(
- "--input",
- type=str,
- help="Input file or directory - either compile_commands.json, a linker "
- "parameter list, or a path to a directory containing object files.",
- )
- parser.add_argument(
- "--input_type",
- type=str,
- help="Input file type - JSON, LLD params, directory, or bazel aquery.",
- choices=["json", "params", "directory", "bazel_aquery"],
- default="json",
- nargs="?",
- )
- parser.add_argument("--output_dir", type=str, help="Output directory")
- parser.add_argument(
- "--num_workers",
- type=int,
- help="Number of parallel works for objcopy. `None` for maximum available.",
- default=None,
- nargs="?",
- )
- parser.add_argument(
- "--llvm_objcopy_path",
- type=str,
- help="Path to llvm-objcopy",
- default="llvm-objcopy",
- nargs="?",
- )
- parser.add_argument(
- "--obj_base_dir",
- type=str,
- help="Base directory for object files. Defaults to current working dir.",
- default="",
- nargs="?",
- )
- parser.add_argument(
- "--cmd_filter",
- type=str,
- help="Include only those modules with a command line matching this regular "
- "expression. Set it to None to not perform any filtering. Note that the "
- "regular expression is applied independently for each separate command line "
- "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
- "with thinlto_build=lld.",
- default=None,
- nargs="?",
- )
- parser.add_argument(
- "--thinlto_build",
- type=str,
- help="Set if the build was performed with either 'distributed' or 'local' "
- "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
- "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
- "the distributed case or -Wl,--save-temps=import and "
- "-Wl,--thinlto-emit-index-files passed in the local case",
- choices=["distributed", "local"],
- default=None,
- nargs="?",
- )
- parser.add_argument(
- "--cmd_section_name",
- type=str,
- help="The section name passed to llvm-objcopy. For ELF object files, the "
- "default .llvmcmd is correct. For Mach-O object files, one should use "
- "something like __LLVM,__cmdline",
- default=".llvmcmd",
- nargs="?",
- )
- parser.add_argument(
- "--bitcode_section_name",
- type=str,
- help="The section name passed to llvm-objcopy. For ELF object files, the "
- "default .llvmbc is correct. For Mach-O object files, one should use "
- "__LLVM,__bitcode",
- default=".llvmbc",
- nargs="?",
- )
- flags.add_verbosity_arguments(parser)
- args = parser.parse_args()
- main(args)
-
-
-def main(args):
- logging.basicConfig(level=args.verbosity)
-
- objs = []
- if args.input is not None and args.thinlto_build == "local":
- raise ValueError("--thinlto_build=local cannot be run with --input")
- if args.input is None:
- if args.thinlto_build != "local":
- raise ValueError("--input or --thinlto_build=local must be provided")
- objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
- elif args.input_type == "json":
- with open(args.input, encoding="utf-8") as f:
- objs = extract_ir_lib.load_from_compile_commands(
- json.load(f), args.output_dir
- )
- elif args.input_type == "params":
- if not args.obj_base_dir:
- logging.info(
- "-obj_base_dir is unspecified, assuming current directory. "
- "If no objects are found, use this option to specify the root "
- "directory for the object file paths in the input file."
- )
- with open(args.input, encoding="utf-8") as f:
- objs = extract_ir_lib.load_from_lld_params(
- [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
- )
- elif args.input_type == "directory":
- logging.warning(
- "Using the directory input is only recommended if the build system "
- "your project uses does not support any structured output that "
- "ml-compiler-opt understands. If your build system provides a "
- "structured compilation database, use that instead"
- )
- objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
- elif args.input_type == "bazel_aquery":
- with open(args.input, encoding="utf-8") as aquery_json_handle:
- objs = extract_ir_lib.load_bazel_aquery(
- json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
- )
- else:
- logging.error("Unknown input type: %s", args.input_type)
-
- relative_output_paths = extract_ir_lib.run_extraction(
- objs,
- args.num_workers,
- args.llvm_objcopy_path,
- args.cmd_filter,
- args.thinlto_build,
- args.cmd_section_name,
- args.bitcode_section_name,
- )
-
- extract_ir_lib.write_corpus_manifest(
- args.thinlto_build, relative_output_paths, args.output_dir
- )
-
- logging.info(
- "Converted %d files out of %d",
- len(objs) - relative_output_paths.count(None),
- len(objs),
- )
-
-
-if __name__ == "__main__":
- parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
new file mode 120000
index 0000000000000..ce3baa062b3e1
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1 @@
+../../extract_ir.py
\ No newline at end of file
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
deleted file mode 100644
index 221486e16c6e0..0000000000000
--- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Tool for making a corpus from arbitrary bitcode.
-
-To create a corpus from a set of bitcode files in an input directory, run
-the following command:
-
-PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
- --input_dir=<path to input directory> \
- --output_dir=<path to output directory> \
- --default_args="<list of space separated flags>"
-"""
-
-import argparse
-import logging
-
-from mlgo.corpus import make_corpus_lib
-
-
-def parse_args_and_run():
- parser = argparse.ArgumentParser(
- description="A tool for making a corpus from arbitrary bitcode"
- )
- parser.add_argument("--input_dir", type=str, help="The input directory.")
- parser.add_argument("--output_dir", type=str, help="The output directory.")
- parser.add_argument(
- "--default_args",
- type=str,
- help="The compiler flags to compile with when using downstream tooling.",
- default="",
- nargs="?",
- )
- args = parser.parse_args()
- main(args)
-
-
-def main(args):
- logging.warning(
- "Using this tool does not guarantee that the bitcode is taken at "
- "the correct stage for consumption during model training. Make "
- "sure to validate assumptions about where the bitcode is coming "
- "from before using it in production."
- )
- relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
- make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
- make_corpus_lib.write_corpus_manifest(
- relative_paths, args.output_dir, args.default_args.split()
- )
-
-
-if __name__ == "__main__":
- parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
new file mode 120000
index 0000000000000..7ea4447a76efc
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -0,0 +1 @@
+../../make_corpus.py
\ No newline at end of file
>From 9ea9dce4d154712b728ef874cd9aa915605e10ae Mon Sep 17 00:00:00 2001
From: Vincent Lee <leevince at fb.com>
Date: Wed, 9 Jul 2025 02:11:04 -0700
Subject: [PATCH 6/6] Remove testing code
---
llvm/utils/mlgo-utils/make_corpus.py | 5 -----
1 file changed, 5 deletions(-)
diff --git a/llvm/utils/mlgo-utils/make_corpus.py b/llvm/utils/mlgo-utils/make_corpus.py
index 92aab4d969d4d..221486e16c6e0 100644
--- a/llvm/utils/mlgo-utils/make_corpus.py
+++ b/llvm/utils/mlgo-utils/make_corpus.py
@@ -14,11 +14,6 @@
import argparse
import logging
-import sys
-import pathlib
-print(pathlib.Path(__file__).parent.parent.parent)
-
-sys.path.insert(0, pathlib.Path(__file__).parent.parent.parent)
from mlgo.corpus import make_corpus_lib
More information about the llvm-commits
mailing list