[llvm-branch-commits] [llvm] [MLGO] Remove absl dependency from scripts (PR #78880)
Aiden Grossman via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Jan 20 22:10:54 PST 2024
https://github.com/boomanaiden154 created https://github.com/llvm/llvm-project/pull/78880
This patch removes the absl dependency from the mlgo-utils scripts. We were only using absl.logging and absl.flags, so this patch just consists of mechanically converting the absl flags parsing to Python's built-in argparse; Python's logging is a drop-in replacement for absl.logging.
>From 9dfc0ac712315f232e69496b7f8c0aed20421626 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 06:08:28 +0000
Subject: [PATCH] [MLGO] Remove absl dependency from scripts
This patch removes the absl dependency from the mlgo-utils scripts. We
were only using absl.logging and absl.flags, so this patch just
consists of mechanically converting the absl flags parsing to Python's
built-in argparse; Python's logging is a drop-in replacement for
absl.logging.
---
.../mlgo/corpus/combine_training_corpus.py | 27 +--
.../mlgo-utils/mlgo/corpus/extract_ir.py | 204 +++++++++---------
.../mlgo-utils/mlgo/corpus/make_corpus.py | 45 ++--
3 files changed, 137 insertions(+), 139 deletions(-)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
index 9aabd87b4688e0..cc21061cbbef5e 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -23,26 +23,21 @@
and corpus2 are combined into combinedcorpus.
"""
-from absl import app
-from absl import flags
+import argparse
from mlgo.corpus import combine_training_corpus_lib
-flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
-FLAGS = flags.FLAGS
-
-
-def main(argv):
- if len(argv) > 1:
- raise app.UsageError("Too many command-line arguments.")
-
- combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
-
-
-def entrypoint():
- app.run(main)
+def main(args):
+ combine_training_corpus_lib.combine_corpus(args.root_dir)
if __name__ == "__main__":
- entrypoint()
+ parser = argparse.ArgumentParser(
+ description="A tool for combining multiple training corpora"
+ )
+ parser.add_argument(
+ "--root_dir", type=str, help="The root dir of module paths to combine."
+ )
+ args = parser.parse_args()
+ main(args)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
index 9463e61dc534fe..4426463e22b0e7 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -26,127 +26,59 @@
import json
import multiprocessing
-
-from absl import app
-from absl import flags
-from absl import logging
+import logging
+import argparse
from mlgo.corpus import extract_ir_lib
-flags.DEFINE_string(
- "input",
- None,
- "Input file or directory - either compile_commands.json, a linker parameter"
- "list, or a path to a directory containing object files.",
-)
-flags.DEFINE_enum(
- "input_type",
- "json",
- ["json", "params", "directory"],
- "Input file type - json, params, or directory. params latter refers to lld"
- "params.",
-)
-flags.DEFINE_string("output_dir", None, "Output directory")
-flags.DEFINE_integer(
- "num_workers",
- None,
- "Number of parallel workers for objcopy. `None` for maximum available.",
-)
-flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
-flags.DEFINE_string(
- "obj_base_dir",
- "",
- "Base directory for object files. Defaults to current working dir.",
-)
-flags.DEFINE_string(
- "cmd_filter",
- None,
- "Include only those modules with a command line matching this regexp. "
- "Setting it to None for not filtering. Note that the regexp is applied "
- "independently for each separate command line option. For example, ^-Oz$ "
- "will match Oz - built binaries. Does not work with thinlto_build=lld.",
-)
-flags.DEFINE_enum(
- "thinlto_build",
- None,
- ["distributed", "local"],
- "Set if the build was performed with either 'distributed' or "
- "'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
- "The build is assumed to have had "
- "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
- "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
- "passed in the local case.",
-)
-flags.DEFINE_string(
- "cmd_section_name",
- ".llvmcmd",
- "The section name passed to llvm-objcopy. For ELF object files, the "
- "default .llvmcmd is correct. For Mach-O object files, one should use "
- "something like __LLVM,__cmdline",
-)
-flags.DEFINE_string(
- "bitcode_section_name",
- ".llvmbc",
- "The section name passed to llvm-objcopy. For ELF object files, the "
- "default .llvmbc is correct. For Mach-O object files, one should use "
- "__LLVM,__bitcode",
-)
-
-flags.mark_flag_as_required("output_dir")
-
-FLAGS = flags.FLAGS
-
-
-def main(argv):
- if len(argv) > 1:
- raise app.UsageError("Too many command-line arguments.")
+def main(args):
objs = []
- if FLAGS.input is not None and FLAGS.thinlto_build == "local":
+ if args.input is not None and args.thinlto_build == "local":
raise ValueError("--thinlto_build=local cannot be run with --input")
- if FLAGS.input is None:
- if FLAGS.thinlto_build != "local":
+ if args.input is None:
+ if args.thinlto_build != "local":
raise ValueError("--input or --thinlto_build=local must be provided")
- objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
- elif FLAGS.input_type == "json":
- with open(FLAGS.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+ elif args.input_type == "json":
+ with open(args.input, encoding="utf-8") as f:
objs = extract_ir_lib.load_from_compile_commands(
- json.load(f), FLAGS.output_dir
+ json.load(f), args.output_dir
)
- elif FLAGS.input_type == "params":
- if not FLAGS.obj_base_dir:
+ elif args.input_type == "params":
+ if not args.obj_base_dir:
logging.info(
"-obj_base_dir is unspecified, assuming current directory."
"If no objects are found, use this option to specify the root"
"directory for the object file paths in the input file."
)
- with open(FLAGS.input, encoding="utf-8") as f:
+ with open(args.input, encoding="utf-8") as f:
objs = extract_ir_lib.load_from_lld_params(
- [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir
+ [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
)
- elif FLAGS.input_type == "directory":
+ elif args.input_type == "directory":
logging.warning(
"Using the directory input is only recommended if the build system"
"your project uses does not support any structured output that"
"ml-compiler-opt understands. If your build system provides a"
"structured compilation database, use that instead"
)
- objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+ objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
else:
- logging.error("Unknown input type: %s", FLAGS.input_type)
+ logging.error("Unknown input type: %s", args.input_type)
relative_output_paths = extract_ir_lib.run_extraction(
objs,
- FLAGS.num_workers,
- FLAGS.llvm_objcopy_path,
- FLAGS.cmd_filter,
- FLAGS.thinlto_build,
- FLAGS.cmd_section_name,
- FLAGS.bitcode_section_name,
+ args.num_workers,
+ args.llvm_objcopy_path,
+ args.cmd_filter,
+ args.thinlto_build,
+ args.cmd_section_name,
+ args.bitcode_section_name,
)
extract_ir_lib.write_corpus_manifest(
- FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
+ args.thinlto_build, relative_output_paths, args.output_dir
)
logging.info(
@@ -156,10 +88,86 @@ def main(argv):
)
-def entrypoint():
- multiprocessing.set_start_method("fork")
- app.run(main)
-
-
if __name__ == "__main__":
- entrypoint()
+ parser = argparse.ArgumentParser(
+ description="A tool for making a corpus from build artifacts"
+ )
+ parser.add_argument(
+ "--input",
+ type=str,
+ help="Input file or directory - either compile_commands.json, a linker "
+ "parameter list, or a path to a directory containing object files.",
+ )
+ parser.add_argument(
+ "--input_type",
+ type=str,
+ help="Input file type - JSON, LLD params, or directory.",
+ choices=["json", "params", "directory"],
+ default="json",
+ nargs="?",
+ )
+ parser.add_argument("--output_dir", type=str, help="Output directory")
+ parser.add_argument(
+ "--num_workers",
+ type=int,
+ help="Number of parallel works for objcopy. `None` for maximum available.",
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--llvm_objcopy_path",
+ type=str,
+ help="Path to llvm-objcopy",
+ default="llvm-objcopy",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--obj_base_dir",
+ type=str,
+ help="Base directory for object files. Defaults to current working dir.",
+ default="",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--cmd_filter",
+ type=str,
+ help="Include only those modules with a command line matching this regular "
+ "expression. Set it to None to not perform any filtering. Note that the "
+ "regular expression is applied independently for each separate command line "
+ "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+ "with thinlto_build=lld.",
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--thinlto_build",
+ type=str,
+ help="Set if the build was performed with either 'distributed' or 'local' "
+ "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+ "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+ "the distributed case or -Wl,--save-temps=import and "
+ "-Wl,--thinlto-emit-index-files passed in the local case",
+ choices=["distributed", "local"],
+ default=None,
+ nargs="?",
+ )
+ parser.add_argument(
+ "--cmd_section_name",
+ type=str,
+ help="The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmcmd is correct. For Mach-O object files, one should use "
+ "something like __LLVM,__cmdline",
+ default=".llvmcmd",
+ nargs="?",
+ )
+ parser.add_argument(
+ "--bitcode_section_name",
+ type=str,
+ help="The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmbc is correct. For Mach-O object files, one should use "
+ "__LLVM,__bitcode",
+ default=".llvmbc",
+ nargs="?",
+ )
+ args = parser.parse_args()
+ main(args)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
index edb0ecd853de24..05ceb750de673e 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -12,43 +12,38 @@
--default_args="<list of space separated flags>"
"""
-from absl import app
-from absl import flags
-from absl import logging
+import logging
+import argparse
from mlgo.corpus import make_corpus_lib
-flags.DEFINE_string("input_dir", None, "The input directory.")
-flags.DEFINE_string("output_dir", None, "The output directory.")
-flags.DEFINE_string(
- "default_args",
- "",
- "The compiler flags to compile with when using downstream tooling.",
-)
-flags.mark_flag_as_required("input_dir")
-flags.mark_flag_as_required("output_dir")
-
-FLAGS = flags.FLAGS
-
-
-def main(_):
+def main(args):
logging.warning(
"Using this tool does not guarantee that the bitcode is taken at "
"the correct stage for consumption during model training. Make "
"sure to validate assumptions about where the bitcode is coming "
"from before using it in production."
)
- relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
- make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir)
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+ make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
make_corpus_lib.write_corpus_manifest(
- relative_paths, FLAGS.output_dir, FLAGS.default_args.split()
+ relative_paths, args.output_dir, args.default_args.split()
)
-def entrypoint():
- app.run(main)
-
-
if __name__ == "__main__":
- entrypoint()
+ parser = argparse.ArgumentParser(
+ description="A tool for making a corpus from arbitrary bitcode"
+ )
+ parser.add_argument("--input_dir", type=str, help="The input directory.")
+ parser.add_argument("--output_dir", type=str, help="The output directory.")
+ parser.add_argument(
+ "--default_args",
+ type=str,
+ help="The compiler flags to compile with when using downstream tooling.",
+ default="",
+ nargs="?",
+ )
+ args = parser.parse_args()
+ main(args)
More information about the llvm-branch-commits
mailing list