[llvm-branch-commits] [llvm] [MLGO] Remove absl dependency from scripts (PR #78880)

Aiden Grossman via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Sat Jan 20 22:10:54 PST 2024


https://github.com/boomanaiden154 created https://github.com/llvm/llvm-project/pull/78880

This patch removes the absl dependency from the mlgo-utils scripts. We were only using absl.logging and absl.flags, so this patch just consists of mechanically converting the absl flags parsing to Python's builtin argparse, as Python's logging is a drop-in replacement for absl.logging.

>From 9dfc0ac712315f232e69496b7f8c0aed20421626 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 06:08:28 +0000
Subject: [PATCH] [MLGO] Remove absl dependency from scripts

This patch removes the absl dependency from the mlgo-utils scripts. We
were only using absl.logging and absl.flags, so this patch just
consists of mechanically converting the absl flags parsing to Python's
builtin argparse, as Python's logging is a drop-in replacement for
absl.logging.
---
 .../mlgo/corpus/combine_training_corpus.py    |  27 +--
 .../mlgo-utils/mlgo/corpus/extract_ir.py      | 204 +++++++++---------
 .../mlgo-utils/mlgo/corpus/make_corpus.py     |  45 ++--
 3 files changed, 137 insertions(+), 139 deletions(-)

diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
index 9aabd87b4688e0..cc21061cbbef5e 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -23,26 +23,21 @@
 and corpus2 are combined into combinedcorpus.
 """
 
-from absl import app
-from absl import flags
+import argparse
 
 from mlgo.corpus import combine_training_corpus_lib
 
-flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
 
-FLAGS = flags.FLAGS
-
-
-def main(argv):
-    if len(argv) > 1:
-        raise app.UsageError("Too many command-line arguments.")
-
-    combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
-
-
-def entrypoint():
-    app.run(main)
+def main(args):
+    combine_training_corpus_lib.combine_corpus(args.root_dir)
 
 
 if __name__ == "__main__":
-    entrypoint()
+    parser = argparse.ArgumentParser(
+        description="A tool for combining multiple training corpora"
+    )
+    parser.add_argument(
+        "--root_dir", type=str, help="The root dir of module paths to combine."
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
index 9463e61dc534fe..4426463e22b0e7 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -26,127 +26,59 @@
 
 import json
 import multiprocessing
-
-from absl import app
-from absl import flags
-from absl import logging
+import logging
+import argparse
 
 from mlgo.corpus import extract_ir_lib
 
-flags.DEFINE_string(
-    "input",
-    None,
-    "Input file or directory - either compile_commands.json, a linker parameter"
-    "list, or a path to a directory containing object files.",
-)
-flags.DEFINE_enum(
-    "input_type",
-    "json",
-    ["json", "params", "directory"],
-    "Input file type - json, params, or directory. params latter refers to lld"
-    "params.",
-)
-flags.DEFINE_string("output_dir", None, "Output directory")
-flags.DEFINE_integer(
-    "num_workers",
-    None,
-    "Number of parallel workers for objcopy. `None` for maximum available.",
-)
-flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
-flags.DEFINE_string(
-    "obj_base_dir",
-    "",
-    "Base directory for object files. Defaults to current working dir.",
-)
-flags.DEFINE_string(
-    "cmd_filter",
-    None,
-    "Include only those modules with a command line matching this regexp. "
-    "Setting it to None for not filtering. Note that the regexp is applied "
-    "independently for each separate command line option. For example, ^-Oz$ "
-    "will match Oz - built binaries. Does not work with thinlto_build=lld.",
-)
-flags.DEFINE_enum(
-    "thinlto_build",
-    None,
-    ["distributed", "local"],
-    "Set if the build was performed with either 'distributed' or "
-    "'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
-    "The build is assumed to have had "
-    "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
-    "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
-    "passed in the local case.",
-)
-flags.DEFINE_string(
-    "cmd_section_name",
-    ".llvmcmd",
-    "The section name passed to llvm-objcopy. For ELF object files, the "
-    "default .llvmcmd is correct. For Mach-O object files, one should use "
-    "something like __LLVM,__cmdline",
-)
-flags.DEFINE_string(
-    "bitcode_section_name",
-    ".llvmbc",
-    "The section name passed to llvm-objcopy. For ELF object files, the "
-    "default .llvmbc is correct. For Mach-O object files, one should use "
-    "__LLVM,__bitcode",
-)
-
-flags.mark_flag_as_required("output_dir")
-
-FLAGS = flags.FLAGS
-
-
-def main(argv):
-    if len(argv) > 1:
-        raise app.UsageError("Too many command-line arguments.")
 
+def main(args):
     objs = []
-    if FLAGS.input is not None and FLAGS.thinlto_build == "local":
+    if args.input is not None and args.thinlto_build == "local":
         raise ValueError("--thinlto_build=local cannot be run with --input")
-    if FLAGS.input is None:
-        if FLAGS.thinlto_build != "local":
+    if args.input is None:
+        if args.thinlto_build != "local":
             raise ValueError("--input or --thinlto_build=local must be provided")
-        objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
-    elif FLAGS.input_type == "json":
-        with open(FLAGS.input, encoding="utf-8") as f:
+        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+    elif args.input_type == "json":
+        with open(args.input, encoding="utf-8") as f:
             objs = extract_ir_lib.load_from_compile_commands(
-                json.load(f), FLAGS.output_dir
+                json.load(f), args.output_dir
             )
-    elif FLAGS.input_type == "params":
-        if not FLAGS.obj_base_dir:
+    elif args.input_type == "params":
+        if not args.obj_base_dir:
             logging.info(
                 "-obj_base_dir is unspecified, assuming current directory."
                 "If no objects are found, use this option to specify the root"
                 "directory for the object file paths in the input file."
             )
-        with open(FLAGS.input, encoding="utf-8") as f:
+        with open(args.input, encoding="utf-8") as f:
             objs = extract_ir_lib.load_from_lld_params(
-                [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir
+                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
             )
-    elif FLAGS.input_type == "directory":
+    elif args.input_type == "directory":
         logging.warning(
             "Using the directory input is only recommended if the build system"
             "your project uses does not support any structured output that"
             "ml-compiler-opt understands. If your build system provides a"
             "structured compilation database, use that instead"
         )
-        objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
     else:
-        logging.error("Unknown input type: %s", FLAGS.input_type)
+        logging.error("Unknown input type: %s", args.input_type)
 
     relative_output_paths = extract_ir_lib.run_extraction(
         objs,
-        FLAGS.num_workers,
-        FLAGS.llvm_objcopy_path,
-        FLAGS.cmd_filter,
-        FLAGS.thinlto_build,
-        FLAGS.cmd_section_name,
-        FLAGS.bitcode_section_name,
+        args.num_workers,
+        args.llvm_objcopy_path,
+        args.cmd_filter,
+        args.thinlto_build,
+        args.cmd_section_name,
+        args.bitcode_section_name,
     )
 
     extract_ir_lib.write_corpus_manifest(
-        FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
+        args.thinlto_build, relative_output_paths, args.output_dir
     )
 
     logging.info(
@@ -156,10 +88,86 @@ def main(argv):
     )
 
 
-def entrypoint():
-    multiprocessing.set_start_method("fork")
-    app.run(main)
-
-
 if __name__ == "__main__":
-    entrypoint()
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from build artifacts"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="Input file or directory - either compile_commands.json, a linker "
+        "parameter list, or a path to a directory containing object files.",
+    )
+    parser.add_argument(
+        "--input_type",
+        type=str,
+        help="Input file type - JSON, LLD params, or directory.",
+        choices=["json", "params", "directory"],
+        default="json",
+        nargs="?",
+    )
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        help="Number of parallel works for objcopy. `None` for maximum available.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--llvm_objcopy_path",
+        type=str,
+        help="Path to llvm-objcopy",
+        default="llvm-objcopy",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--obj_base_dir",
+        type=str,
+        help="Base directory for object files. Defaults to current working dir.",
+        default="",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_filter",
+        type=str,
+        help="Include only those modules with a command line matching this regular "
+        "expression. Set it to None to not perform any filtering. Note that the "
+        "regular expression is applied independently for each separate command line "
+        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+        "with thinlto_build=lld.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--thinlto_build",
+        type=str,
+        help="Set if the build was performed with either 'distributed' or 'local' "
+        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+        "the distributed case or -Wl,--save-temps=import and "
+        "-Wl,--thinlto-emit-index-files passed in the local case",
+        choices=["distributed", "local"],
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmcmd is correct. For Mach-O object files, one should use "
+        "something like __LLVM,__cmdline",
+        default=".llvmcmd",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--bitcode_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmbc is correct. For Mach-O object files, one should use "
+        "__LLVM,__bitcode",
+        default=".llvmbc",
+        nargs="?",
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
index edb0ecd853de24..05ceb750de673e 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -12,43 +12,38 @@
   --default_args="<list of space separated flags>"
 """
 
-from absl import app
-from absl import flags
-from absl import logging
+import logging
+import argparse
 
 from mlgo.corpus import make_corpus_lib
 
-flags.DEFINE_string("input_dir", None, "The input directory.")
-flags.DEFINE_string("output_dir", None, "The output directory.")
-flags.DEFINE_string(
-    "default_args",
-    "",
-    "The compiler flags to compile with when using downstream tooling.",
-)
 
-flags.mark_flag_as_required("input_dir")
-flags.mark_flag_as_required("output_dir")
-
-FLAGS = flags.FLAGS
-
-
-def main(_):
+def main(args):
     logging.warning(
         "Using this tool does not guarantee that the bitcode is taken at "
         "the correct stage for consumption during model training. Make "
         "sure to validate assumptions about where the bitcode is coming "
         "from before using it in production."
     )
-    relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
-    make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir)
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
     make_corpus_lib.write_corpus_manifest(
-        relative_paths, FLAGS.output_dir, FLAGS.default_args.split()
+        relative_paths, args.output_dir, args.default_args.split()
     )
 
 
-def entrypoint():
-    app.run(main)
-
-
 if __name__ == "__main__":
-    entrypoint()
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from arbitrary bitcode"
+    )
+    parser.add_argument("--input_dir", type=str, help="The input directory.")
+    parser.add_argument("--output_dir", type=str, help="The output directory.")
+    parser.add_argument(
+        "--default_args",
+        type=str,
+        help="The compiler flags to compile with when using downstream tooling.",
+        default="",
+        nargs="?",
+    )
+    args = parser.parse_args()
+    main(args)



More information about the llvm-branch-commits mailing list