[clang] [clang-tools-extra] [mlir] [libc] [libcxx] [llvm] [lldb] [MLGO] Remove absl dependency from scripts (PR #78880)

Aiden Grossman via cfe-commits cfe-commits at lists.llvm.org
Sun Jan 21 15:04:22 PST 2024


https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/78880

>From 80c9507d7f49ddbc5f2554f597950f797355c255 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 03:53:03 +0000
Subject: [PATCH 1/7] Add make_corpus script test

---
 .../tests/corpus/make_corpus_script.test      | 22 +++++++++++++++++++
 llvm/utils/mlgo-utils/tests/lit.cfg           |  7 +++++-
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test

diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test
new file mode 100644
index 000000000000000..f4f97544bce47d3
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test
@@ -0,0 +1,22 @@
+## Testing that the make_corpus script works as expected when invoked.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: touch %t.dir/test1.bc
+# RUN: touch %t.dir/test2.bc
+# RUN: rm -rf %t.out.dir && mkdir %t.out.dir
+
+# RUN: %python %scripts_dir/corpus/make_corpus.py --input_dir=%t.dir --output_dir=%t.out.dir --default_args="-test"
+
+# RUN: cat %t.out.dir/corpus_description.json | FileCheck %s
+
+## Check that we get the expected command in the global command override
+# CHECK: "-test"
+# CHECK: "has_thinlto": false
+## Check that the modules are in the corpus description
+# CHECK: "test1"
+# CHECK: "test2"
+
+# RUN: ls %t.out.dir | FileCheck %s --check-prefix CHECK-DIR
+
+# CHECK-DIR: test1.bc
+# CHECK-DIR: test2.bc
diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg
index 055f0945942fc1c..9afced53f195c5f 100644
--- a/llvm/utils/mlgo-utils/tests/lit.cfg
+++ b/llvm/utils/mlgo-utils/tests/lit.cfg
@@ -1,3 +1,5 @@
+import os
+
 import lit.formats
 
 from lit.llvm import llvm_config
@@ -5,7 +7,7 @@ from lit.llvm import llvm_config
 config.name = "mlgo-utils"
 config.test_format = lit.formats.ShTest(execute_external=False)
 
-config.suffixes = [".py"]
+config.suffixes = [".py", ".test"]
 
 config.test_source_root = os.path.dirname(__file__)
 config.test_exec_root = config.obj_root
@@ -13,3 +15,6 @@ config.test_exec_root = config.obj_root
 config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-utils")
 
 llvm_config.use_default_substitutions()
+
+scripts_dir = os.path.join(config.src_root, "utils/mlgo-utils/mlgo")
+config.substitutions.append(("%scripts_dir", scripts_dir))

>From d99f5d4cd2c7c6d9e70125e893dc2ae40c897d36 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 04:58:42 +0000
Subject: [PATCH 2/7] Add another test

---
 llvm/utils/mlgo-utils/CMakeLists.txt          |  2 +-
 .../combine_training_corpus_script.test       | 29 +++++++++++++++++++
 llvm/utils/mlgo-utils/tests/lit.cfg           |  1 +
 3 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100644 llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test

diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt
index 7b303c7639401ae..c263c92c632797e 100644
--- a/llvm/utils/mlgo-utils/CMakeLists.txt
+++ b/llvm/utils/mlgo-utils/CMakeLists.txt
@@ -5,7 +5,7 @@ configure_lit_site_cfg(
 
 add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
   ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS "FileCheck" "not" "count"
+  DEPENDS "FileCheck" "not" "count" "split-file"
 )
 
 set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests")
diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test
new file mode 100644
index 000000000000000..1aa182146a49ee4
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test
@@ -0,0 +1,29 @@
+## Testing that the combine_trainig_corpus script works as expected when
+## invoked.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: split-file %s %t.dir
+# RUN: %python %scripts_dir/corpus/combine_training_corpus.py --root_dir=%t.dir
+# RUN: cat %t.dir/corpus_description.json | FileCheck %s
+
+## Check that we end up with the same properties as the original corpora
+# CHECK: "has_thinlto": false
+
+## Check that the modules end up in the combined corpus. Order does not matter.
+# CHECK-DAG: "subcorpus1/test1.o"
+# CHECK-DAG: "subcorpus2/test2.o"
+
+#--- subcorpus1/corpus_description.json
+{
+  "has_thinlto": false,
+  "modules": [
+    "test1.o"
+  ]
+}
+#--- subcorpus2/corpus_description.json
+{
+  "has_thinlto": false,
+  "modules": [
+    "test2.o"
+  ]
+}
diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg
index 9afced53f195c5f..58c35e69c652c58 100644
--- a/llvm/utils/mlgo-utils/tests/lit.cfg
+++ b/llvm/utils/mlgo-utils/tests/lit.cfg
@@ -15,6 +15,7 @@ config.test_exec_root = config.obj_root
 config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-utils")
 
 llvm_config.use_default_substitutions()
+config.substitutions.append(("split-file", llvm_config.use_llvm_tool("split-file")))
 
 scripts_dir = os.path.join(config.src_root, "utils/mlgo-utils/mlgo")
 config.substitutions.append(("%scripts_dir", scripts_dir))

>From 0f2d0cd83efb07fdaee048b49f2562f4372c944d Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 05:16:05 +0000
Subject: [PATCH 3/7] Add extract_ir test

---
 llvm/utils/mlgo-utils/CMakeLists.txt          |  2 +-
 .../tests/corpus/extract_ir_script.test       | 44 +++++++++++++++++++
 llvm/utils/mlgo-utils/tests/lit.cfg           |  2 +
 3 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test

diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt
index c263c92c632797e..3129331d58c75bb 100644
--- a/llvm/utils/mlgo-utils/CMakeLists.txt
+++ b/llvm/utils/mlgo-utils/CMakeLists.txt
@@ -5,7 +5,7 @@ configure_lit_site_cfg(
 
 add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests"
   ${CMAKE_CURRENT_BINARY_DIR}
-  DEPENDS "FileCheck" "not" "count" "split-file"
+  DEPENDS "FileCheck" "not" "count" "split-file" "yaml2obj" "llvm-objcopy"
 )
 
 set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests")
diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test
new file mode 100644
index 000000000000000..a7629eb629219d7
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test
@@ -0,0 +1,44 @@
+## Test that invoking the extract_ir script work as expected.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: yaml2obj %s -o %t.dir/test1.o
+# RUN: yaml2obj %s -o %t.dir/test2.o
+# RUN: rm -rf %t.dir.out && mkdir %t.dir.out
+
+# RUN: %python %scripts_dir/corpus/extract_ir.py --input=%t.dir --input_type=directory --output_dir=%t.dir.out --llvm_objcopy_path=llvm-objcopy
+# RUN: cat %t.dir.out/corpus_description.json | FileCheck %s
+
+## Check that this is not a thinLTO build
+# CHECK: "has_thinlto": false
+## Check that the expected modules end up in the corpus description
+# CHECK-DAG: "test1.o"
+# CHECK-DAG: "test2.o"
+
+# RUN: ls %t.dir.out | FileCheck %s --check-prefix CHECK-DIR
+
+# CHECK-DIR: test1.o.bc
+# CHECK-DIR: test1.o.cmd
+# CHECK-DIR: test2.o.bc
+# CHECK-DIR: test2.o.cmd
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+  SectionHeaderStringTable: .strtab
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x10
+    Content:         55
+  - Name:            .llvmbc
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         55
+  - Name:            .llvmcmd
+    Type:            SHT_PROGBITS
+    AddressAlign:    0x1
+    Content:         ff
diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg
index 58c35e69c652c58..0f6137e5e91383e 100644
--- a/llvm/utils/mlgo-utils/tests/lit.cfg
+++ b/llvm/utils/mlgo-utils/tests/lit.cfg
@@ -16,6 +16,8 @@ config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-
 
 llvm_config.use_default_substitutions()
 config.substitutions.append(("split-file", llvm_config.use_llvm_tool("split-file")))
+config.substitutions.append(("yaml2obj", llvm_config.use_llvm_tool("yaml2obj")))
+config.substitutions.append(("llvm-objcopy", llvm_config.use_llvm_tool("llvm-objcopy")))
 
 scripts_dir = os.path.join(config.src_root, "utils/mlgo-utils/mlgo")
 config.substitutions.append(("%scripts_dir", scripts_dir))

>From 01dd2821526a435524fbe2d4cad0fff4b880a8fd Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 06:08:28 +0000
Subject: [PATCH 4/7] [MLGO] Remove absl dependency from scripts

This patch removes the absl dependency from the mlgo-utils scripts. We
were only using absl.logging and absl.flags, so this patch just
consists of mechanically converting the absl flags parsing to Python's
built-in argparse, as Python's logging is a drop-in replacement for
absl.logging.
---
 .../mlgo/corpus/combine_training_corpus.py    |  27 +--
 .../mlgo-utils/mlgo/corpus/extract_ir.py      | 204 +++++++++---------
 .../mlgo-utils/mlgo/corpus/make_corpus.py     |  45 ++--
 3 files changed, 137 insertions(+), 139 deletions(-)

diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
index 9aabd87b4688e00..cc21061cbbef5ea 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -23,26 +23,21 @@
 and corpus2 are combined into combinedcorpus.
 """
 
-from absl import app
-from absl import flags
+import argparse
 
 from mlgo.corpus import combine_training_corpus_lib
 
-flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
 
-FLAGS = flags.FLAGS
-
-
-def main(argv):
-    if len(argv) > 1:
-        raise app.UsageError("Too many command-line arguments.")
-
-    combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
-
-
-def entrypoint():
-    app.run(main)
+def main(args):
+    combine_training_corpus_lib.combine_corpus(args.root_dir)
 
 
 if __name__ == "__main__":
-    entrypoint()
+    parser = argparse.ArgumentParser(
+        description="A tool for combining multiple training corpora"
+    )
+    parser.add_argument(
+        "--root_dir", type=str, help="The root dir of module paths to combine."
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
index 9463e61dc534fed..4426463e22b0e74 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -26,127 +26,59 @@
 
 import json
 import multiprocessing
-
-from absl import app
-from absl import flags
-from absl import logging
+import logging
+import argparse
 
 from mlgo.corpus import extract_ir_lib
 
-flags.DEFINE_string(
-    "input",
-    None,
-    "Input file or directory - either compile_commands.json, a linker parameter"
-    "list, or a path to a directory containing object files.",
-)
-flags.DEFINE_enum(
-    "input_type",
-    "json",
-    ["json", "params", "directory"],
-    "Input file type - json, params, or directory. params latter refers to lld"
-    "params.",
-)
-flags.DEFINE_string("output_dir", None, "Output directory")
-flags.DEFINE_integer(
-    "num_workers",
-    None,
-    "Number of parallel workers for objcopy. `None` for maximum available.",
-)
-flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
-flags.DEFINE_string(
-    "obj_base_dir",
-    "",
-    "Base directory for object files. Defaults to current working dir.",
-)
-flags.DEFINE_string(
-    "cmd_filter",
-    None,
-    "Include only those modules with a command line matching this regexp. "
-    "Setting it to None for not filtering. Note that the regexp is applied "
-    "independently for each separate command line option. For example, ^-Oz$ "
-    "will match Oz - built binaries. Does not work with thinlto_build=lld.",
-)
-flags.DEFINE_enum(
-    "thinlto_build",
-    None,
-    ["distributed", "local"],
-    "Set if the build was performed with either 'distributed' or "
-    "'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
-    "The build is assumed to have had "
-    "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
-    "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
-    "passed in the local case.",
-)
-flags.DEFINE_string(
-    "cmd_section_name",
-    ".llvmcmd",
-    "The section name passed to llvm-objcopy. For ELF object files, the "
-    "default .llvmcmd is correct. For Mach-O object files, one should use "
-    "something like __LLVM,__cmdline",
-)
-flags.DEFINE_string(
-    "bitcode_section_name",
-    ".llvmbc",
-    "The section name passed to llvm-objcopy. For ELF object files, the "
-    "default .llvmbc is correct. For Mach-O object files, one should use "
-    "__LLVM,__bitcode",
-)
-
-flags.mark_flag_as_required("output_dir")
-
-FLAGS = flags.FLAGS
-
-
-def main(argv):
-    if len(argv) > 1:
-        raise app.UsageError("Too many command-line arguments.")
 
+def main(args):
     objs = []
-    if FLAGS.input is not None and FLAGS.thinlto_build == "local":
+    if args.input is not None and args.thinlto_build == "local":
         raise ValueError("--thinlto_build=local cannot be run with --input")
-    if FLAGS.input is None:
-        if FLAGS.thinlto_build != "local":
+    if args.input is None:
+        if args.thinlto_build != "local":
             raise ValueError("--input or --thinlto_build=local must be provided")
-        objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
-    elif FLAGS.input_type == "json":
-        with open(FLAGS.input, encoding="utf-8") as f:
+        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+    elif args.input_type == "json":
+        with open(args.input, encoding="utf-8") as f:
             objs = extract_ir_lib.load_from_compile_commands(
-                json.load(f), FLAGS.output_dir
+                json.load(f), args.output_dir
             )
-    elif FLAGS.input_type == "params":
-        if not FLAGS.obj_base_dir:
+    elif args.input_type == "params":
+        if not args.obj_base_dir:
             logging.info(
                 "-obj_base_dir is unspecified, assuming current directory."
                 "If no objects are found, use this option to specify the root"
                 "directory for the object file paths in the input file."
             )
-        with open(FLAGS.input, encoding="utf-8") as f:
+        with open(args.input, encoding="utf-8") as f:
             objs = extract_ir_lib.load_from_lld_params(
-                [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir
+                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
             )
-    elif FLAGS.input_type == "directory":
+    elif args.input_type == "directory":
         logging.warning(
             "Using the directory input is only recommended if the build system"
             "your project uses does not support any structured output that"
             "ml-compiler-opt understands. If your build system provides a"
             "structured compilation database, use that instead"
         )
-        objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
     else:
-        logging.error("Unknown input type: %s", FLAGS.input_type)
+        logging.error("Unknown input type: %s", args.input_type)
 
     relative_output_paths = extract_ir_lib.run_extraction(
         objs,
-        FLAGS.num_workers,
-        FLAGS.llvm_objcopy_path,
-        FLAGS.cmd_filter,
-        FLAGS.thinlto_build,
-        FLAGS.cmd_section_name,
-        FLAGS.bitcode_section_name,
+        args.num_workers,
+        args.llvm_objcopy_path,
+        args.cmd_filter,
+        args.thinlto_build,
+        args.cmd_section_name,
+        args.bitcode_section_name,
     )
 
     extract_ir_lib.write_corpus_manifest(
-        FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
+        args.thinlto_build, relative_output_paths, args.output_dir
     )
 
     logging.info(
@@ -156,10 +88,86 @@ def main(argv):
     )
 
 
-def entrypoint():
-    multiprocessing.set_start_method("fork")
-    app.run(main)
-
-
 if __name__ == "__main__":
-    entrypoint()
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from build artifacts"
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        help="Input file or directory - either compile_commands.json, a linker "
+        "parameter list, or a path to a directory containing object files.",
+    )
+    parser.add_argument(
+        "--input_type",
+        type=str,
+        help="Input file type - JSON, LLD params, or directory.",
+        choices=["json", "params", "directory"],
+        default="json",
+        nargs="?",
+    )
+    parser.add_argument("--output_dir", type=str, help="Output directory")
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        help="Number of parallel works for objcopy. `None` for maximum available.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--llvm_objcopy_path",
+        type=str,
+        help="Path to llvm-objcopy",
+        default="llvm-objcopy",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--obj_base_dir",
+        type=str,
+        help="Base directory for object files. Defaults to current working dir.",
+        default="",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_filter",
+        type=str,
+        help="Include only those modules with a command line matching this regular "
+        "expression. Set it to None to not perform any filtering. Note that the "
+        "regular expression is applied independently for each separate command line "
+        "option. For example, ^-Oz$ will match Oz built binaries. This does not work "
+        "with thinlto_build=lld.",
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--thinlto_build",
+        type=str,
+        help="Set if the build was performed with either 'distributed' or 'local' "
+        "ThinLTO. This ensures the thinlto.bc files are also copied. The build is "
+        "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in "
+        "the distributed case or -Wl,--save-temps=import and "
+        "-Wl,--thinlto-emit-index-files passed in the local case",
+        choices=["distributed", "local"],
+        default=None,
+        nargs="?",
+    )
+    parser.add_argument(
+        "--cmd_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmcmd is correct. For Mach-O object files, one should use "
+        "something like __LLVM,__cmdline",
+        default=".llvmcmd",
+        nargs="?",
+    )
+    parser.add_argument(
+        "--bitcode_section_name",
+        type=str,
+        help="The section name passed to llvm-objcopy. For ELF object files, the "
+        "default .llvmbc is correct. For Mach-O object files, one should use "
+        "__LLVM,__bitcode",
+        default=".llvmbc",
+        nargs="?",
+    )
+    args = parser.parse_args()
+    main(args)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
index edb0ecd853de246..05ceb750de673ef 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -12,43 +12,38 @@
   --default_args="<list of space separated flags>"
 """
 
-from absl import app
-from absl import flags
-from absl import logging
+import logging
+import argparse
 
 from mlgo.corpus import make_corpus_lib
 
-flags.DEFINE_string("input_dir", None, "The input directory.")
-flags.DEFINE_string("output_dir", None, "The output directory.")
-flags.DEFINE_string(
-    "default_args",
-    "",
-    "The compiler flags to compile with when using downstream tooling.",
-)
 
-flags.mark_flag_as_required("input_dir")
-flags.mark_flag_as_required("output_dir")
-
-FLAGS = flags.FLAGS
-
-
-def main(_):
+def main(args):
     logging.warning(
         "Using this tool does not guarantee that the bitcode is taken at "
         "the correct stage for consumption during model training. Make "
         "sure to validate assumptions about where the bitcode is coming "
         "from before using it in production."
     )
-    relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
-    make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir)
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir)
+    make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir)
     make_corpus_lib.write_corpus_manifest(
-        relative_paths, FLAGS.output_dir, FLAGS.default_args.split()
+        relative_paths, args.output_dir, args.default_args.split()
     )
 
 
-def entrypoint():
-    app.run(main)
-
-
 if __name__ == "__main__":
-    entrypoint()
+    parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from arbitrary bitcode"
+    )
+    parser.add_argument("--input_dir", type=str, help="The input directory.")
+    parser.add_argument("--output_dir", type=str, help="The output directory.")
+    parser.add_argument(
+        "--default_args",
+        type=str,
+        help="The compiler flags to compile with when using downstream tooling.",
+        default="",
+        nargs="?",
+    )
+    args = parser.parse_args()
+    main(args)

>From 69e230ad1bae8060c9055157eced593b15b436ee Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 06:12:29 +0000
Subject: [PATCH 5/7] Add requires lines to tests

---
 .../mlgo-utils/tests/corpus/combine_training_corpus_script.test | 2 ++
 llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test       | 2 ++
 llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test      | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test
index 1aa182146a49ee4..933a9c2b9f811e2 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test
+++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test
@@ -1,3 +1,5 @@
+# REQUIRES: python-38, absl, system-linux
+
 ## Testing that the combine_trainig_corpus script works as expected when
 ## invoked.
 
diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test
index a7629eb629219d7..c20581dacdc6516 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test
+++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test
@@ -1,3 +1,5 @@
+# REQUIRES: python-38, absl, system-linux
+
 ## Test that invoking the extract_ir script work as expected.
 
 # RUN: rm -rf %t.dir && mkdir %t.dir
diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test
index f4f97544bce47d3..3c1b96523718e44 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test
+++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test
@@ -1,3 +1,5 @@
+# REQUIRES: python-38, absl, system-linux
+
 ## Testing that the make_corpus script works as expected when invoked.
 
 # RUN: rm -rf %t.dir && mkdir %t.dir

>From b919c42c768a3343056998a0643c3a02896b1202 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 06:14:14 +0000
Subject: [PATCH 6/7] Remove other references to absl dep

---
 llvm/utils/mlgo-utils/pyproject.toml                      | 3 ---
 .../tests/corpus/combine_training_corpus_script.test      | 2 +-
 llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test | 2 +-
 .../utils/mlgo-utils/tests/corpus/make_corpus_script.test | 2 +-
 llvm/utils/mlgo-utils/tests/lit.local.cfg                 | 8 --------
 5 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/llvm/utils/mlgo-utils/pyproject.toml b/llvm/utils/mlgo-utils/pyproject.toml
index be2af86cd05df30..dac18a785c17b93 100644
--- a/llvm/utils/mlgo-utils/pyproject.toml
+++ b/llvm/utils/mlgo-utils/pyproject.toml
@@ -7,9 +7,6 @@ name = "mlgo"
 description = "Tooling for ML in LLVM"
 readme = "README.md"
 requires-python = ">=3.8,<3.11"
-dependencies = [
-  "absl-py>=1.0.0"
-]
 dynamic = ["version"]
 license = {text = "Apache-2.0 WITH LLVM-exception"}
 classifiers = [
diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test
index 933a9c2b9f811e2..51dc637347caf09 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test
+++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test
@@ -1,4 +1,4 @@
-# REQUIRES: python-38, absl, system-linux
+# REQUIRES: python-38, system-linux
 
 ## Testing that the combine_trainig_corpus script works as expected when
 ## invoked.
diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test
index c20581dacdc6516..107116618ce97bb 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test
+++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test
@@ -1,4 +1,4 @@
-# REQUIRES: python-38, absl, system-linux
+# REQUIRES: python-38, system-linux
 
 ## Test that invoking the extract_ir script work as expected.
 
diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test
index 3c1b96523718e44..a08780055f31f1e 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test
+++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test
@@ -1,4 +1,4 @@
-# REQUIRES: python-38, absl, system-linux
+# REQUIRES: python-38, system-linux
 
 ## Testing that the make_corpus script works as expected when invoked.
 
diff --git a/llvm/utils/mlgo-utils/tests/lit.local.cfg b/llvm/utils/mlgo-utils/tests/lit.local.cfg
index 90cdf8ba618ed8f..a9088750cb58b1e 100644
--- a/llvm/utils/mlgo-utils/tests/lit.local.cfg
+++ b/llvm/utils/mlgo-utils/tests/lit.local.cfg
@@ -4,11 +4,3 @@ import sys
 # the entire project has been bumped to 3.8.
 if sys.version_info > (3,8):
     config.available_features.add("python-38")
-
-# TODO(boomanaiden154): Remove this flag once the scripts are converted to
-# not use absl anymore.
-try:
-    import absl
-    config.available_features.add("absl")
-except:
-    pass

>From 336ccb0d643ed627343d2e38d5dbdd659e8688bc Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 21 Jan 2024 23:01:09 +0000
Subject: [PATCH 7/7] Move argument parsing and sort imports

---
 .../mlgo/corpus/combine_training_corpus.py    |  14 +-
 .../mlgo-utils/mlgo/corpus/extract_ir.py      | 122 +++++++++---------
 .../mlgo-utils/mlgo/corpus/make_corpus.py     |  34 ++---
 3 files changed, 91 insertions(+), 79 deletions(-)

diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
index cc21061cbbef5ea..3b2077b4c0e0e60 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -28,11 +28,7 @@
 from mlgo.corpus import combine_training_corpus_lib
 
 
-def main(args):
-    combine_training_corpus_lib.combine_corpus(args.root_dir)
-
-
-if __name__ == "__main__":
+def parse_args_and_run():
     parser = argparse.ArgumentParser(
         description="A tool for combining multiple training corpora"
     )
@@ -41,3 +37,11 @@ def main(args):
     )
     args = parser.parse_args()
     main(args)
+
+
+def main(args):
+    combine_training_corpus_lib.combine_corpus(args.root_dir)
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
index 4426463e22b0e74..94415431ab4a388 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -24,71 +24,15 @@
 any output.
 """
 
+import argparse
 import json
-import multiprocessing
 import logging
-import argparse
+import multiprocessing
 
 from mlgo.corpus import extract_ir_lib
 
 
-def main(args):
-    objs = []
-    if args.input is not None and args.thinlto_build == "local":
-        raise ValueError("--thinlto_build=local cannot be run with --input")
-    if args.input is None:
-        if args.thinlto_build != "local":
-            raise ValueError("--input or --thinlto_build=local must be provided")
-        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
-    elif args.input_type == "json":
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_compile_commands(
-                json.load(f), args.output_dir
-            )
-    elif args.input_type == "params":
-        if not args.obj_base_dir:
-            logging.info(
-                "-obj_base_dir is unspecified, assuming current directory."
-                "If no objects are found, use this option to specify the root"
-                "directory for the object file paths in the input file."
-            )
-        with open(args.input, encoding="utf-8") as f:
-            objs = extract_ir_lib.load_from_lld_params(
-                [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir
-            )
-    elif args.input_type == "directory":
-        logging.warning(
-            "Using the directory input is only recommended if the build system"
-            "your project uses does not support any structured output that"
-            "ml-compiler-opt understands. If your build system provides a"
-            "structured compilation database, use that instead"
-        )
-        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
-    else:
-        logging.error("Unknown input type: %s", args.input_type)
-
-    relative_output_paths = extract_ir_lib.run_extraction(
-        objs,
-        args.num_workers,
-        args.llvm_objcopy_path,
-        args.cmd_filter,
-        args.thinlto_build,
-        args.cmd_section_name,
-        args.bitcode_section_name,
-    )
-
-    extract_ir_lib.write_corpus_manifest(
-        args.thinlto_build, relative_output_paths, args.output_dir
-    )
-
-    logging.info(
-        "Converted %d files out of %d",
-        len(objs) - relative_output_paths.count(None),
-        len(objs),
-    )
-
-
-if __name__ == "__main__":
+def parse_args_and_run():
     parser = argparse.ArgumentParser(
         description="A tool for making a corpus from build artifacts"
     )
@@ -171,3 +115,63 @@ def main(args):
     )
     args = parser.parse_args()
     main(args)
+
+
+def main(args):
+    """Run extraction over the selected inputs and write the corpus manifest."""
+    objs = []
+    if args.input is not None and args.thinlto_build == "local":
+        raise ValueError("--thinlto_build=local cannot be run with --input")
+    if args.input is None:
+        if args.thinlto_build != "local":
+            raise ValueError("--input or --thinlto_build=local must be provided")
+        objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir)
+    elif args.input_type == "json":
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_compile_commands(
+                json.load(f), args.output_dir
+            )
+    elif args.input_type == "params":
+        if not args.obj_base_dir:
+            logging.info(
+                "--obj_base_dir is unspecified, assuming current directory. "
+                "If no objects are found, use this option to specify the root "
+                "directory for the object file paths in the input file."
+            )
+        with open(args.input, encoding="utf-8") as f:
+            objs = extract_ir_lib.load_from_lld_params(
+                [line.strip() for line in f], args.obj_base_dir, args.output_dir
+            )
+    elif args.input_type == "directory":
+        logging.warning(
+            "Using the directory input is only recommended if the build system "
+            "your project uses does not support any structured output that "
+            "ml-compiler-opt understands. If your build system provides a "
+            "structured compilation database, use that instead"
+        )
+        objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+    else:
+        # Only logged, not raised: extraction below then runs on an empty list.
+        logging.error("Unknown input type: %s", args.input_type)
+
+    relative_output_paths = extract_ir_lib.run_extraction(
+        objs,
+        args.num_workers,
+        args.llvm_objcopy_path,
+        args.cmd_filter,
+        args.thinlto_build,
+        args.cmd_section_name,
+        args.bitcode_section_name,
+    )
+
+    extract_ir_lib.write_corpus_manifest(
+        args.thinlto_build, relative_output_paths, args.output_dir
+    )
+
+    # A None entry in the returned paths is counted as a file not converted.
+    converted = len(objs) - relative_output_paths.count(None)
+    logging.info("Converted %d files out of %d", converted, len(objs))
+
+
+if __name__ == "__main__":
+    parse_args_and_run()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
index 05ceb750de673ef..221486e16c6e008 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py
@@ -12,12 +12,29 @@
   --default_args="<list of space separated flags>"
 """
 
-import logging
 import argparse
+import logging
 
 from mlgo.corpus import make_corpus_lib
 
 
+def parse_args_and_run():
+    """Parse the command-line flags and pass the resulting namespace to main()."""
+    arg_parser = argparse.ArgumentParser(
+        description="A tool for making a corpus from arbitrary bitcode"
+    )
+    arg_parser.add_argument("--input_dir", type=str, help="The input directory.")
+    arg_parser.add_argument("--output_dir", type=str, help="The output directory.")
+    arg_parser.add_argument(
+        "--default_args",
+        type=str,
+        nargs="?",
+        default="",
+        help="The compiler flags to compile with when using downstream tooling.",
+    )
+    main(arg_parser.parse_args())
+
+
 def main(args):
     logging.warning(
         "Using this tool does not guarantee that the bitcode is taken at "
@@ -33,17 +50,4 @@ def main(args):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="A tool for making a corpus from arbitrary bitcode"
-    )
-    parser.add_argument("--input_dir", type=str, help="The input directory.")
-    parser.add_argument("--output_dir", type=str, help="The output directory.")
-    parser.add_argument(
-        "--default_args",
-        type=str,
-        help="The compiler flags to compile with when using downstream tooling.",
-        default="",
-        nargs="?",
-    )
-    args = parser.parse_args()
-    main(args)
+    parse_args_and_run()



More information about the cfe-commits mailing list