[llvm] [MLGO] Add ability to extract IR from bazel using aquery (PR #96964)

Aiden Grossman via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 27 14:28:02 PDT 2024


https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/96964

>From 3b2bb28b92e3c082885ab9ad8f3bfabb673b11cc Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Thu, 27 Jun 2024 20:21:41 +0000
Subject: [PATCH 1/2] [MLGO] Add ability to extract IR from bazel using aquery

This patch adds support for extracting IR from binaries built with bazel
by querying the linker command line via bazel aquery.
---
 .../mlgo-utils/mlgo/corpus/extract_ir.py      |  9 ++++-
 .../mlgo-utils/mlgo/corpus/extract_ir_lib.py  | 21 +++++++++++
 .../tests/corpus/extract_ir_test.py           | 37 +++++++++++++++++++
 3 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
index 94415431ab4a3..2a531fd2debbf 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -45,8 +45,8 @@ def parse_args_and_run():
     parser.add_argument(
         "--input_type",
         type=str,
-        help="Input file type - JSON, LLD params, or directory.",
-        choices=["json", "params", "directory"],
+        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+        choices=["json", "params", "directory", "aquery"],
         default="json",
         nargs="?",
     )
@@ -149,6 +149,11 @@ def main(args):
             "structured compilation database, use that instead"
         )
         objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+    elif args.input_type == "aquery":
+        with open(args.input, encoding="utf-8") as aquery_json_handle:
+            objs = extract_ir_lib.load_bazel_aquery(
+                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+            )
     else:
         logging.error("Unknown input type: %s", args.input_type)
 
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
index 8e9779c6257f1..7eb6e3889fa99 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
@@ -316,6 +316,27 @@ def make_spec(obj_file: str):
     return [make_spec(path) for path in paths]
 
 
+def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
+    """Creates an object file array by looking at the JSON output of bazel aquery.
+
+    Args:
+      aquery_json: The JSON-formatted output of the bazel aquery command for
+        the target of interest.
+      obj_base_dir: The base build directory that all object files will be
+        written out as arelative to.
+      output_dir: The output directory where extracted .bc and .cmd files should
+        be placed.
+    """
+    linker_params = []
+
+    for action_info in aquery_json["actions"]:
+        if action_info["mnemonic"] != "CppLink":
+            continue
+        linker_params = action_info["arguments"]
+
+    return load_from_lld_params(linker_params, obj_base_dir, output_dir)
+
+
 def run_extraction(
     objs: List[TrainingIRExtractor],
     num_workers: int,
diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
index 5ee0762a551a5..50a2684f57117 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
+++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
@@ -304,6 +304,43 @@ def test_lld_thinlto_extraction(outer, outdir):
     # CHECK-LLD-THINLTO-EXTRACTION-PY: 3
 
 
+## Test that we can load a bazel aquery JSON as expected.
+
+# RUN: %python %s test_load_bazel_aquery | FileCheck %s --check-prefix CHECK-TEST-LOAD-BAZEL-AQUERY
+
+
+def test_load_bazel_aquery():
+    obj = extract_ir_lib.load_bazel_aquery(
+        {
+            "actions": [
+                {"mnemonic": "not-link", "arguments": []},
+                {
+                    "mnemonic": "CppLink",
+                    "arguments": ["clang", "-o", "output_binary", "test1.o", "test2.o"],
+                },
+            ]
+        },
+        "/some/path",
+        "/tmp/out",
+    )
+    print(obj[0].input_obj())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /some/path/test1.o
+    print(obj[0].relative_output_path())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: test1.o
+    print(obj[0].cmd_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test1.o.cmd
+    print(obj[0].bc_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test1.o.bc
+    print(obj[1].input_obj())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /some/path/test2.o
+    print(obj[1].relative_output_path())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: test2.o
+    print(obj[1].cmd_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test2.o.cmd
+    print(obj[1].bc_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test2.o.bc
+
+
 ## Test that filtering works correctly
 
 # RUN: %python %s test_filtering | FileCheck %s --check-prefix CHECK-TEST-FILTERING

>From a7a4129722f0af38333f6444d13a6730264b8090 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Thu, 27 Jun 2024 21:27:51 +0000
Subject: [PATCH 2/2] Address reviewer feedback

---
 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py     | 4 ++--
 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
index 2a531fd2debbf..a7d52daaedba3 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -46,7 +46,7 @@ def parse_args_and_run():
         "--input_type",
         type=str,
         help="Input file type - JSON, LLD params, directory, or bazel aquery.",
-        choices=["json", "params", "directory", "aquery"],
+        choices=["json", "params", "directory", "bazel_aquery"],
         default="json",
         nargs="?",
     )
@@ -149,7 +149,7 @@ def main(args):
             "structured compilation database, use that instead"
         )
         objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
-    elif args.input_type == "aquery":
+    elif args.input_type == "bazel_aquery":
         with open(args.input, encoding="utf-8") as aquery_json_handle:
             objs = extract_ir_lib.load_bazel_aquery(
                 json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
index 7eb6e3889fa99..f434e59524bbf 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
@@ -321,7 +321,9 @@ def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
 
     Args:
       aquery_json: The JSON-formatted output of the bazel aquery command for
-        the target of interest.
+        the target of interest. The bazel aquery JSON should be a JSON
+        serialized version of the analysis.ActionGraphContainer proto.
+        https://github.com/bazelbuild/bazel/blob/master/src/main/protobuf/analysis_v2.proto
       obj_base_dir: The base build directory that all object files will be
         written out as arelative to.
       output_dir: The output directory where extracted .bc and .cmd files should



More information about the llvm-commits mailing list