[llvm] [MLGO] Add ability to extract IR from bazel using aquery (PR #96964)

Thu Jun 27 13:51:11 PDT 2024

https://github.com/boomanaiden154 created https://github.com/llvm/llvm-project/pull/96964

This patch adds in support for extracting IR from binaries built with bazel through querying the linker command line using bazel aquery.

>From 3b2bb28b92e3c082885ab9ad8f3bfabb673b11cc Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Thu, 27 Jun 2024 20:21:41 +0000
Subject: [PATCH] [MLGO] Add ability to extract IR from bazel using aquery

This patch adds in support for extracting IR from binaries built with
bazel through querying the linker command line using bazel aquery.
---
 .../mlgo-utils/mlgo/corpus/extract_ir.py      |  9 ++++-
 .../mlgo-utils/mlgo/corpus/extract_ir_lib.py  | 21 +++++++++++
 .../tests/corpus/extract_ir_test.py           | 37 +++++++++++++++++++
 3 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
index 94415431ab4a3..2a531fd2debbf 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -45,8 +45,8 @@ def parse_args_and_run():
     parser.add_argument(
         "--input_type",
         type=str,
-        help="Input file type - JSON, LLD params, or directory.",
-        choices=["json", "params", "directory"],
+        help="Input file type - JSON, LLD params, directory, or bazel aquery.",
+        choices=["json", "params", "directory", "aquery"],
         default="json",
         nargs="?",
     )
@@ -149,6 +149,11 @@ def main(args):
             "structured compilation database, use that instead"
         )
         objs = extract_ir_lib.load_from_directory(args.input, args.output_dir)
+    elif args.input_type == "aquery":
+        with open(args.input, encoding="utf-8") as aquery_json_handle:
+            objs = extract_ir_lib.load_bazel_aquery(
+                json.load(aquery_json_handle), args.obj_base_dir, args.output_dir
+            )
     else:
         logging.error("Unknown input type: %s", args.input_type)
 
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
index 8e9779c6257f1..7eb6e3889fa99 100644
--- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py
@@ -316,6 +316,27 @@ def make_spec(obj_file: str):
     return [make_spec(path) for path in paths]
 
 
+def load_bazel_aquery(aquery_json, obj_base_dir: str, output_dir: str):
+    """Creates an object file array by looking at the JSON output of bazel aquery.
+
+    Args:
+      aquery_json: The JSON-formatted output of the bazel aquery command for
+        the target of interest.
+      obj_base_dir: The base build directory that all object files will be
+        written out as arelative to.
+      output_dir: The output directory where extracted .bc and .cmd files should
+        be placed.
+    """
+    linker_params = []
+
+    for action_info in aquery_json["actions"]:
+        if action_info["mnemonic"] != "CppLink":
+            continue
+        linker_params = action_info["arguments"]
+
+    return load_from_lld_params(linker_params, obj_base_dir, output_dir)
+
+
 def run_extraction(
     objs: List[TrainingIRExtractor],
     num_workers: int,
diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
index 5ee0762a551a5..50a2684f57117 100644
--- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
+++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py
@@ -304,6 +304,43 @@ def test_lld_thinlto_extraction(outer, outdir):
     # CHECK-LLD-THINLTO-EXTRACTION-PY: 3
 
 
+## Test that we can load a bazel query JSON as expected.
+
+# RUN: %python %s test_load_bazel_aquery | FileCheck %s --check-prefix CHECK-TEST-LOAD-BAZEL-AQUERY
+
+
+def test_load_bazel_aquery():
+    obj = extract_ir_lib.load_bazel_aquery(
+        {
+            "actions": [
+                {"mnemonic": "not-link", "arguments": []},
+                {
+                    "mnemonic": "CppLink",
+                    "arguments": ["clang", "-o", "output_binary", "test1.o", "test2.o"],
+                },
+            ]
+        },
+        "/some/path",
+        "/tmp/out",
+    )
+    print(obj[0].input_obj())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /some/path/test1.o
+    print(obj[0].relative_output_path())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: test1.o
+    print(obj[0].cmd_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test1.o.cmd
+    print(obj[0].bc_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test1.o.bc
+    print(obj[1].input_obj())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /some/path/test2.o
+    print(obj[1].relative_output_path())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: test2.o
+    print(obj[1].cmd_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test2.o.cmd
+    print(obj[1].bc_file())
+    # CHECK-TEST-LOAD-BAZEL-AQUERY: /tmp/out/test2.o.bc
+
+
 ## Test that filtering works correctly
 
 # RUN: %python %s test_filtering | FileCheck %s --check-prefix CHECK-TEST-FILTERING