[llvm] [clang-tools-extra] [MLGO] Upstream the corpus extraction tooling (PR #72319)
Aiden Grossman via cfe-commits
cfe-commits at lists.llvm.org
Sun Jan 14 22:23:49 PST 2024
https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/72319
>From c3f723c8a975cc5e075d56350645b0be486f3cda Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Tue, 14 Nov 2023 14:20:24 -0800
Subject: [PATCH 1/7] [MLGO] Upstream the corpus extraction tooling
---
llvm/py/Pyproject.toml | 1 +
llvm/py/src/mlgo/combine_training_corpus.py | 55 +++
.../src/mlgo/combine_training_corpus_lib.py | 50 +++
.../src/mlgo/combine_training_corpus_test.py | 104 +++++
llvm/py/src/mlgo/extract_ir.py | 142 +++++++
llvm/py/src/mlgo/extract_ir_lib.py | 373 ++++++++++++++++++
llvm/py/src/mlgo/extract_ir_test.py | 231 +++++++++++
llvm/py/src/mlgo/make_corpus.py | 58 +++
llvm/py/src/mlgo/make_corpus_lib.py | 90 +++++
llvm/py/src/mlgo/make_corpus_test.py | 66 ++++
10 files changed, 1170 insertions(+)
create mode 100644 llvm/py/Pyproject.toml
create mode 100644 llvm/py/src/mlgo/combine_training_corpus.py
create mode 100644 llvm/py/src/mlgo/combine_training_corpus_lib.py
create mode 100644 llvm/py/src/mlgo/combine_training_corpus_test.py
create mode 100644 llvm/py/src/mlgo/extract_ir.py
create mode 100644 llvm/py/src/mlgo/extract_ir_lib.py
create mode 100644 llvm/py/src/mlgo/extract_ir_test.py
create mode 100644 llvm/py/src/mlgo/make_corpus.py
create mode 100644 llvm/py/src/mlgo/make_corpus_lib.py
create mode 100644 llvm/py/src/mlgo/make_corpus_test.py
diff --git a/llvm/py/Pyproject.toml b/llvm/py/Pyproject.toml
new file mode 100644
index 00000000000000..dcf2c804da5e19
--- /dev/null
+++ b/llvm/py/Pyproject.toml
@@ -0,0 +1 @@
+# Placeholder
diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py
new file mode 100644
index 00000000000000..94ee1cbac9cea4
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""Combine multiple training corpus into a single training corpus.
+
+Currently this only supports the case where the corpora share the same
+configurables except for the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+ --root_dir=$PATH_TO_combinedcorpus
+
+generates combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+from absl import app
+from absl import flags
+
+from compiler_opt.tools import combine_training_corpus_lib
+
+flags.DEFINE_string('root_dir', '', 'root dir of module paths to combine.')
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+  """Combine the corpora found under --root_dir.
+
+  Args:
+    argv: command-line argument list; anything beyond the first element is
+      rejected.
+
+  Raises:
+    app.UsageError: if argv holds more than one element.
+  """
+  unexpected_args = argv[1:]
+  if unexpected_args:
+    raise app.UsageError('Too many command-line arguments.')
+
+  combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py
new file mode 100644
index 00000000000000..0359961266a240
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library for combining training corpora."""
+
+import os
+import json
+
+from absl import logging
+
+import tensorflow as tf
+
+_FILE_NAME = 'corpus_description.json'
+
+
+def combine_corpus(root_dir: str) -> None:
+  """Merge the corpus descriptions of all sub-corpora under root_dir.
+
+  Every `<root_dir>/<sub_dir>/corpus_description.json` is read, its module
+  names are prefixed with `<sub_dir>` and collected, and the combined
+  description is written to `<root_dir>/corpus_description.json`.
+
+  Args:
+    root_dir: directory whose immediate subdirectories hold the corpora to
+      combine; the combined description is written directly into it.
+
+  Raises:
+    ValueError: if two descriptions differ in any field other than 'modules'.
+  """
+  module_names = []
+  # None means "no description seen yet". Emptiness cannot serve as that
+  # marker: a description whose only key is 'modules' is empty once the key is
+  # removed, and a later, different description would silently replace it.
+  shared_description = None
+
+  corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME)
+  for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+    logging.info('processing %s', corpus_description_path)
+
+    with tf.io.gfile.GFile(corpus_description_path, 'r') as f:
+      corpus_description = json.load(f)
+    sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+    module_names.extend(
+        os.path.join(sub_dir, name) for name in corpus_description['modules'])
+    # Everything left after dropping 'modules' must agree across corpora.
+    del corpus_description['modules']
+    if shared_description is None:
+      shared_description = corpus_description
+    elif corpus_description != shared_description:
+      raise ValueError('Input corpora differ by more than modules.')
+
+  output_corpus_description = dict(shared_description or {})
+  output_corpus_description['modules'] = module_names
+
+  with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), 'w') as f:
+    json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py
new file mode 100644
index 00000000000000..47dd602967b68f
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus_test.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for combining training corpora."""
+
+import json
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools import combine_training_corpus_lib
+
+
+class CombineTrainingCorpusTest(absltest.TestCase):
+  """Exercises combine_corpus against temporary corpus directories."""
+
+  def _add_subcorpus(self, corpus_dir, name, description):
+    """Creates sub-directory `name` holding `description` serialized as JSON."""
+    description_file = corpus_dir.mkdir(dir_path=name).create_file(
+        file_path='corpus_description.json')
+    description_file.write_text(json.dumps(description))
+
+  def _load_combined(self, corpus_dir):
+    """Returns the parsed corpus_description.json at the top of corpus_dir."""
+    combined_path = os.path.join(corpus_dir, 'corpus_description.json')
+    with open(combined_path, encoding='utf-8') as combined_file:
+      return json.load(combined_file)
+
+  def test_combine_corpus(self):
+    corpus_dir = self.create_tempdir()
+    self._add_subcorpus(corpus_dir, 'subcorpus1', {
+        'has_thinlto': False,
+        'modules': ['test1.o', 'test2.o']
+    })
+    self._add_subcorpus(corpus_dir, 'subcorpus2', {
+        'has_thinlto': False,
+        'modules': ['test3.o', 'test4.o']
+    })
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    combined = self._load_combined(corpus_dir)
+    self.assertEqual(combined['has_thinlto'], False)
+    self.assertLen(combined['modules'], 4)
+    # Module names must come back prefixed with their sub-corpus directory.
+    for expected_module in ('subcorpus1/test1.o', 'subcorpus1/test2.o',
+                            'subcorpus2/test3.o', 'subcorpus2/test4.o'):
+      self.assertIn(expected_module, combined['modules'])
+
+  def test_empty_folder(self):
+    corpus_dir = self.create_tempdir()
+    self._add_subcorpus(corpus_dir, 'subcorpus1',
+                        {'modules': ['test1.o', 'test2.o']})
+    # A sub-directory without a description file must not contribute modules.
+    corpus_dir.mkdir(dir_path='empty_dir')
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    self.assertLen(self._load_combined(corpus_dir)['modules'], 2)
+
+  def test_ignore_extra_file(self):
+    corpus_dir = self.create_tempdir()
+    self._add_subcorpus(corpus_dir, 'subcorpus1',
+                        {'modules': ['test1.o', 'test2.o']})
+    # A stray file directly in the root must not contribute modules.
+    corpus_dir.create_file(file_path='empty.log')
+    combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+    self.assertLen(self._load_combined(corpus_dir)['modules'], 2)
+
+  def test_different_corpora(self):
+    corpus_dir = self.create_tempdir()
+    self._add_subcorpus(corpus_dir, 'subcorpus1', {
+        'has_thinlto': False,
+        'modules': ['test1.o']
+    })
+    self._add_subcorpus(corpus_dir, 'subcorpus2', {
+        'has_thinlto': True,
+        'modules': ['test2.o']
+    })
+    # Descriptions that disagree on a non-'modules' field cannot be combined.
+    self.assertRaises(ValueError, combine_training_corpus_lib.combine_corpus,
+                      corpus_dir.full_path)
+
+
+if __name__ == '__main__':
+ absltest.main()
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py
new file mode 100644
index 00000000000000..2a1ef3978888d6
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir.py
@@ -0,0 +1,142 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files
+
+To change the logging verbosity, pass an integer representing the desired
+verbosity to the --verbosity flag. Use 0 for all logs, status information,
+and detailed debug information, -1 for solely warnings, and -2 to not produce
+any output.
+"""
+
+import json
+import multiprocessing
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from compiler_opt.tools import extract_ir_lib
+
+# Command-line flags. Adjacent string literals are concatenated verbatim, so
+# every continued help-string fragment must end with an explicit space.
+flags.DEFINE_string(
+    'input', None,
+    'Input file or directory - either compile_commands.json, a linker '
+    'parameter list, or a path to a directory containing object files.')
+flags.DEFINE_enum(
+    'input_type', 'json', ['json', 'params', 'directory'],
+    'Input file type - json, params, or directory. "params" refers to lld '
+    'params.')
+flags.DEFINE_string('output_dir', None, 'Output directory')
+flags.DEFINE_integer(
+    'num_workers', None,
+    'Number of parallel workers for objcopy. `None` for maximum available.')
+flags.DEFINE_string('llvm_objcopy_path', 'llvm-objcopy', 'Path to llvm-objcopy')
+flags.DEFINE_string(
+    'obj_base_dir', '',
+    'Base directory for object files. Defaults to current working dir.')
+flags.DEFINE_string(
+    'cmd_filter', None,
+    'Include only those modules with a command line matching this regexp. '
+    'Setting it to None for not filtering. Note that the regexp is applied '
+    'independently for each separate command line option. For example, ^-Oz$ '
+    'will match Oz - built binaries. Does not work with thinlto_build=local.')
+flags.DEFINE_enum(
+    'thinlto_build', None, ['distributed', 'local'],
+    'Set if the build was performed with either \'distributed\' or '
+    '\'local\' ThinLTO. This ensures the thinlto.bc files are also copied. '
+    'The build is assumed to have had '
+    '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed '
+    'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files '
+    'passed in the local case.')
+flags.DEFINE_string(
+    'cmd_section_name', '.llvmcmd',
+    'The section name passed to llvm-objcopy. For ELF object files, the '
+    'default .llvmcmd is correct. For Mach-O object files, one should use '
+    'something like __LLVM,__cmdline')
+flags.DEFINE_string(
+    'bitcode_section_name', '.llvmbc',
+    'The section name passed to llvm-objcopy. For ELF object files, the '
+    'default .llvmbc is correct. For Mach-O object files, one should use '
+    '__LLVM,__bitcode')
+
+flags.mark_flag_as_required('output_dir')
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+  """Extract bitcode and command lines for the configured input into a corpus.
+
+  Builds the list of objects to process according to --input, --input_type
+  and --thinlto_build, runs the extraction, and writes the corpus description
+  into --output_dir.
+
+  Args:
+    argv: command-line argument list; anything beyond the first element is
+      rejected.
+
+  Raises:
+    app.UsageError: if argv holds more than one element.
+    ValueError: if --input is combined with --thinlto_build=local, or if
+      neither of the two is provided.
+  """
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  objs = []
+  if FLAGS.input is not None and FLAGS.thinlto_build == 'local':
+    raise ValueError('--thinlto_build=local cannot be run with --input')
+  if FLAGS.input is None:
+    if FLAGS.thinlto_build != 'local':
+      raise ValueError('--input or --thinlto_build=local must be provided')
+    objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir,
+                                               FLAGS.output_dir)
+  elif FLAGS.input_type == 'json':
+    with open(FLAGS.input, encoding='utf-8') as f:
+      objs = extract_ir_lib.load_from_compile_commands(
+          json.load(f), FLAGS.output_dir)
+  elif FLAGS.input_type == 'params':
+    if not FLAGS.obj_base_dir:
+      # Fragments are joined verbatim, so each one ends with a space.
+      logging.info(
+          '-obj_base_dir is unspecified, assuming current directory. '
+          'If no objects are found, use this option to specify the root '
+          'directory for the object file paths in the input file.')
+    with open(FLAGS.input, encoding='utf-8') as f:
+      objs = extract_ir_lib.load_from_lld_params(
+          [line.strip() for line in f], FLAGS.obj_base_dir, FLAGS.output_dir)
+  elif FLAGS.input_type == 'directory':
+    logging.warning(
+        'Using the directory input is only recommended if the build system '
+        'your project uses does not support any structured output that '
+        'ml-compiler-opt understands. If your build system provides a '
+        'structured compilation database, use that instead')
+    objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+  else:
+    logging.error('Unknown input type: %s', FLAGS.input_type)
+
+  relative_output_paths = extract_ir_lib.run_extraction(
+      objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
+      FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name)
+
+  extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build,
+                                       relative_output_paths, FLAGS.output_dir)
+
+  # Entries that are None in relative_output_paths were not converted.
+  logging.info('Converted %d files out of %d',
+               len(objs) - relative_output_paths.count(None), len(objs))
+
+
+if __name__ == '__main__':
+  # NOTE(review): the 'fork' start method is not offered on every platform -
+  # confirm the set of supported hosts.
+  multiprocessing.set_start_method('fork')
+  app.run(main)
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py
new file mode 100644
index 00000000000000..c1d2a54b9a9e7c
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir_lib.py
@@ -0,0 +1,373 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library functions for IR extraction."""
+
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import multiprocessing
+import functools
+import json
+
+from typing import Dict, List, Optional
+
+from absl import logging
+
+from compiler_opt.rl import constant
+
+
+# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file, from
+# a \0 - separated list of strings, to a \n one.
+def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
+  """Determine if the module should be included.
+
+  Args:
+    cmdline: the module's command line as a NUL-separated list of options.
+    match_regexp: regular expression searched for in each option separately,
+      or None to include every module.
+
+  Returns:
+    True if match_regexp is None or matches somewhere in at least one option.
+  """
+  if match_regexp is None:
+    return True
+  # Compile once; search() stops at the first hit instead of collecting all
+  # matches the way findall() does.
+  pattern = re.compile(match_regexp)
+  return any(pattern.search(option) for option in cmdline.split('\0'))
+
+
+def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
+  """Return the ThinLTO index path named on a command line.
+
+  Args:
+    cmdline: the command line as a NUL-separated list of options.
+    basedir: directory the index path is joined onto.
+
+  Returns:
+    basedir joined with the value of the first `-fthinlto-index=<value>`
+    option, or None if the command line has no such option.
+  """
+  prefix = '-fthinlto-index='
+  for option in cmdline.split('\0'):
+    if option.startswith(prefix):
+      # Keep everything after the first '=' so that paths which themselves
+      # contain '=' are not truncated.
+      return os.path.join(basedir, option[len(prefix):])
+  return None
+
+
+class TrainingIRExtractor:
+  """IR and command line extraction from an object file."""
+
+  def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
+    """Set up a TrainingIRExtractor.
+
+    Args:
+      obj_relative_path: relative path to the input object file. It will be also
+        used to construct the absolute path of the output IR and cmd files, by
+        appending it to output_base_dir.
+      output_base_dir: the directory under which the output will be produced.
+      obj_base_dir: the base directory for all the input object files.
+    """
+    self._obj_relative_path = obj_relative_path
+    self._output_base_dir = output_base_dir
+    self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''
+
+  def obj_base_dir(self):
+    """Returns the base directory of the input objects ('' if none given)."""
+    return self._obj_base_dir
+
+  def output_base_dir(self):
+    """Returns the directory under which outputs are produced."""
+    return self._output_base_dir
+
+  def relative_output_path(self):
+    """Returns the object's relative path, which also names its outputs."""
+    return self._obj_relative_path
+
+  def input_obj(self):
+    """Returns the path of the input object file."""
+    return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+
+  def lld_src_bc(self):
+    """Returns the path of the bitcode file saved by lld for this object."""
+    # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+    # IR bitcode saved by lld. It is hardcoded into lld.
+    return os.path.join(self._obj_base_dir,
+                        self._obj_relative_path + '.3.import.bc')
+
+  def lld_src_thinlto(self):
+    """Returns the path of the .thinlto.bc file next to the input object."""
+    return os.path.join(self._obj_base_dir,
+                        self._obj_relative_path + '.thinlto.bc')
+
+  def dest_dir(self):
+    """Returns the output directory that mirrors the object's directory."""
+    return os.path.join(self.output_base_dir(),
+                        os.path.dirname(self._obj_relative_path))
+
+  def module_name(self):
+    """Returns the file name component of the object's relative path."""
+    return os.path.basename(self._obj_relative_path)
+
+  def cmd_file(self):
+    """Returns the output path of the extracted command line (.cmd)."""
+    return os.path.join(self.dest_dir(), self.module_name() + '.cmd')
+
+  def bc_file(self):
+    """Returns the output path of the extracted bitcode (.bc)."""
+    return os.path.join(self.dest_dir(), self.module_name() + '.bc')
+
+  def thinlto_index_file(self):
+    """Returns the output path of the copied ThinLTO index (.thinlto.bc)."""
+    return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc')
+
+  def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
+                                  cmd_section_name: str):
+    """Get llvm-objcopy and process args to produce a command string that,
+    when invoked, will extract the cmd section into the self.cmd_file() file.
+    """
+    return [
+        llvm_objcopy_path,
+        '--dump-section=' + cmd_section_name + '=' + self.cmd_file(),
+        self.input_obj(), '/dev/null'
+    ]
+
+  def _get_extraction_bc_command(self, llvm_objcopy_path: str,
+                                 bitcode_section_name: str):
+    """Gets llvm-objcopy and process args to produce a command string that,
+    when invoked, will extract the bitcode section into the self.bc_file()
+    file.
+    """
+    return [
+        llvm_objcopy_path,
+        '--dump-section=' + bitcode_section_name + '=' + self.bc_file(),
+        self.input_obj(), '/dev/null'
+    ]
+
+  def _extract_clang_artifacts(self, llvm_objcopy_path: str,
+                               cmd_filter: Optional[str], is_thinlto: bool,
+                               cmd_section_name: str,
+                               bitcode_section_name: str) -> Optional[str]:
+    """Run llvm-objcopy to extract the .bc and command line.
+
+    Args:
+      llvm_objcopy_path: the llvm-objcopy executable to invoke.
+      cmd_filter: regular expression a command-line option must match for the
+        module to be kept, or None to keep every module.
+      is_thinlto: whether the ThinLTO index named on the command line must be
+        copied next to the extracted files.
+      cmd_section_name: name of the section holding the command line.
+      bitcode_section_name: name of the section holding the bitcode.
+
+    Returns:
+      The object's relative path on success; None if the input is missing,
+      the module is filtered out, no ThinLTO index is named on the command
+      line although one is required, or llvm-objcopy fails.
+    """
+    if not os.path.exists(self.input_obj()):
+      logging.info('%s does not exist.', self.input_obj())
+      return None
+    os.makedirs(self.dest_dir(), exist_ok=True)
+    try:
+      subprocess.check_output(
+          self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+          stderr=subprocess.STDOUT,
+          encoding='utf-8')
+      if cmd_filter is not None or is_thinlto:
+        with open(self.cmd_file(), encoding='utf-8') as f:
+          lines = f.readlines()
+        assert len(lines) == 1
+        cmdline = lines[0]
+        if not should_include_module(cmdline, cmd_filter):
+          logging.info(
+              'Excluding module %s because it does not match the filter',
+              self.input_obj())
+          os.remove(self.cmd_file())
+          return None
+        if is_thinlto:
+          index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+          if index_file is None:
+            # Without this guard shutil.copy(None, ...) raises TypeError and
+            # takes the whole extraction down; skip the module instead.
+            logging.warning(
+                '%s was not processed: its command line names no ThinLTO '
+                'index', self.input_obj())
+            os.remove(self.cmd_file())
+            return None
+          shutil.copy(index_file, self.thinlto_index_file())
+
+      subprocess.check_output(
+          self._get_extraction_bc_command(llvm_objcopy_path,
+                                          bitcode_section_name),
+          stderr=subprocess.STDOUT,
+          encoding='utf-8')
+    except subprocess.CalledProcessError as e:
+      # This may happen if .o file was build from asm (.S source).
+      logging.warning('%s was not processed: %s', self.input_obj(), e)
+      logging.info(e.output)
+      return None
+    assert (os.path.exists(self.cmd_file()) and
+            os.path.exists(self.bc_file()) and
+            (not is_thinlto or os.path.exists(self.thinlto_index_file())))
+    return self.relative_output_path()
+
+  def _extract_lld_artifacts(self) -> Optional[str]:
+    """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
+
+    Returns:
+      The object's relative path on success, or None if either source file
+      does not exist.
+    """
+    if not os.path.exists(self.lld_src_bc()):
+      logging.info('%s does not exist.', self.lld_src_bc())
+      return None
+    if not os.path.exists(self.lld_src_thinlto()):
+      logging.info('%s does not exist.', self.lld_src_thinlto())
+      return None
+    os.makedirs(self.dest_dir(), exist_ok=True)
+
+    # Copy over the files
+    shutil.copy(self.lld_src_bc(), self.bc_file())
+    shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
+
+    assert os.path.exists(self.bc_file())
+    assert os.path.exists(self.thinlto_index_file())
+    return self._obj_relative_path
+
+  def extract(self,
+              llvm_objcopy_path: Optional[str] = None,
+              cmd_filter: Optional[str] = None,
+              thinlto_build: Optional[str] = None,
+              cmd_section_name: Optional[str] = '.llvmcmd',
+              bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]:
+    """Extract this object's artifacts into the output directory.
+
+    With thinlto_build == 'local' the files saved by lld are copied and the
+    remaining arguments are not used; otherwise the sections are dumped with
+    llvm-objcopy, and 'distributed' additionally copies the ThinLTO index.
+
+    Returns:
+      The object's relative path on success, None if nothing was extracted.
+    """
+    if thinlto_build == 'local':
+      return self._extract_lld_artifacts()
+    return self._extract_clang_artifacts(
+        llvm_objcopy_path=llvm_objcopy_path,
+        cmd_filter=cmd_filter,
+        is_thinlto=thinlto_build == 'distributed',
+        cmd_section_name=cmd_section_name,
+        bitcode_section_name=bitcode_section_name)
+
+
+def convert_compile_command_to_objectfile(
+    command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]:
+  """Build an extractor for the object produced by one compile command.
+
+  Args:
+    command: one compile_commands.json entry. Its 'directory' value becomes
+      the object base directory; the command line is taken from 'arguments'
+      if present, otherwise from the whitespace-split 'command' string.
+    output_dir: the directory under which the output will be produced.
+
+  Returns:
+    A TrainingIRExtractor for the path following '-o', or None if the entry
+    has neither 'arguments' nor 'command', or names no output after '-o'.
+  """
+  obj_base_dir = command['directory']
+  if 'arguments' in command:
+    cmd_parts = command['arguments']
+  elif 'command' in command:
+    cmd_parts = command['command'].split()
+  else:
+    logging.info('compile_commands element has no command and arguments')
+    return None
+
+  try:
+    obj_rel_path = cmd_parts[cmd_parts.index('-o') + 1]
+  except (ValueError, IndexError):
+    # ValueError: no '-o' at all, which could happen if there are non-clang
+    # commands in compile_commands.json. IndexError: '-o' is the last token,
+    # so there is no output path to read.
+    logging.info('Command has no -o option: %s', ' '.join(cmd_parts))
+    return None
+  # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
+  return TrainingIRExtractor(
+      obj_relative_path=obj_rel_path,
+      output_base_dir=output_dir,
+      obj_base_dir=obj_base_dir)
+
+
+def load_from_compile_commands(json_array: List[Dict[str, str]],
+                               output_dir: str) -> List[TrainingIRExtractor]:
+  """Create extractors for the entries of a parsed compile_commands.json.
+
+  Args:
+    json_array: the entries of the compilation database.
+    output_dir: the directory under which the output will be produced.
+
+  Returns:
+    One extractor per entry that could be converted, in input order.
+  """
+  extractors = []
+  for compile_command in json_array:
+    extractor = convert_compile_command_to_objectfile(compile_command,
+                                                      output_dir)
+    # None marks an entry that could not be converted, e.g. a non-clang
+    # command in the .json; leave those out.
+    if extractor is not None:
+      extractors.append(extractor)
+  return extractors
+
+
+def load_from_lld_params(params_array: List[str], obj_base_dir: str,
+                         output_dir: str) -> List[TrainingIRExtractor]:
+  """Create an ObjectFile array based on lld's parameters.
+
+  Args:
+    params_array: lld's parameters, one per element. The list is not modified.
+    obj_base_dir: the base directory for all the input object files.
+    output_dir: the directory under which the output will be produced.
+
+  Returns:
+    One extractor per object file path found in params_array.
+  """
+  try:
+    minus_o_idx = params_array.index('-o')
+  except ValueError:
+    logging.info('This params file does not have an explicit -o option.')
+    # NOTE(review): this path keeps every parameter, including flags and
+    # non-.o entries, unlike the filtered path below - confirm it is intended.
+    just_obj_paths = list(params_array)
+  else:
+    # Yank out -o and the output on a copy, so the caller's list is left
+    # untouched. After that, anything not starting with '-', and ending in a
+    # '.o', is an object file.
+    remaining = params_array[:minus_o_idx] + params_array[minus_o_idx + 2:]
+    just_obj_paths = [
+        o for o in remaining if not o.startswith('-') and o.endswith('.o')
+    ]
+
+  def make_obj(obj_file: str) -> TrainingIRExtractor:
+    return TrainingIRExtractor(
+        obj_relative_path=obj_file,
+        output_base_dir=output_dir,
+        obj_base_dir=obj_base_dir)
+
+  return [make_obj(obj_file) for obj_file in just_obj_paths]
+
+
+def load_from_directory(obj_base_dir: str,
+                        output_dir: str) -> List[TrainingIRExtractor]:
+  """Create an object file array by globbing an entire directory.
+
+  Args:
+    obj_base_dir: The base build directory that all object files will be
+      written out as being relative to.
+    output_dir: The output directory where extracted .bc and .cmd files should
+      be placed.
+
+  Returns:
+    One extractor per '.o' file found anywhere below obj_base_dir.
+  """
+  object_files = pathlib.Path(obj_base_dir).glob('**/*.o')
+  return [
+      TrainingIRExtractor(
+          obj_relative_path=os.path.relpath(str(object_file),
+                                            start=obj_base_dir),
+          output_base_dir=output_dir,
+          obj_base_dir=obj_base_dir) for object_file in object_files
+  ]
+
+
+def load_for_lld_thinlto(obj_base_dir: str,
+                         output_dir: str) -> List[TrainingIRExtractor]:
+  """Create an object file array from the bitcode files saved by lld.
+
+  Args:
+    obj_base_dir: the directory searched recursively for '.3.import.bc' files.
+    output_dir: the directory under which the output will be produced.
+
+  Returns:
+    One extractor per file found, with the '.3.import.bc' suffix cut from its
+    relative path.
+  """
+  # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+  # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
+  # are also emitted next to the postimport bitcode, with the suffix
+  # .thinlto.bc instead
+  suffix_length = len('.3.import.bc')
+  extractors = []
+  for bitcode_file in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc'):
+    relative_path = os.path.relpath(str(bitcode_file), start=obj_base_dir)
+    extractors.append(
+        TrainingIRExtractor(
+            # Cut away .3.import.bc
+            obj_relative_path=relative_path[:-suffix_length],
+            output_base_dir=output_dir,
+            obj_base_dir=obj_base_dir))
+  return extractors
+
+
+def run_extraction(objs: List[TrainingIRExtractor],
+                   num_workers: Optional[int], llvm_objcopy_path: str,
+                   cmd_filter: Optional[str], thinlto_build: Optional[str],
+                   cmd_section_name: str,
+                   bitcode_section_name: str) -> List[Optional[str]]:
+  """Extracts all specified object files into the corpus directory.
+
+  Args:
+    objs: A list of TrainingIRExtractor Objects that represent the object files
+      to extract bitcode/commands from.
+    num_workers: The number of parallel processes to spawn to run the
+      extraction. It is passed to multiprocessing.Pool as is, so None lets the
+      pool pick its own default.
+    llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
+    cmd_filter: A regular expression that is used to select for compilations
+      performed with specific flags. If you want to include all compilations,
+      set this to None.
+    thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
+      Set this to None if the build was not done with ThinLTO.
+    cmd_section_name: The name of the command line section created by the
+      bitcode embedding.
+    bitcode_section_name: The name of the bitcode section created by the
+      bitcode embedding.
+
+  Returns:
+    One entry per element of objs, in the same order: whatever that object's
+    extract() returned, i.e. its relative output path or None.
+  """
+  extract_artifacts = functools.partial(
+      TrainingIRExtractor.extract,
+      llvm_objcopy_path=llvm_objcopy_path,
+      cmd_filter=cmd_filter,
+      thinlto_build=thinlto_build,
+      cmd_section_name=cmd_section_name,
+      bitcode_section_name=bitcode_section_name)
+
+  with multiprocessing.Pool(num_workers) as pool:
+    relative_output_paths = pool.map(extract_artifacts, objs)
+    pool.close()
+    pool.join()
+  return relative_output_paths
+
+
+def write_corpus_manifest(thinlto_build: Optional[str],
+                          relative_output_paths: List[Optional[str]],
+                          output_dir: str) -> None:
+  """Writes a corpus_description.json containing all necessary information
+  about the corpus.
+
+  Args:
+    thinlto_build: Whether or not the build was done with ThinLTO and if so,
+      what kind of ThinLTO. Set this to None if the build was not performed
+      with ThinLTO.
+    relative_output_paths: The relative (to the corpus directory) output paths
+      of all the bitcode files that should be placed in the corpus manifest.
+      None entries are left out.
+    output_dir: The corpus directory where the corpus manifest should be
+      placed.
+  """
+  corpus_description = {}
+  # This comes first rather than later so global_command_override is at the top
+  # of the .json after being written
+  if thinlto_build == 'local':
+    corpus_description['global_command_override'] = (
+        constant.UNSPECIFIED_OVERRIDE)
+
+  corpus_description['has_thinlto'] = thinlto_build is not None
+  corpus_description['modules'] = [
+      path for path in relative_output_paths if path is not None
+  ]
+
+  manifest_path = os.path.join(output_dir, 'corpus_description.json')
+  with open(manifest_path, 'w', encoding='utf-8') as f:
+    json.dump(corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py
new file mode 100644
index 00000000000000..8811134aab4fce
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir_test.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for compiler_opt.tools.extract_ir."""
+
+# pylint: disable=protected-access
+import os.path
+
+from absl.testing import absltest
+
+from compiler_opt.tools import extract_ir_lib
+
+
+class ExtractIrTest(absltest.TestCase):
+
+ def test_one_conversion(self):
+ obj = extract_ir_lib.convert_compile_command_to_objectfile(
+ {
+ 'directory': '/output/directory',
+ 'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
+ 'file': '/some/path/lib/foo/bar.cc'
+ }, '/corpus/destination/path')
+ self.assertIsNotNone(obj)
+ # pytype: disable=attribute-error
+ # Pytype complains about obj being None
+ self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
+ self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
+ self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
+ self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+ self.assertEqual(obj.thinlto_index_file(),
+ '/corpus/destination/path/lib/bar.o.thinlto.bc')
+ # pytype: enable=attribute-error
+
+ def test_one_conversion_arguments_style(self):
+ obj = extract_ir_lib.convert_compile_command_to_objectfile(
+ {
+ 'directory': '/output/directory',
+ 'arguments':
+ ['-cc1', '-c', '/some/path/lib/foo/bar.cc', '-o', 'lib/bar.o'],
+ 'file': '/some/path/lib/foo/bar.cc'
+ }, '/corpus/destination/path')
+ self.assertIsNotNone(obj)
+ # pytype: disable=attribute-error
+ # Pytype complains about obj being None
+ self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
+ self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
+ self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
+ self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+ self.assertEqual(obj.thinlto_index_file(),
+ '/corpus/destination/path/lib/bar.o.thinlto.bc')
+ # pytype: enable=attribute-error
+
+ def test_arr_conversion(self):
+ res = extract_ir_lib.load_from_compile_commands([{
+ 'directory': '/output/directory',
+ 'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
+ 'file': '/some/path/lib/foo/bar.cc'
+ }, {
+ 'directory': '/output/directory',
+ 'command': '-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o',
+ 'file': '/some/path/lib/foo/baz.cc'
+ }], '/corpus/destination/path')
+ res = list(res)
+ self.assertLen(res, 2)
+ self.assertEqual(res[0].input_obj(), '/output/directory/lib/bar.o')
+ self.assertEqual(res[0].relative_output_path(), 'lib/bar.o')
+ self.assertEqual(res[0].cmd_file(),
+ '/corpus/destination/path/lib/bar.o.cmd')
+ self.assertEqual(res[0].bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+ self.assertEqual(res[0].thinlto_index_file(),
+ '/corpus/destination/path/lib/bar.o.thinlto.bc')
+
+ self.assertEqual(res[1].input_obj(), '/output/directory/lib/other/baz.o')
+ self.assertEqual(res[1].relative_output_path(), 'lib/other/baz.o')
+ self.assertEqual(res[1].cmd_file(),
+ '/corpus/destination/path/lib/other/baz.o.cmd')
+ self.assertEqual(res[1].bc_file(),
+ '/corpus/destination/path/lib/other/baz.o.bc')
+ self.assertEqual(res[1].thinlto_index_file(),
+ '/corpus/destination/path/lib/other/baz.o.thinlto.bc')
+
+ def test_command_extraction(self):
+ obj = extract_ir_lib.TrainingIRExtractor(
+ obj_relative_path='lib/obj_file.o',
+ output_base_dir='/where/corpus/goes',
+ obj_base_dir='/foo/bar')
+ self.assertEqual(
+ obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
+ '/bin/llvm_objcopy_path',
+ '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
+ '/foo/bar/lib/obj_file.o', '/dev/null'
+ ])
+ self.assertEqual(
+ obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
+ '/bin/llvm_objcopy_path',
+ '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
+ '/foo/bar/lib/obj_file.o', '/dev/null'
+ ])
+
+ def test_command_extraction_no_basedir(self):
+ obj = extract_ir_lib.TrainingIRExtractor('lib/obj_file.o',
+ '/where/corpus/goes')
+ self.assertEqual(
+ obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
+ '/bin/llvm_objcopy_path',
+ '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
+ 'lib/obj_file.o', '/dev/null'
+ ])
+ self.assertEqual(
+ obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
+ '/bin/llvm_objcopy_path',
+ '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
+ 'lib/obj_file.o', '/dev/null'
+ ])
+
+ def test_lld_params(self):
+ lld_opts = [
+ '-o', 'output/dir/exe', 'lib/obj1.o', 'somelib.a', '-W,blah',
+ 'lib/dir/obj2.o'
+ ]
+ obj = extract_ir_lib.load_from_lld_params(lld_opts, '/some/path',
+ '/tmp/out')
+ self.assertLen(obj, 2)
+ self.assertEqual(obj[0].input_obj(), '/some/path/lib/obj1.o')
+ self.assertEqual(obj[0].relative_output_path(), 'lib/obj1.o')
+ self.assertEqual(obj[0].cmd_file(), '/tmp/out/lib/obj1.o.cmd')
+ self.assertEqual(obj[0].thinlto_index_file(),
+ '/tmp/out/lib/obj1.o.thinlto.bc')
+ self.assertEqual(obj[1].input_obj(), '/some/path/lib/dir/obj2.o')
+
+ def test_load_from_directory(self):
+ tempdir = self.create_tempdir()
+ subdir = tempdir.mkdir(dir_path='subdir')
+ subdir.create_file(file_path='test1.o')
+ subdir.create_file(file_path='test2.o')
+ outdir = self.create_tempdir()
+ objs = extract_ir_lib.load_from_directory(tempdir.full_path,
+ outdir.full_path)
+ self.assertLen(objs, 2)
+ for index, obj in enumerate(
+ sorted(objs, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(obj._obj_relative_path, f'subdir/test{index + 1:d}.o')
+ self.assertEqual(obj._obj_base_dir, tempdir.full_path)
+ self.assertEqual(obj._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_discovery(self):
+ tempdir = self.create_tempdir()
+ tempdir.create_file(file_path='1.3.import.bc')
+ tempdir.create_file(file_path='2.3.import.bc')
+ tempdir.create_file(file_path='3.3.import.bc')
+ tempdir.create_file(file_path='1.thinlto.bc')
+ tempdir.create_file(file_path='2.thinlto.bc')
+ tempdir.create_file(file_path='3.thinlto.bc')
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path,
+ outdir.full_path)
+ self.assertLen(obj, 3)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(o._obj_relative_path, f'{i + 1:d}')
+ self.assertEqual(o._obj_base_dir, tempdir.full_path)
+ self.assertEqual(o._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_discovery_nested(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path='nest')
+ tempdir.create_file(file_path='1.3.import.bc')
+ tempdir.create_file(file_path='2.3.import.bc')
+ tempdir.create_file(file_path='3.3.import.bc')
+ tempdir.create_file(file_path='1.thinlto.bc')
+ tempdir.create_file(file_path='2.thinlto.bc')
+ tempdir.create_file(file_path='3.thinlto.bc')
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+ self.assertLen(obj, 3)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(o._obj_relative_path, f'nest/{i + 1:d}')
+ self.assertEqual(o._obj_base_dir, outer.full_path)
+ self.assertEqual(o._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_extraction(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path='nest')
+ tempdir.create_file(file_path='1.3.import.bc')
+ tempdir.create_file(file_path='2.3.import.bc')
+ tempdir.create_file(file_path='3.3.import.bc')
+ tempdir.create_file(file_path='1.thinlto.bc')
+ tempdir.create_file(file_path='2.thinlto.bc')
+ tempdir.create_file(file_path='3.thinlto.bc')
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ mod_path = o.extract(thinlto_build='local')
+ self.assertEqual(mod_path, f'nest/{i + 1:d}')
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/1.bc')))
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/2.bc')))
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/3.bc')))
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, 'nest/1.thinlto.bc')))
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, 'nest/2.thinlto.bc')))
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, 'nest/3.thinlto.bc')))
+
+ def test_filtering(self):
+ cmdline = '-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o'
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, None))
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, '.*'))
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, '^-Oz$'))
+ self.assertFalse(extract_ir_lib.should_include_module(cmdline, '^-O3$'))
+
+ def test_thinlto_index_extractor(self):
+ cmdline = ('-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/'
+ 'out.o\0-fthinlto-index=foo/bar.thinlto.bc')
+ self.assertEqual(
+ extract_ir_lib.get_thinlto_index(cmdline, '/the/base/dir'),
+ '/the/base/dir/foo/bar.thinlto.bc')
+
+
+if __name__ == '__main__':
+ absltest.main()
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py
new file mode 100644
index 00000000000000..24493d894be723
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus.py
@@ -0,0 +1,58 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+ --input_dir=<path to input directory> \
+ --output_dir=<path to output directory> \
+ --default_args="<list of space separated flags>"
+"""
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from compiler_opt.tools import make_corpus_lib
+
+flags.DEFINE_string('input_dir', None, 'The input directory.')
+flags.DEFINE_string('output_dir', None, 'The output directory.')
+flags.DEFINE_string(
+ 'default_args', '',
+ 'The compiler flags to compile with when using downstream tooling.')
+
+flags.mark_flag_as_required('input_dir')
+flags.mark_flag_as_required('output_dir')
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+ logging.warning(
+ 'Using this tool does not guarantee that the bitcode is taken at '
+ 'the correct stage for consumption during model training. Make '
+ 'sure to validate assumptions about where the bitcode is coming '
+ 'from before using it in production.')
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
+ make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir,
+ FLAGS.output_dir)
+ make_corpus_lib.write_corpus_manifest(relative_paths, FLAGS.output_dir,
+ FLAGS.default_args.split())
+
+
+if __name__ == '__main__':
+ app.run(main)
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py
new file mode 100644
index 00000000000000..3598fc12a04d14
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus_lib.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library functions for making a corpus from arbitrary bitcode."""
+
+import pathlib
+import os
+import shutil
+import json
+
+from typing import List, Optional
+
+BITCODE_EXTENSION = '.bc'
+
+
+def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
+ """Finds bitcode files to extract from a given directory.
+
+ Args:
+ bitcode_base_dir: The base directory where the bitcode to be copied
+ is from.
+ output_dir: The directory to place the bitcode in.
+
+ Returns an array of paths representing the relative path to the bitcode
+ file from the base directory.
+ """
+ paths = [
+ str(p)[:-len(BITCODE_EXTENSION)]
+ for p in pathlib.Path(bitcode_base_dir).glob('**/*' + BITCODE_EXTENSION)
+ ]
+
+ return [
+ os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths
+ ]
+
+
+def copy_bitcode(relative_paths: List[str], bitcode_base_dir: str,
+ output_dir: str) -> None:
+ """Copies bitcode files from the base directory to the output directory.
+
+ Args:
+ relative_paths: An array of relative paths to bitcode files that are copied
+ over to the output directory, preserving relative location.
+ bitcode_base_dir: The base directory where the bitcode is located.
+ output_dir: The output directory to place the bitcode in.
+ """
+ for relative_path in relative_paths:
+ base_path = os.path.join(bitcode_base_dir,
+ relative_path + BITCODE_EXTENSION)
+ destination_path = os.path.join(output_dir,
+ relative_path + BITCODE_EXTENSION)
+ os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+ shutil.copy(base_path, destination_path)
+
+
+def write_corpus_manifest(relative_output_paths: List[str],
+ output_dir: str,
+ default_args: Optional[List[str]] = None) -> None:
+ """Creates a corpus manifest describing the bitcode that has been found.
+
+ Args:
+ relative_output_paths: A list of paths to each bitcode file relative to the
+ output directory.
+ output_dir: The output directory where the corpus is being created.
+ default_args: An array of compiler flags that should be used to compile
+ the bitcode when using further downstream tooling."""
+ if default_args is None:
+ default_args = []
+ corpus_description = {
+ 'global_command_override': default_args,
+ 'has_thinlto': False,
+ 'modules': [path for path in relative_output_paths if path is not None]
+ }
+
+ with open(
+ os.path.join(output_dir, 'corpus_description.json'),
+ 'w',
+ encoding='utf-8') as description_file:
+ json.dump(corpus_description, description_file, indent=2)
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py
new file mode 100644
index 00000000000000..8ed598695d06ee
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus_test.py
@@ -0,0 +1,66 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test for compiler_opt.tools.make_corpus_lib"""
+
+import json
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools import make_corpus_lib
+
+
+class MakeCorpusTest(absltest.TestCase):
+
+ def test_load_bitcode_from_directory(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path='nested')
+ tempdir.create_file('test1.bc')
+ tempdir.create_file('test2.bc')
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
+ relative_paths = sorted(relative_paths)
+ self.assertEqual(relative_paths[0], 'nested/test1')
+ self.assertEqual(relative_paths[1], 'nested/test2')
+
+ def test_copy_bitcode(self):
+ build_dir = self.create_tempdir()
+ nested_dir = build_dir.mkdir(dir_path='nested')
+ nested_dir.create_file('test1.bc')
+ nested_dir.create_file('test2.bc')
+ relative_paths = ['nested/test1', 'nested/test2']
+ corpus_dir = self.create_tempdir()
+ make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
+ output_files = sorted(os.listdir(os.path.join(corpus_dir, './nested')))
+ self.assertEqual(output_files[0], 'test1.bc')
+ self.assertEqual(output_files[1], 'test2.bc')
+
+ def test_write_corpus_manifest(self):
+ relative_output_paths = ['test/test1', 'test/test2']
+ output_dir = self.create_tempdir()
+ default_args = ['-O3', '-c']
+ make_corpus_lib.write_corpus_manifest(relative_output_paths, output_dir,
+ default_args)
+ with open(
+ os.path.join(output_dir, 'corpus_description.json'),
+ encoding='utf-8') as corpus_description_file:
+ corpus_description = json.load(corpus_description_file)
+ self.assertEqual(corpus_description['global_command_override'],
+ default_args)
+ self.assertEqual(corpus_description['has_thinlto'], False)
+ self.assertEqual(corpus_description['modules'], relative_output_paths)
+
+
+if __name__ == '__main__':
+ absltest.main()
>From 3f8d1e7052734979806d94cccfde5a8a05f6dece Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Sun, 14 Jan 2024 21:14:47 -0800
Subject: [PATCH 2/7] Add proper copyright headers
---
llvm/py/src/mlgo/combine_training_corpus.py | 17 +++--------------
llvm/py/src/mlgo/combine_training_corpus_lib.py | 17 +++--------------
.../py/src/mlgo/combine_training_corpus_test.py | 17 +++--------------
llvm/py/src/mlgo/extract_ir.py | 17 +++--------------
llvm/py/src/mlgo/extract_ir_lib.py | 17 +++--------------
llvm/py/src/mlgo/extract_ir_test.py | 17 +++--------------
llvm/py/src/mlgo/make_corpus.py | 17 +++--------------
llvm/py/src/mlgo/make_corpus_lib.py | 17 +++--------------
llvm/py/src/mlgo/make_corpus_test.py | 17 +++--------------
9 files changed, 27 insertions(+), 126 deletions(-)
diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py
index 94ee1cbac9cea4..e62bcb61e9d9e1 100644
--- a/llvm/py/src/mlgo/combine_training_corpus.py
+++ b/llvm/py/src/mlgo/combine_training_corpus.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
r"""Combine multiple training corpus into a single training corpus.
Currently only support the case that multiple corpus share the same
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py
index 0359961266a240..1050e5099ae21c 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_lib.py
+++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Library for combining training corpora."""
import os
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py
index 47dd602967b68f..3c793947db139e 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_test.py
+++ b/llvm/py/src/mlgo/combine_training_corpus_test.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Tests for combining training corpora."""
import json
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py
index 2a1ef3978888d6..58e31a0475e124 100644
--- a/llvm/py/src/mlgo/extract_ir.py
+++ b/llvm/py/src/mlgo/extract_ir.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Extract IR for training.
Extract IR for training, either from a compile_commands.json file produced by
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py
index c1d2a54b9a9e7c..83d2b26d1f71ce 100644
--- a/llvm/py/src/mlgo/extract_ir_lib.py
+++ b/llvm/py/src/mlgo/extract_ir_lib.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Library functions for IR extraction."""
import os
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py
index 8811134aab4fce..d7de50530032cc 100644
--- a/llvm/py/src/mlgo/extract_ir_test.py
+++ b/llvm/py/src/mlgo/extract_ir_test.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Tests for compiler_opt.tools.extract_ir."""
# pylint: disable=protected-access
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py
index 24493d894be723..989d9790b5bcd9 100644
--- a/llvm/py/src/mlgo/make_corpus.py
+++ b/llvm/py/src/mlgo/make_corpus.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Tool for making a corpus from arbitrary bitcode.
To create a corpus from a set of bitcode files in an input directory, run
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py
index 3598fc12a04d14..97db20a9859e17 100644
--- a/llvm/py/src/mlgo/make_corpus_lib.py
+++ b/llvm/py/src/mlgo/make_corpus_lib.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Library functions for making a corpus from arbitrary bitcode."""
import pathlib
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py
index 8ed598695d06ee..fcb861ebb91f32 100644
--- a/llvm/py/src/mlgo/make_corpus_test.py
+++ b/llvm/py/src/mlgo/make_corpus_test.py
@@ -1,17 +1,6 @@
-# coding=utf-8
-# Copyright 2020 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Test for compiler_opt.tools.make_corpus_lib"""
import json
>From 2bc8ac318e02672f4bfe87df8cbe19a1c00205dc Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 05:37:27 +0000
Subject: [PATCH 3/7] Format files using black
---
llvm/py/src/mlgo/combine_training_corpus.py | 12 +-
.../src/mlgo/combine_training_corpus_lib.py | 48 +-
.../src/mlgo/combine_training_corpus_test.py | 154 ++--
llvm/py/src/mlgo/extract_ir.py | 194 +++---
llvm/py/src/mlgo/extract_ir_lib.py | 655 +++++++++---------
llvm/py/src/mlgo/extract_ir_test.py | 447 ++++++------
llvm/py/src/mlgo/make_corpus.py | 43 +-
llvm/py/src/mlgo/make_corpus_lib.py | 106 ++-
llvm/py/src/mlgo/make_corpus_test.py | 81 ++-
9 files changed, 923 insertions(+), 817 deletions(-)
diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py
index e62bcb61e9d9e1..c14c9381a18a6b 100644
--- a/llvm/py/src/mlgo/combine_training_corpus.py
+++ b/llvm/py/src/mlgo/combine_training_corpus.py
@@ -28,17 +28,17 @@
from compiler_opt.tools import combine_training_corpus_lib
-flags.DEFINE_string('root_dir', '', 'root dir of module paths to combine.')
+flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
FLAGS = flags.FLAGS
def main(argv):
- if len(argv) > 1:
- raise app.UsageError('Too many command-line arguments.')
+ if len(argv) > 1:
+ raise app.UsageError("Too many command-line arguments.")
- combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+ combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
-if __name__ == '__main__':
- app.run(main)
+if __name__ == "__main__":
+ app.run(main)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py
index 1050e5099ae21c..1de182e4cb80dd 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_lib.py
+++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py
@@ -10,30 +10,30 @@
import tensorflow as tf
-_FILE_NAME = 'corpus_description.json'
+_FILE_NAME = "corpus_description.json"
def combine_corpus(root_dir: str) -> None:
- module_names = []
- output_corpus_description = {}
-
- corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME)
- for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
- logging.info('processing %s', corpus_description_path)
-
- with tf.io.gfile.GFile(corpus_description_path, 'r') as f:
- corpus_description = json.load(f)
- sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
- module_names.extend([
- os.path.join(sub_dir, name) for name in corpus_description['modules']
- ])
- del corpus_description['modules']
- if len(output_corpus_description) == 0:
- output_corpus_description = corpus_description
- elif corpus_description != output_corpus_description:
- raise ValueError('Input corpora differ by more than modules.')
-
- output_corpus_description['modules'] = module_names
-
- with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), 'w') as f:
- json.dump(output_corpus_description, f, indent=2)
+ module_names = []
+ output_corpus_description = {}
+
+ corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
+ for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+ logging.info("processing %s", corpus_description_path)
+
+ with tf.io.gfile.GFile(corpus_description_path, "r") as f:
+ corpus_description = json.load(f)
+ sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+ module_names.extend(
+ [os.path.join(sub_dir, name) for name in corpus_description["modules"]]
+ )
+ del corpus_description["modules"]
+ if len(output_corpus_description) == 0:
+ output_corpus_description = corpus_description
+ elif corpus_description != output_corpus_description:
+ raise ValueError("Input corpora differ by more than modules.")
+
+ output_corpus_description["modules"] = module_names
+
+ with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), "w") as f:
+ json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py
index 3c793947db139e..969d8472964971 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_test.py
+++ b/llvm/py/src/mlgo/combine_training_corpus_test.py
@@ -12,82 +12,88 @@
class CombineTrainingCorpusTest(absltest.TestCase):
+ def test_combine_corpus(self):
+ corpus_dir = self.create_tempdir()
+ subcorpus1_dir = corpus_dir.mkdir(dir_path="subcorpus1")
+ subcorpus2_dir = corpus_dir.mkdir(dir_path="subcorpus2")
+ subcorpus1_description = {
+ "has_thinlto": False,
+ "modules": ["test1.o", "test2.o"],
+ }
+ subcorpus2_description = {
+ "has_thinlto": False,
+ "modules": ["test3.o", "test4.o"],
+ }
+ subcorpus1_description_file = subcorpus1_dir.create_file(
+ file_path="corpus_description.json"
+ )
+ subcorpus2_description_file = subcorpus2_dir.create_file(
+ file_path="corpus_description.json"
+ )
+ subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+ subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+ combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+ with open(
+ os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+ ) as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ self.assertEqual(combined_corpus_description["has_thinlto"], False)
+ self.assertLen(combined_corpus_description["modules"], 4)
+ self.assertIn("subcorpus1/test1.o", combined_corpus_description["modules"])
+ self.assertIn("subcorpus1/test2.o", combined_corpus_description["modules"])
+ self.assertIn("subcorpus2/test3.o", combined_corpus_description["modules"])
+ self.assertIn("subcorpus2/test4.o", combined_corpus_description["modules"])
- def test_combine_corpus(self):
- corpus_dir = self.create_tempdir()
- subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
- subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
- subcorpus1_description = {
- 'has_thinlto': False,
- 'modules': ['test1.o', 'test2.o']
- }
- subcorpus2_description = {
- 'has_thinlto': False,
- 'modules': ['test3.o', 'test4.o']
- }
- subcorpus1_description_file = subcorpus1_dir.create_file(
- file_path='corpus_description.json')
- subcorpus2_description_file = subcorpus2_dir.create_file(
- file_path='corpus_description.json')
- subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
- subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
- combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
- with open(
- os.path.join(corpus_dir, 'corpus_description.json'),
- encoding='utf-8') as combined_corpus_description_file:
- combined_corpus_description = json.load(combined_corpus_description_file)
- self.assertEqual(combined_corpus_description['has_thinlto'], False)
- self.assertLen(combined_corpus_description['modules'], 4)
- self.assertIn('subcorpus1/test1.o', combined_corpus_description['modules'])
- self.assertIn('subcorpus1/test2.o', combined_corpus_description['modules'])
- self.assertIn('subcorpus2/test3.o', combined_corpus_description['modules'])
- self.assertIn('subcorpus2/test4.o', combined_corpus_description['modules'])
+ def test_empty_folder(self):
+ corpus_dir = self.create_tempdir()
+ subcorpus1_dir = corpus_dir.mkdir(dir_path="subcorpus1")
+ _ = corpus_dir.mkdir(dir_path="empty_dir")
+ subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
+ subcorpus1_description_file = subcorpus1_dir.create_file(
+ file_path="corpus_description.json"
+ )
+ subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+ combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+ with open(
+ os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+ ) as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ self.assertLen(combined_corpus_description["modules"], 2)
- def test_empty_folder(self):
- corpus_dir = self.create_tempdir()
- subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
- _ = corpus_dir.mkdir(dir_path='empty_dir')
- subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
- subcorpus1_description_file = subcorpus1_dir.create_file(
- file_path='corpus_description.json')
- subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
- combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
- with open(
- os.path.join(corpus_dir, 'corpus_description.json'),
- encoding='utf-8') as combined_corpus_description_file:
- combined_corpus_description = json.load(combined_corpus_description_file)
- self.assertLen(combined_corpus_description['modules'], 2)
+ def test_ignore_extra_file(self):
+ corpus_dir = self.create_tempdir()
+ subcorpus1_dir = corpus_dir.mkdir(dir_path="subcorpus1")
+ _ = corpus_dir.create_file(file_path="empty.log")
+ subcorpus1_description = {"modules": ["test1.o", "test2.o"]}
+ subcorpus1_description_file = subcorpus1_dir.create_file(
+ file_path="corpus_description.json"
+ )
+ subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+ combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+ with open(
+ os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8"
+ ) as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ self.assertLen(combined_corpus_description["modules"], 2)
- def test_ignore_extra_file(self):
- corpus_dir = self.create_tempdir()
- subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
- _ = corpus_dir.create_file(file_path='empty.log')
- subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
- subcorpus1_description_file = subcorpus1_dir.create_file(
- file_path='corpus_description.json')
- subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
- combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
- with open(
- os.path.join(corpus_dir, 'corpus_description.json'),
- encoding='utf-8') as combined_corpus_description_file:
- combined_corpus_description = json.load(combined_corpus_description_file)
- self.assertLen(combined_corpus_description['modules'], 2)
+ def test_different_corpora(self):
+ corpus_dir = self.create_tempdir()
+ subcorpus1_dir = corpus_dir.mkdir(dir_path="subcorpus1")
+ subcorpus2_dir = corpus_dir.mkdir(dir_path="subcorpus2")
+ subcorpus1_description = {"has_thinlto": False, "modules": ["test1.o"]}
+ subcorpus2_description = {"has_thinlto": True, "modules": ["test2.o"]}
+ subcorpus1_description_file = subcorpus1_dir.create_file(
+ file_path="corpus_description.json"
+ )
+ subcorpus2_description_file = subcorpus2_dir.create_file(
+ file_path="corpus_description.json"
+ )
+ subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+ subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+ self.assertRaises(
+ ValueError, combine_training_corpus_lib.combine_corpus, corpus_dir.full_path
+ )
- def test_different_corpora(self):
- corpus_dir = self.create_tempdir()
- subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
- subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
- subcorpus1_description = {'has_thinlto': False, 'modules': ['test1.o']}
- subcorpus2_description = {'has_thinlto': True, 'modules': ['test2.o']}
- subcorpus1_description_file = subcorpus1_dir.create_file(
- file_path='corpus_description.json')
- subcorpus2_description_file = subcorpus2_dir.create_file(
- file_path='corpus_description.json')
- subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
- subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
- self.assertRaises(ValueError, combine_training_corpus_lib.combine_corpus,
- corpus_dir.full_path)
-
-if __name__ == '__main__':
- absltest.main()
+if __name__ == "__main__":
+ absltest.main()
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py
index 58e31a0475e124..395a298ecec81d 100644
--- a/llvm/py/src/mlgo/extract_ir.py
+++ b/llvm/py/src/mlgo/extract_ir.py
@@ -34,98 +34,128 @@
from compiler_opt.tools import extract_ir_lib
flags.DEFINE_string(
- 'input', None,
- 'Input file or directory - either compile_commands.json, a linker parameter'
- 'list, or a path to a directory containing object files.')
+ "input",
+ None,
+ "Input file or directory - either compile_commands.json, a linker parameter"
+ "list, or a path to a directory containing object files.",
+)
flags.DEFINE_enum(
- 'input_type', 'json', ['json', 'params', 'directory'],
- 'Input file type - json, params, or directory. params latter refers to lld'
- 'params.')
-flags.DEFINE_string('output_dir', None, 'Output directory')
+ "input_type",
+ "json",
+ ["json", "params", "directory"],
+ "Input file type - json, params, or directory. params latter refers to lld"
+ "params.",
+)
+flags.DEFINE_string("output_dir", None, "Output directory")
flags.DEFINE_integer(
- 'num_workers', None,
- 'Number of parallel workers for objcopy. `None` for maximum available.')
-flags.DEFINE_string('llvm_objcopy_path', 'llvm-objcopy', 'Path to llvm-objcopy')
+ "num_workers",
+ None,
+ "Number of parallel workers for objcopy. `None` for maximum available.",
+)
+flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
flags.DEFINE_string(
- 'obj_base_dir', '',
- 'Base directory for object files. Defaults to current working dir.')
+ "obj_base_dir",
+ "",
+ "Base directory for object files. Defaults to current working dir.",
+)
flags.DEFINE_string(
- 'cmd_filter', None,
- 'Include only those modules with a command line matching this regexp. '
- 'Setting it to None for not filtering. Note that the regexp is applied '
- 'independently for each separate command line option. For example, ^-Oz$ '
- 'will match Oz - built binaries. Does not work with thinlto_build=lld.')
+ "cmd_filter",
+ None,
+ "Include only those modules with a command line matching this regexp. "
+ "Setting it to None for not filtering. Note that the regexp is applied "
+ "independently for each separate command line option. For example, ^-Oz$ "
+ "will match Oz - built binaries. Does not work with thinlto_build=lld.",
+)
flags.DEFINE_enum(
- 'thinlto_build', None, ['distributed', 'local'],
- 'Set if the build was performed with either \'distributed\' or '
- '\'local\' ThinLTO. This ensures the thinlto.bc files are also copied. '
- 'The build is assumed to have had '
- '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed '
- 'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files '
- 'passed in the local case.')
+ "thinlto_build",
+ None,
+ ["distributed", "local"],
+ "Set if the build was performed with either 'distributed' or "
+ "'local' ThinLTO. This ensures the thinlto.bc files are also copied. "
+ "The build is assumed to have had "
+ "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed "
+ "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files "
+ "passed in the local case.",
+)
flags.DEFINE_string(
- 'cmd_section_name', '.llvmcmd',
- 'The section name passed to llvm-objcopy. For ELF object files, the '
- 'default .llvmcmd is correct. For Mach-O object files, one should use '
- 'something like __LLVM,__cmdline')
+ "cmd_section_name",
+ ".llvmcmd",
+ "The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmcmd is correct. For Mach-O object files, one should use "
+ "something like __LLVM,__cmdline",
+)
flags.DEFINE_string(
- 'bitcode_section_name', '.llvmbc',
- 'The section name passed to llvm-objcopy. For ELF object files, the '
- 'default .llvmbc is correct. For Mach-O object files, one should use '
- '__LLVM,__bitcode')
+ "bitcode_section_name",
+ ".llvmbc",
+ "The section name passed to llvm-objcopy. For ELF object files, the "
+ "default .llvmbc is correct. For Mach-O object files, one should use "
+ "__LLVM,__bitcode",
+)
-flags.mark_flag_as_required('output_dir')
+flags.mark_flag_as_required("output_dir")
FLAGS = flags.FLAGS
def main(argv):
- if len(argv) > 1:
- raise app.UsageError('Too many command-line arguments.')
-
- objs = []
- if FLAGS.input is not None and FLAGS.thinlto_build == 'local':
- raise ValueError('--thinlto_build=local cannot be run with --input')
- if FLAGS.input is None:
- if FLAGS.thinlto_build != 'local':
- raise ValueError('--input or --thinlto_build=local must be provided')
- objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir,
- FLAGS.output_dir)
- elif FLAGS.input_type == 'json':
- with open(FLAGS.input, encoding='utf-8') as f:
- objs = extract_ir_lib.load_from_compile_commands(
- json.load(f), FLAGS.output_dir)
- elif FLAGS.input_type == 'params':
- if not FLAGS.obj_base_dir:
- logging.info(
- '-obj_base_dir is unspecified, assuming current directory.'
- 'If no objects are found, use this option to specify the root'
- 'directory for the object file paths in the input file.')
- with open(FLAGS.input, encoding='utf-8') as f:
- objs = extract_ir_lib.load_from_lld_params(
- [l.strip() for l in f.readlines()], FLAGS.obj_base_dir,
- FLAGS.output_dir)
- elif FLAGS.input_type == 'directory':
- logging.warning(
- 'Using the directory input is only recommended if the build system'
- 'your project uses does not support any structured output that'
- 'ml-compiler-opt understands. If your build system provides a'
- 'structured compilation database, use that instead')
- objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
- else:
- logging.error('Unknown input type: %s', FLAGS.input_type)
-
- relative_output_paths = extract_ir_lib.run_extraction(
- objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
- FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name)
-
- extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build,
- relative_output_paths, FLAGS.output_dir)
-
- logging.info('Converted %d files out of %d',
- len(objs) - relative_output_paths.count(None), len(objs))
-
-
-if __name__ == '__main__':
- multiprocessing.set_start_method('fork')
- app.run(main)
+ if len(argv) > 1:
+ raise app.UsageError("Too many command-line arguments.")
+
+ objs = []
+ if FLAGS.input is not None and FLAGS.thinlto_build == "local":
+ raise ValueError("--thinlto_build=local cannot be run with --input")
+ if FLAGS.input is None:
+ if FLAGS.thinlto_build != "local":
+ raise ValueError("--input or --thinlto_build=local must be provided")
+ objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir)
+ elif FLAGS.input_type == "json":
+ with open(FLAGS.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_from_compile_commands(
+ json.load(f), FLAGS.output_dir
+ )
+ elif FLAGS.input_type == "params":
+ if not FLAGS.obj_base_dir:
+ logging.info(
+ "-obj_base_dir is unspecified, assuming current directory."
+ "If no objects are found, use this option to specify the root"
+ "directory for the object file paths in the input file."
+ )
+ with open(FLAGS.input, encoding="utf-8") as f:
+ objs = extract_ir_lib.load_from_lld_params(
+ [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir
+ )
+ elif FLAGS.input_type == "directory":
+ logging.warning(
+ "Using the directory input is only recommended if the build system"
+ "your project uses does not support any structured output that"
+ "ml-compiler-opt understands. If your build system provides a"
+ "structured compilation database, use that instead"
+ )
+ objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+ else:
+ logging.error("Unknown input type: %s", FLAGS.input_type)
+
+ relative_output_paths = extract_ir_lib.run_extraction(
+ objs,
+ FLAGS.num_workers,
+ FLAGS.llvm_objcopy_path,
+ FLAGS.cmd_filter,
+ FLAGS.thinlto_build,
+ FLAGS.cmd_section_name,
+ FLAGS.bitcode_section_name,
+ )
+
+ extract_ir_lib.write_corpus_manifest(
+ FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir
+ )
+
+ logging.info(
+ "Converted %d files out of %d",
+ len(objs) - relative_output_paths.count(None),
+ len(objs),
+ )
+
+
+if __name__ == "__main__":
+ multiprocessing.set_start_method("fork")
+ app.run(main)
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py
index 83d2b26d1f71ce..ce6a4a17a8e6ac 100644
--- a/llvm/py/src/mlgo/extract_ir_lib.py
+++ b/llvm/py/src/mlgo/extract_ir_lib.py
@@ -22,341 +22,374 @@
# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a
# \0 - separated list of strings, to a \n one.
def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
- """Determine if the module should be included."""
- if match_regexp is None:
- return True
- lines = cmdline.split('\0')
- return any(len(re.findall(match_regexp, l)) for l in lines)
+ """Determine if the module should be included."""
+ if match_regexp is None:
+ return True
+ lines = cmdline.split("\0")
+ return any(len(re.findall(match_regexp, l)) for l in lines)
def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
- opts = cmdline.split('\0')
- for option in opts:
- if option.startswith('-fthinlto-index'):
- return os.path.join(basedir, option.split('=')[1])
- return None
+ opts = cmdline.split("\0")
+ for option in opts:
+ if option.startswith("-fthinlto-index"):
+ return os.path.join(basedir, option.split("=")[1])
+ return None
class TrainingIRExtractor:
- """IR and command line extraction from an object file."""
-
- def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
- """Set up a TrainingIRExtractor.
-
- Args:
- obj_relative_path: relative path to the input object file. It will be also
- used to construct the absolute path of the output IR and cmd files, by
- appending it to output_base_dir.
- output_base_dir: the directory under which the output will be produced.
- obj_base_dir: the base directory for all the input object files.
- """
- self._obj_relative_path = obj_relative_path
- self._output_base_dir = output_base_dir
- self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''
-
- def obj_base_dir(self):
- return self._obj_base_dir
+ """IR and command line extraction from an object file."""
+
+ def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
+ """Set up a TrainingIRExtractor.
+
+ Args:
+ obj_relative_path: relative path to the input object file. It will be also
+ used to construct the absolute path of the output IR and cmd files, by
+ appending it to output_base_dir.
+ output_base_dir: the directory under which the output will be produced.
+ obj_base_dir: the base directory for all the input object files.
+ """
+ self._obj_relative_path = obj_relative_path
+ self._output_base_dir = output_base_dir
+ self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ""
+
+ def obj_base_dir(self):
+ return self._obj_base_dir
+
+ def output_base_dir(self):
+ return self._output_base_dir
+
+ def relative_output_path(self):
+ return self._obj_relative_path
+
+ def input_obj(self):
+ return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+
+ def lld_src_bc(self):
+ # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+ # IR bitcode saved by lld. It is hardcoded into lld.
+ return os.path.join(
+ self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
+ )
+
+ def lld_src_thinlto(self):
+ return os.path.join(self._obj_base_dir, self._obj_relative_path + ".thinlto.bc")
+
+ def dest_dir(self):
+ return os.path.join(
+ self.output_base_dir(), os.path.dirname(self._obj_relative_path)
+ )
+
+ def module_name(self):
+ return os.path.basename(self._obj_relative_path)
+
+ def cmd_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + ".cmd")
+
+ def bc_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + ".bc")
+
+ def thinlto_index_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")
+
+ def _get_extraction_cmd_command(
+ self, llvm_objcopy_path: str, cmd_section_name: str
+ ):
+ """Get llvm-objcopy and process args to produce a command string that,
+ when invoked, will extract the cmd section into the self.cmd_file() file.
+ """
+ return [
+ llvm_objcopy_path,
+ "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
+ self.input_obj(),
+ "/dev/null",
+ ]
+
+ def _get_extraction_bc_command(
+ self, llvm_objcopy_path: str, bitcode_section_name: str
+ ):
+ """Gets llvm-objcopy and process args to produce a command string that,
+ when invoked, will extract the bitcode section into the self.bc_file()
+ file.
+ """
+ return [
+ llvm_objcopy_path,
+ "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
+ self.input_obj(),
+ "/dev/null",
+ ]
+
+ def _extract_clang_artifacts(
+ self,
+ llvm_objcopy_path: str,
+ cmd_filter: str,
+ is_thinlto: bool,
+ cmd_section_name: str,
+ bitcode_section_name: str,
+ ) -> Optional[str]:
+ """Run llvm-objcopy to extract the .bc and command line."""
+ if not os.path.exists(self.input_obj()):
+ logging.info("%s does not exist.", self.input_obj())
+ return None
+ os.makedirs(self.dest_dir(), exist_ok=True)
+ try:
+ subprocess.check_output(
+ self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+ stderr=subprocess.STDOUT,
+ encoding="utf-8",
+ )
+ if cmd_filter is not None or is_thinlto:
+ with open(self.cmd_file(), encoding="utf-8") as f:
+ lines = f.readlines()
+ assert len(lines) == 1
+ cmdline = lines[0]
+ if not should_include_module(cmdline, cmd_filter):
+ logging.info(
+ "Excluding module %s because it does not match the filter",
+ self.input_obj(),
+ )
+ os.remove(self.cmd_file())
+ return None
+ if is_thinlto:
+ index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+ shutil.copy(index_file, self.thinlto_index_file())
+
+ subprocess.check_output(
+ self._get_extraction_bc_command(
+ llvm_objcopy_path, bitcode_section_name
+ ),
+ stderr=subprocess.STDOUT,
+ encoding="utf-8",
+ )
+ except subprocess.CalledProcessError as e:
+ # This may happen if .o file was built from asm (.S source).
+ logging.warning("%s was not processed: %s", self.input_obj(), e)
+ logging.info(e.output)
+ return None
+ assert (
+ os.path.exists(self.cmd_file())
+ and os.path.exists(self.bc_file())
+ and (not is_thinlto or os.path.exists(self.thinlto_index_file()))
+ )
+ return self.relative_output_path()
+
+ def _extract_lld_artifacts(self) -> Optional[str]:
+ """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation."""
+ if not os.path.exists(self.lld_src_bc()):
+ logging.info("%s does not exist.", self.lld_src_bc())
+ return None
+ if not os.path.exists(self.lld_src_thinlto()):
+ logging.info("%s does not exist.", self.lld_src_thinlto())
+ return None
+ os.makedirs(self.dest_dir(), exist_ok=True)
+
+ # Copy over the files
+ shutil.copy(self.lld_src_bc(), self.bc_file())
+ shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
+
+ assert os.path.exists(self.bc_file())
+ assert os.path.exists(self.thinlto_index_file())
+ return self._obj_relative_path
+
+ def extract(
+ self,
+ llvm_objcopy_path: Optional[str] = None,
+ cmd_filter: Optional[str] = None,
+ thinlto_build: Optional[str] = None,
+ cmd_section_name: Optional[str] = ".llvmcmd",
+ bitcode_section_name: Optional[str] = ".llvmbc",
+ ) -> Optional[str]:
+ if thinlto_build == "local":
+ return self._extract_lld_artifacts()
+ return self._extract_clang_artifacts(
+ llvm_objcopy_path=llvm_objcopy_path,
+ cmd_filter=cmd_filter,
+ is_thinlto=thinlto_build == "distributed",
+ cmd_section_name=cmd_section_name,
+ bitcode_section_name=bitcode_section_name,
+ )
- def output_base_dir(self):
- return self._output_base_dir
- def relative_output_path(self):
- return self._obj_relative_path
+def convert_compile_command_to_objectfile(
+ command: Dict[str, str], output_dir: str
+) -> Optional[TrainingIRExtractor]:
+ obj_base_dir = command["directory"]
+ if "arguments" in command:
+ cmd_parts = command["arguments"]
+ elif "command" in command:
+ cmd_parts = command["command"].split()
+ else:
+ logging.info("compile_commands element has no command and arguments")
+ return None
- def input_obj(self):
- return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+ try:
+ obj_index = cmd_parts.index("-o") + 1
+ except ValueError:
+ # This could happen if there are non-clang commands in compile_commands.json
+ logging.info("Command has no -o option: %s", " ".join(cmd_parts))
+ return None
+ obj_rel_path = cmd_parts[obj_index]
+ # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
+ return TrainingIRExtractor(
+ obj_relative_path=obj_rel_path,
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir,
+ )
- def lld_src_bc(self):
- # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
- # IR bitcode saved by lld. It is hardcoded into lld.
- return os.path.join(self._obj_base_dir,
- self._obj_relative_path + '.3.import.bc')
- def lld_src_thinlto(self):
- return os.path.join(self._obj_base_dir,
- self._obj_relative_path + '.thinlto.bc')
+def load_from_compile_commands(
+ json_array: List[Dict[str, str]], output_dir: str
+) -> List[TrainingIRExtractor]:
+ objs = [
+ convert_compile_command_to_objectfile(cmd, output_dir) for cmd in json_array
+ ]
+ # Filter out None, in case there were non-clang commands in the .json
+ return [obj for obj in objs if obj is not None]
- def dest_dir(self):
- return os.path.join(self.output_base_dir(),
- os.path.dirname(self._obj_relative_path))
- def module_name(self):
- return os.path.basename(self._obj_relative_path)
+def load_from_lld_params(
+ params_array: List[str], obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+ """Create an ObjectFile array based on lld's parameters."""
+ # yank out -o and the output. After that, anything not starting with '-', and
+ # ending in a '.o', is an object file.
+ try:
+ minus_o_idx = params_array.index("-o")
+ del params_array[minus_o_idx : minus_o_idx + 2]
+ just_obj_paths = [
+ o for o in params_array if not o.startswith("-") and o.endswith(".o")
+ ]
+ except ValueError:
+ logging.info("This params file does not have an explicit -o option.")
+ just_obj_paths = params_array
+
+ def make_obj(obj_file: str) -> TrainingIRExtractor:
+ return TrainingIRExtractor(
+ obj_relative_path=obj_file,
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir,
+ )
+
+ return [make_obj(obj_file) for obj_file in just_obj_paths]
+
+
+def load_from_directory(
+ obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+ """Create an object file array by globbing an entire directory.
- def cmd_file(self):
- return os.path.join(self.dest_dir(), self.module_name() + '.cmd')
+ Args:
+ obj_base_dir: The base build directory that all object files will be
+ written out as being relative to.
+ output_dir: The output directory where extracted .bc and .cmd files should
+ be placed.
+ """
+ paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.o")]
- def bc_file(self):
- return os.path.join(self.dest_dir(), self.module_name() + '.bc')
+ def make_spec(obj_file: str):
+ return TrainingIRExtractor(
+ obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir),
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir,
+ )
- def thinlto_index_file(self):
- return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc')
+ return [make_spec(path) for path in paths]
- def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
- cmd_section_name: str):
- """Get llvm-objcopy and process args to a produce a command string that,
- when invoked, will extract the cmd section info ths self.cmd_file() file.
- """
- return [
- llvm_objcopy_path,
- '--dump-section=' + cmd_section_name + '=' + self.cmd_file(),
- self.input_obj(), '/dev/null'
- ]
- def _get_extraction_bc_command(self, llvm_objcopy_path: str,
- bitcode_section_name: str):
- """Gets llvm-objcopy and process args to produce a command string that,
- when invoked, will extract the bitcode section into the self.bc_file()
- file.
- """
- return [
- llvm_objcopy_path,
- '--dump-section=' + bitcode_section_name + '=' + self.bc_file(),
- self.input_obj(), '/dev/null'
- ]
+def load_for_lld_thinlto(
+ obj_base_dir: str, output_dir: str
+) -> List[TrainingIRExtractor]:
+ # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+ # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
+ # are also emitted next to the postimport bitcode, with the suffix
+ # .thinlto.bc instead
+ paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.3.import.bc")]
+
+ def make_spec(obj_file: str):
+ return TrainingIRExtractor(
+ # Cut away .3.import.bc
+ obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir,
+ )
+
+ return [make_spec(path) for path in paths]
+
+
+def run_extraction(
+ objs: List[TrainingIRExtractor],
+ num_workers: int,
+ llvm_objcopy_path: str,
+ cmd_filter: str,
+ thinlto_build: str,
+ cmd_section_name: str,
+ bitcode_section_name: str,
+):
+ """Extracts all specified object files into the corpus directory.
- def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str,
- is_thinlto: bool, cmd_section_name: str,
- bitcode_section_name: str) -> Optional[str]:
- """Run llvm-objcopy to extract the .bc and command line."""
- if not os.path.exists(self.input_obj()):
- logging.info('%s does not exist.', self.input_obj())
- return None
- os.makedirs(self.dest_dir(), exist_ok=True)
- try:
- subprocess.check_output(
- self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
- stderr=subprocess.STDOUT,
- encoding='utf-8')
- if cmd_filter is not None or is_thinlto:
- with open(self.cmd_file(), encoding='utf-8') as f:
- lines = f.readlines()
- assert len(lines) == 1
- cmdline = lines[0]
- if not should_include_module(cmdline, cmd_filter):
- logging.info(
- 'Excluding module %s because it does not match the filter',
- self.input_obj())
- os.remove(self.cmd_file())
- return None
- if is_thinlto:
- index_file = get_thinlto_index(cmdline, self.obj_base_dir())
- shutil.copy(index_file, self.thinlto_index_file())
-
- subprocess.check_output(
- self._get_extraction_bc_command(llvm_objcopy_path,
- bitcode_section_name),
- stderr=subprocess.STDOUT,
- encoding='utf-8')
- except subprocess.CalledProcessError as e:
- # This may happen if .o file was build from asm (.S source).
- logging.warning('%s was not processed: %s', self.input_obj(), e)
- logging.info(e.output)
- return None
- assert (os.path.exists(self.cmd_file()) and
- os.path.exists(self.bc_file()) and
- (not is_thinlto or os.path.exists(self.thinlto_index_file())))
- return self.relative_output_path()
-
- def _extract_lld_artifacts(self) -> Optional[str]:
- """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
+ Args:
+ objs: A list of TrainingIRExtractor Objects that represent the object files
+ to extract bitcode/commands from.
+ num_workers: The number of parallel processes to spawn to run the
+ extraction.
+ llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
+ cmd_filter: A regular expression that is used to select for compilations
+ performed with specific flags. If you want to include all compilations,
+ set this to None.
+ thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
+ Set this to None if the build was not done with ThinLTO.
+ cmd_section_name: The name of the command line section created by the
+ bitcode embedding.
+ bitcode_section_name: The name of the bitcode section created by the
+ bitcode embedding.
"""
- if not os.path.exists(self.lld_src_bc()):
- logging.info('%s does not exist.', self.lld_src_bc())
- return None
- if not os.path.exists(self.lld_src_thinlto()):
- logging.info('%s does not exist.', self.lld_src_thinlto())
- return None
- os.makedirs(self.dest_dir(), exist_ok=True)
-
- # Copy over the files
- shutil.copy(self.lld_src_bc(), self.bc_file())
- shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
-
- assert os.path.exists(self.bc_file())
- assert os.path.exists(self.thinlto_index_file())
- return self._obj_relative_path
-
- def extract(self,
- llvm_objcopy_path: Optional[str] = None,
- cmd_filter: Optional[str] = None,
- thinlto_build: Optional[str] = None,
- cmd_section_name: Optional[str] = '.llvmcmd',
- bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]:
- if thinlto_build == 'local':
- return self._extract_lld_artifacts()
- return self._extract_clang_artifacts(
+ extract_artifacts = functools.partial(
+ TrainingIRExtractor.extract,
llvm_objcopy_path=llvm_objcopy_path,
cmd_filter=cmd_filter,
- is_thinlto=thinlto_build == 'distributed',
+ thinlto_build=thinlto_build,
cmd_section_name=cmd_section_name,
- bitcode_section_name=bitcode_section_name)
+ bitcode_section_name=bitcode_section_name,
+ )
+ with multiprocessing.Pool(num_workers) as pool:
+ relative_output_paths = pool.map(extract_artifacts, objs)
+ pool.close()
+ pool.join()
+ return relative_output_paths
-def convert_compile_command_to_objectfile(
- command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]:
- obj_base_dir = command['directory']
- if 'arguments' in command:
- cmd_parts = command['arguments']
- elif 'command' in command:
- cmd_parts = command['command'].split()
- else:
- logging.info('compile_commands element has no command and arguments')
- return None
-
- try:
- obj_index = cmd_parts.index('-o') + 1
- except ValueError:
- # This could happen if there are non-clang commands in compile_commands.json
- logging.info('Command has no -o option: %s', ' '.join(cmd_parts))
- return None
- obj_rel_path = cmd_parts[obj_index]
- # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
- return TrainingIRExtractor(
- obj_relative_path=obj_rel_path,
- output_base_dir=output_dir,
- obj_base_dir=obj_base_dir)
-
-
-def load_from_compile_commands(json_array: List[Dict[str, str]],
- output_dir: str) -> List[TrainingIRExtractor]:
- objs = [
- convert_compile_command_to_objectfile(cmd, output_dir)
- for cmd in json_array
- ]
- # Filter out None, in case there were non-clang commands in the .json
- return [obj for obj in objs if obj is not None]
-
-
-def load_from_lld_params(params_array: List[str], obj_base_dir: str,
- output_dir: str) -> List[TrainingIRExtractor]:
- """Create an ObjectFile array based on lld's parameters."""
- # yank out -o and the output. After that, anything not starting with '-', and
- # ending in a '.o', is an object file.
- try:
- minus_o_idx = params_array.index('-o')
- del params_array[minus_o_idx:minus_o_idx + 2]
- just_obj_paths = [
- o for o in params_array if not o.startswith('-') and o.endswith('.o')
- ]
- except ValueError:
- logging.info('This params file does not have an explicit -o option.')
- just_obj_paths = params_array
-
- def make_obj(obj_file: str) -> TrainingIRExtractor:
- return TrainingIRExtractor(
- obj_relative_path=obj_file,
- output_base_dir=output_dir,
- obj_base_dir=obj_base_dir)
-
- return [make_obj(obj_file) for obj_file in just_obj_paths]
-
-
-def load_from_directory(obj_base_dir: str,
- output_dir: str) -> List[TrainingIRExtractor]:
- """Create an object file array by globbing an entire drectory.
- Args:
- obj_base_dir: The base build directory that all object files will be
- written out as being relative to.
- output_dir: The output directory where extracted .bc and .cmd files should
- be placed.
- """
- paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.o')]
+def write_corpus_manifest(
+ thinlto_build: str, relative_output_paths: List[str], output_dir: str
+):
+ """Writes a corpus_manifest.json containing all necessary information about
+ the corpus.
- def make_spec(obj_file: str):
- return TrainingIRExtractor(
- obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir),
- output_base_dir=output_dir,
- obj_base_dir=obj_base_dir)
-
- return [make_spec(path) for path in paths]
-
-
-def load_for_lld_thinlto(obj_base_dir: str,
- output_dir: str) -> List[TrainingIRExtractor]:
- # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
- # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
- # are also emitted next to the postimport bitcode, with the suffix
- # .thinlto.bc instead
- paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc')]
-
- def make_spec(obj_file: str):
- return TrainingIRExtractor(
- # Cut away .3.import.bc
- obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
- output_base_dir=output_dir,
- obj_base_dir=obj_base_dir)
-
- return [make_spec(path) for path in paths]
-
-
-def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
- llvm_objcopy_path: str, cmd_filter: str, thinlto_build: str,
- cmd_section_name: str, bitcode_section_name: str):
- """Extracts all specified object files into the corpus directory.
-
- Args:
- objs: A list of TrainingIRExtractor Objects that represent the object files
- to extract bitcode/commands from.
- num_workers: The number of parallel processes to spawn to run the
- extraction.
- llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
- cmd_filter: A regular expression that is used to select for compilations
- performed with specific flags. If you want to include all compilations,
- set this to None.
- thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
- Set this to None if the build was not done with ThinLTO.
- cmd_section_name: The name of the command line section created by the
- bitcode embedding.
- bitcode_section_name: The name of the bitcode section created by the
- bitcode embedding.
- """
- extract_artifacts = functools.partial(
- TrainingIRExtractor.extract,
- llvm_objcopy_path=llvm_objcopy_path,
- cmd_filter=cmd_filter,
- thinlto_build=thinlto_build,
- cmd_section_name=cmd_section_name,
- bitcode_section_name=bitcode_section_name)
-
- with multiprocessing.Pool(num_workers) as pool:
- relative_output_paths = pool.map(extract_artifacts, objs)
- pool.close()
- pool.join()
- return relative_output_paths
-
-
-def write_corpus_manifest(thinlto_build: str, relative_output_paths: List[str],
- output_dir: str):
- """Writes a corpus_manifest.json containing all necessary information about
- the corpus.
-
- Args:
- thinlto_build: Whether or not the build was done with ThinLTO and if so,
- what kind of ThinLTO. Set this to none if the build was not performed with
- ThinLTO.
- relative_output_paths: The relative (to the corpus directory) output paths
- of all the bitcode files that should be placed in the corpus manifest
- output_dir: The corpus directory where the corpus manifest should be
- placed.
- """
- # This comes first rather than later so global_command_override is at the top
- # of the .json after being written
- if thinlto_build == 'local':
- corpus_description = {
- 'global_command_override': constant.UNSPECIFIED_OVERRIDE
- }
- else:
- corpus_description = {}
-
- corpus_description.update({
- 'has_thinlto': thinlto_build is not None,
- 'modules': [path for path in relative_output_paths if path is not None]
- })
-
- with open(
- os.path.join(output_dir, 'corpus_description.json'),
- 'w',
- encoding='utf-8') as f:
- json.dump(corpus_description, f, indent=2)
+ Args:
+ thinlto_build: Whether or not the build was done with ThinLTO and if so,
+ what kind of ThinLTO. Set this to none if the build was not performed with
+ ThinLTO.
+ relative_output_paths: The relative (to the corpus directory) output paths
+ of all the bitcode files that should be placed in the corpus manifest
+ output_dir: The corpus directory where the corpus manifest should be
+ placed.
+ """
+ # This comes first rather than later so global_command_override is at the top
+ # of the .json after being written
+ if thinlto_build == "local":
+ corpus_description = {"global_command_override": constant.UNSPECIFIED_OVERRIDE}
+ else:
+ corpus_description = {}
+
+ corpus_description.update(
+ {
+ "has_thinlto": thinlto_build is not None,
+ "modules": [path for path in relative_output_paths if path is not None],
+ }
+ )
+
+ with open(
+ os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8"
+ ) as f:
+ json.dump(corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py
index d7de50530032cc..ae9b3b30f9a5c9 100644
--- a/llvm/py/src/mlgo/extract_ir_test.py
+++ b/llvm/py/src/mlgo/extract_ir_test.py
@@ -12,209 +12,246 @@
class ExtractIrTest(absltest.TestCase):
+ def test_one_conversion(self):
+ obj = extract_ir_lib.convert_compile_command_to_objectfile(
+ {
+ "directory": "/output/directory",
+ "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
+ "file": "/some/path/lib/foo/bar.cc",
+ },
+ "/corpus/destination/path",
+ )
+ self.assertIsNotNone(obj)
+ # pytype: disable=attribute-error
+ # Pytype complains about obj being None
+ self.assertEqual(obj.input_obj(), "/output/directory/lib/bar.o")
+ self.assertEqual(obj.relative_output_path(), "lib/bar.o")
+ self.assertEqual(obj.cmd_file(), "/corpus/destination/path/lib/bar.o.cmd")
+ self.assertEqual(obj.bc_file(), "/corpus/destination/path/lib/bar.o.bc")
+ self.assertEqual(
+ obj.thinlto_index_file(), "/corpus/destination/path/lib/bar.o.thinlto.bc"
+ )
+ # pytype: enable=attribute-error
- def test_one_conversion(self):
- obj = extract_ir_lib.convert_compile_command_to_objectfile(
- {
- 'directory': '/output/directory',
- 'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
- 'file': '/some/path/lib/foo/bar.cc'
- }, '/corpus/destination/path')
- self.assertIsNotNone(obj)
- # pytype: disable=attribute-error
- # Pytype complains about obj being None
- self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
- self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
- self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
- self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
- self.assertEqual(obj.thinlto_index_file(),
- '/corpus/destination/path/lib/bar.o.thinlto.bc')
- # pytype: enable=attribute-error
-
- def test_one_conversion_arguments_style(self):
- obj = extract_ir_lib.convert_compile_command_to_objectfile(
- {
- 'directory': '/output/directory',
- 'arguments':
- ['-cc1', '-c', '/some/path/lib/foo/bar.cc', '-o', 'lib/bar.o'],
- 'file': '/some/path/lib/foo/bar.cc'
- }, '/corpus/destination/path')
- self.assertIsNotNone(obj)
- # pytype: disable=attribute-error
- # Pytype complains about obj being None
- self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
- self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
- self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
- self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
- self.assertEqual(obj.thinlto_index_file(),
- '/corpus/destination/path/lib/bar.o.thinlto.bc')
- # pytype: enable=attribute-error
-
- def test_arr_conversion(self):
- res = extract_ir_lib.load_from_compile_commands([{
- 'directory': '/output/directory',
- 'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
- 'file': '/some/path/lib/foo/bar.cc'
- }, {
- 'directory': '/output/directory',
- 'command': '-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o',
- 'file': '/some/path/lib/foo/baz.cc'
- }], '/corpus/destination/path')
- res = list(res)
- self.assertLen(res, 2)
- self.assertEqual(res[0].input_obj(), '/output/directory/lib/bar.o')
- self.assertEqual(res[0].relative_output_path(), 'lib/bar.o')
- self.assertEqual(res[0].cmd_file(),
- '/corpus/destination/path/lib/bar.o.cmd')
- self.assertEqual(res[0].bc_file(), '/corpus/destination/path/lib/bar.o.bc')
- self.assertEqual(res[0].thinlto_index_file(),
- '/corpus/destination/path/lib/bar.o.thinlto.bc')
-
- self.assertEqual(res[1].input_obj(), '/output/directory/lib/other/baz.o')
- self.assertEqual(res[1].relative_output_path(), 'lib/other/baz.o')
- self.assertEqual(res[1].cmd_file(),
- '/corpus/destination/path/lib/other/baz.o.cmd')
- self.assertEqual(res[1].bc_file(),
- '/corpus/destination/path/lib/other/baz.o.bc')
- self.assertEqual(res[1].thinlto_index_file(),
- '/corpus/destination/path/lib/other/baz.o.thinlto.bc')
-
- def test_command_extraction(self):
- obj = extract_ir_lib.TrainingIRExtractor(
- obj_relative_path='lib/obj_file.o',
- output_base_dir='/where/corpus/goes',
- obj_base_dir='/foo/bar')
- self.assertEqual(
- obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
- '/bin/llvm_objcopy_path',
- '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
- '/foo/bar/lib/obj_file.o', '/dev/null'
- ])
- self.assertEqual(
- obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
- '/bin/llvm_objcopy_path',
- '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
- '/foo/bar/lib/obj_file.o', '/dev/null'
- ])
-
- def test_command_extraction_no_basedir(self):
- obj = extract_ir_lib.TrainingIRExtractor('lib/obj_file.o',
- '/where/corpus/goes')
- self.assertEqual(
- obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
- '/bin/llvm_objcopy_path',
- '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
- 'lib/obj_file.o', '/dev/null'
- ])
- self.assertEqual(
- obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
- '/bin/llvm_objcopy_path',
- '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
- 'lib/obj_file.o', '/dev/null'
- ])
-
- def test_lld_params(self):
- lld_opts = [
- '-o', 'output/dir/exe', 'lib/obj1.o', 'somelib.a', '-W,blah',
- 'lib/dir/obj2.o'
- ]
- obj = extract_ir_lib.load_from_lld_params(lld_opts, '/some/path',
- '/tmp/out')
- self.assertLen(obj, 2)
- self.assertEqual(obj[0].input_obj(), '/some/path/lib/obj1.o')
- self.assertEqual(obj[0].relative_output_path(), 'lib/obj1.o')
- self.assertEqual(obj[0].cmd_file(), '/tmp/out/lib/obj1.o.cmd')
- self.assertEqual(obj[0].thinlto_index_file(),
- '/tmp/out/lib/obj1.o.thinlto.bc')
- self.assertEqual(obj[1].input_obj(), '/some/path/lib/dir/obj2.o')
-
- def test_load_from_directory(self):
- tempdir = self.create_tempdir()
- subdir = tempdir.mkdir(dir_path='subdir')
- subdir.create_file(file_path='test1.o')
- subdir.create_file(file_path='test2.o')
- outdir = self.create_tempdir()
- objs = extract_ir_lib.load_from_directory(tempdir.full_path,
- outdir.full_path)
- self.assertLen(objs, 2)
- for index, obj in enumerate(
- sorted(objs, key=lambda x: x._obj_relative_path)):
- self.assertEqual(obj._obj_relative_path, f'subdir/test{index + 1:d}.o')
- self.assertEqual(obj._obj_base_dir, tempdir.full_path)
- self.assertEqual(obj._output_base_dir, outdir.full_path)
-
- def test_lld_thinlto_discovery(self):
- tempdir = self.create_tempdir()
- tempdir.create_file(file_path='1.3.import.bc')
- tempdir.create_file(file_path='2.3.import.bc')
- tempdir.create_file(file_path='3.3.import.bc')
- tempdir.create_file(file_path='1.thinlto.bc')
- tempdir.create_file(file_path='2.thinlto.bc')
- tempdir.create_file(file_path='3.thinlto.bc')
- outdir = self.create_tempdir()
- obj = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path,
- outdir.full_path)
- self.assertLen(obj, 3)
- for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
- self.assertEqual(o._obj_relative_path, f'{i + 1:d}')
- self.assertEqual(o._obj_base_dir, tempdir.full_path)
- self.assertEqual(o._output_base_dir, outdir.full_path)
-
- def test_lld_thinlto_discovery_nested(self):
- outer = self.create_tempdir()
- tempdir = outer.mkdir(dir_path='nest')
- tempdir.create_file(file_path='1.3.import.bc')
- tempdir.create_file(file_path='2.3.import.bc')
- tempdir.create_file(file_path='3.3.import.bc')
- tempdir.create_file(file_path='1.thinlto.bc')
- tempdir.create_file(file_path='2.thinlto.bc')
- tempdir.create_file(file_path='3.thinlto.bc')
- outdir = self.create_tempdir()
- obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
- self.assertLen(obj, 3)
- for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
- self.assertEqual(o._obj_relative_path, f'nest/{i + 1:d}')
- self.assertEqual(o._obj_base_dir, outer.full_path)
- self.assertEqual(o._output_base_dir, outdir.full_path)
-
- def test_lld_thinlto_extraction(self):
- outer = self.create_tempdir()
- tempdir = outer.mkdir(dir_path='nest')
- tempdir.create_file(file_path='1.3.import.bc')
- tempdir.create_file(file_path='2.3.import.bc')
- tempdir.create_file(file_path='3.3.import.bc')
- tempdir.create_file(file_path='1.thinlto.bc')
- tempdir.create_file(file_path='2.thinlto.bc')
- tempdir.create_file(file_path='3.thinlto.bc')
- outdir = self.create_tempdir()
- obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
- for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
- mod_path = o.extract(thinlto_build='local')
- self.assertEqual(mod_path, f'nest/{i + 1:d}')
- self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/1.bc')))
- self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/2.bc')))
- self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/3.bc')))
- self.assertTrue(
- os.path.exists(os.path.join(outdir.full_path, 'nest/1.thinlto.bc')))
- self.assertTrue(
- os.path.exists(os.path.join(outdir.full_path, 'nest/2.thinlto.bc')))
- self.assertTrue(
- os.path.exists(os.path.join(outdir.full_path, 'nest/3.thinlto.bc')))
-
- def test_filtering(self):
- cmdline = '-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o'
- self.assertTrue(extract_ir_lib.should_include_module(cmdline, None))
- self.assertTrue(extract_ir_lib.should_include_module(cmdline, '.*'))
- self.assertTrue(extract_ir_lib.should_include_module(cmdline, '^-Oz$'))
- self.assertFalse(extract_ir_lib.should_include_module(cmdline, '^-O3$'))
-
- def test_thinlto_index_extractor(self):
- cmdline = ('-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/'
- 'out.o\0-fthinlto-index=foo/bar.thinlto.bc')
- self.assertEqual(
- extract_ir_lib.get_thinlto_index(cmdline, '/the/base/dir'),
- '/the/base/dir/foo/bar.thinlto.bc')
-
-
-if __name__ == '__main__':
- absltest.main()
+ def test_one_conversion_arguments_style(self):
+ obj = extract_ir_lib.convert_compile_command_to_objectfile(
+ {
+ "directory": "/output/directory",
+ "arguments": [
+ "-cc1",
+ "-c",
+ "/some/path/lib/foo/bar.cc",
+ "-o",
+ "lib/bar.o",
+ ],
+ "file": "/some/path/lib/foo/bar.cc",
+ },
+ "/corpus/destination/path",
+ )
+ self.assertIsNotNone(obj)
+ # pytype: disable=attribute-error
+ # Pytype complains about obj being None
+ self.assertEqual(obj.input_obj(), "/output/directory/lib/bar.o")
+ self.assertEqual(obj.relative_output_path(), "lib/bar.o")
+ self.assertEqual(obj.cmd_file(), "/corpus/destination/path/lib/bar.o.cmd")
+ self.assertEqual(obj.bc_file(), "/corpus/destination/path/lib/bar.o.bc")
+ self.assertEqual(
+ obj.thinlto_index_file(), "/corpus/destination/path/lib/bar.o.thinlto.bc"
+ )
+ # pytype: enable=attribute-error
+
+ def test_arr_conversion(self):
+ res = extract_ir_lib.load_from_compile_commands(
+ [
+ {
+ "directory": "/output/directory",
+ "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
+ "file": "/some/path/lib/foo/bar.cc",
+ },
+ {
+ "directory": "/output/directory",
+ "command": "-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o",
+ "file": "/some/path/lib/foo/baz.cc",
+ },
+ ],
+ "/corpus/destination/path",
+ )
+ res = list(res)
+ self.assertLen(res, 2)
+ self.assertEqual(res[0].input_obj(), "/output/directory/lib/bar.o")
+ self.assertEqual(res[0].relative_output_path(), "lib/bar.o")
+ self.assertEqual(res[0].cmd_file(), "/corpus/destination/path/lib/bar.o.cmd")
+ self.assertEqual(res[0].bc_file(), "/corpus/destination/path/lib/bar.o.bc")
+ self.assertEqual(
+ res[0].thinlto_index_file(), "/corpus/destination/path/lib/bar.o.thinlto.bc"
+ )
+
+ self.assertEqual(res[1].input_obj(), "/output/directory/lib/other/baz.o")
+ self.assertEqual(res[1].relative_output_path(), "lib/other/baz.o")
+ self.assertEqual(
+ res[1].cmd_file(), "/corpus/destination/path/lib/other/baz.o.cmd"
+ )
+ self.assertEqual(
+ res[1].bc_file(), "/corpus/destination/path/lib/other/baz.o.bc"
+ )
+ self.assertEqual(
+ res[1].thinlto_index_file(),
+ "/corpus/destination/path/lib/other/baz.o.thinlto.bc",
+ )
+
+ def test_command_extraction(self):
+ obj = extract_ir_lib.TrainingIRExtractor(
+ obj_relative_path="lib/obj_file.o",
+ output_base_dir="/where/corpus/goes",
+ obj_base_dir="/foo/bar",
+ )
+ self.assertEqual(
+ obj._get_extraction_cmd_command("/bin/llvm_objcopy_path", ".llvmcmd"),
+ [
+ "/bin/llvm_objcopy_path",
+ "--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd",
+ "/foo/bar/lib/obj_file.o",
+ "/dev/null",
+ ],
+ )
+ self.assertEqual(
+ obj._get_extraction_bc_command("/bin/llvm_objcopy_path", ".llvmbc"),
+ [
+ "/bin/llvm_objcopy_path",
+ "--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc",
+ "/foo/bar/lib/obj_file.o",
+ "/dev/null",
+ ],
+ )
+
+ def test_command_extraction_no_basedir(self):
+ obj = extract_ir_lib.TrainingIRExtractor("lib/obj_file.o", "/where/corpus/goes")
+ self.assertEqual(
+ obj._get_extraction_cmd_command("/bin/llvm_objcopy_path", ".llvmcmd"),
+ [
+ "/bin/llvm_objcopy_path",
+ "--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd",
+ "lib/obj_file.o",
+ "/dev/null",
+ ],
+ )
+ self.assertEqual(
+ obj._get_extraction_bc_command("/bin/llvm_objcopy_path", ".llvmbc"),
+ [
+ "/bin/llvm_objcopy_path",
+ "--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc",
+ "lib/obj_file.o",
+ "/dev/null",
+ ],
+ )
+
+ def test_lld_params(self):
+ lld_opts = [
+ "-o",
+ "output/dir/exe",
+ "lib/obj1.o",
+ "somelib.a",
+ "-W,blah",
+ "lib/dir/obj2.o",
+ ]
+ obj = extract_ir_lib.load_from_lld_params(lld_opts, "/some/path", "/tmp/out")
+ self.assertLen(obj, 2)
+ self.assertEqual(obj[0].input_obj(), "/some/path/lib/obj1.o")
+ self.assertEqual(obj[0].relative_output_path(), "lib/obj1.o")
+ self.assertEqual(obj[0].cmd_file(), "/tmp/out/lib/obj1.o.cmd")
+ self.assertEqual(obj[0].thinlto_index_file(), "/tmp/out/lib/obj1.o.thinlto.bc")
+ self.assertEqual(obj[1].input_obj(), "/some/path/lib/dir/obj2.o")
+
+ def test_load_from_directory(self):
+ tempdir = self.create_tempdir()
+ subdir = tempdir.mkdir(dir_path="subdir")
+ subdir.create_file(file_path="test1.o")
+ subdir.create_file(file_path="test2.o")
+ outdir = self.create_tempdir()
+ objs = extract_ir_lib.load_from_directory(tempdir.full_path, outdir.full_path)
+ self.assertLen(objs, 2)
+ for index, obj in enumerate(sorted(objs, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(obj._obj_relative_path, f"subdir/test{index + 1:d}.o")
+ self.assertEqual(obj._obj_base_dir, tempdir.full_path)
+ self.assertEqual(obj._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_discovery(self):
+ tempdir = self.create_tempdir()
+ tempdir.create_file(file_path="1.3.import.bc")
+ tempdir.create_file(file_path="2.3.import.bc")
+ tempdir.create_file(file_path="3.3.import.bc")
+ tempdir.create_file(file_path="1.thinlto.bc")
+ tempdir.create_file(file_path="2.thinlto.bc")
+ tempdir.create_file(file_path="3.thinlto.bc")
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path, outdir.full_path)
+ self.assertLen(obj, 3)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(o._obj_relative_path, f"{i + 1:d}")
+ self.assertEqual(o._obj_base_dir, tempdir.full_path)
+ self.assertEqual(o._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_discovery_nested(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path="nest")
+ tempdir.create_file(file_path="1.3.import.bc")
+ tempdir.create_file(file_path="2.3.import.bc")
+ tempdir.create_file(file_path="3.3.import.bc")
+ tempdir.create_file(file_path="1.thinlto.bc")
+ tempdir.create_file(file_path="2.thinlto.bc")
+ tempdir.create_file(file_path="3.thinlto.bc")
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+ self.assertLen(obj, 3)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(o._obj_relative_path, f"nest/{i + 1:d}")
+ self.assertEqual(o._obj_base_dir, outer.full_path)
+ self.assertEqual(o._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_extraction(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path="nest")
+ tempdir.create_file(file_path="1.3.import.bc")
+ tempdir.create_file(file_path="2.3.import.bc")
+ tempdir.create_file(file_path="3.3.import.bc")
+ tempdir.create_file(file_path="1.thinlto.bc")
+ tempdir.create_file(file_path="2.thinlto.bc")
+ tempdir.create_file(file_path="3.thinlto.bc")
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ mod_path = o.extract(thinlto_build="local")
+ self.assertEqual(mod_path, f"nest/{i + 1:d}")
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, "nest/1.bc")))
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, "nest/2.bc")))
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, "nest/3.bc")))
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, "nest/1.thinlto.bc"))
+ )
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, "nest/2.thinlto.bc"))
+ )
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, "nest/3.thinlto.bc"))
+ )
+
+ def test_filtering(self):
+ cmdline = "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o"
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, None))
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, ".*"))
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, "^-Oz$"))
+ self.assertFalse(extract_ir_lib.should_include_module(cmdline, "^-O3$"))
+
+ def test_thinlto_index_extractor(self):
+ cmdline = (
+ "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/"
+ "out.o\0-fthinlto-index=foo/bar.thinlto.bc"
+ )
+ self.assertEqual(
+ extract_ir_lib.get_thinlto_index(cmdline, "/the/base/dir"),
+ "/the/base/dir/foo/bar.thinlto.bc",
+ )
+
+
+if __name__ == "__main__":
+ absltest.main()
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py
index 989d9790b5bcd9..e6ba013019829e 100644
--- a/llvm/py/src/mlgo/make_corpus.py
+++ b/llvm/py/src/mlgo/make_corpus.py
@@ -18,30 +18,33 @@
from compiler_opt.tools import make_corpus_lib
-flags.DEFINE_string('input_dir', None, 'The input directory.')
-flags.DEFINE_string('output_dir', None, 'The output directory.')
+flags.DEFINE_string("input_dir", None, "The input directory.")
+flags.DEFINE_string("output_dir", None, "The output directory.")
flags.DEFINE_string(
- 'default_args', '',
- 'The compiler flags to compile with when using downstream tooling.')
+ "default_args",
+ "",
+ "The compiler flags to compile with when using downstream tooling.",
+)
-flags.mark_flag_as_required('input_dir')
-flags.mark_flag_as_required('output_dir')
+flags.mark_flag_as_required("input_dir")
+flags.mark_flag_as_required("output_dir")
FLAGS = flags.FLAGS
def main(_):
- logging.warning(
- 'Using this tool does not guarantee that the bitcode is taken at '
- 'the correct stage for consumption during model training. Make '
- 'sure to validate assumptions about where the bitcode is coming '
- 'from before using it in production.')
- relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
- make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir,
- FLAGS.output_dir)
- make_corpus_lib.write_corpus_manifest(relative_paths, FLAGS.output_dir,
- FLAGS.default_args.split())
-
-
-if __name__ == '__main__':
- app.run(main)
+ logging.warning(
+ "Using this tool does not guarantee that the bitcode is taken at "
+ "the correct stage for consumption during model training. Make "
+ "sure to validate assumptions about where the bitcode is coming "
+ "from before using it in production."
+ )
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
+ make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir)
+ make_corpus_lib.write_corpus_manifest(
+ relative_paths, FLAGS.output_dir, FLAGS.default_args.split()
+ )
+
+
+if __name__ == "__main__":
+ app.run(main)
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py
index 97db20a9859e17..697c97ebf6ee29 100644
--- a/llvm/py/src/mlgo/make_corpus_lib.py
+++ b/llvm/py/src/mlgo/make_corpus_lib.py
@@ -10,70 +10,68 @@
from typing import List, Optional
-BITCODE_EXTENSION = '.bc'
+BITCODE_EXTENSION = ".bc"
def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
- """Finds bitcode files to extract from a given directory.
+ """Finds bitcode files to extract from a given directory.
- Args:
- bitcode_base_dir: The base directory where the bitcode to be copied
- is from.
- output_dir: The directory to place the bitcode in.
+ Args:
+ bitcode_base_dir: The base directory where the bitcode to be copied
+ is from.
+ output_dir: The directory to place the bitcode in.
- Returns an array of paths representing the relative path to the bitcode
- file from the base direcotry.
- """
- paths = [
- str(p)[:-len(BITCODE_EXTENSION)]
- for p in pathlib.Path(bitcode_base_dir).glob('**/*' + BITCODE_EXTENSION)
- ]
+ Returns an array of paths representing the relative path to the bitcode
+ file from the base direcotry.
+ """
+ paths = [
+ str(p)[: -len(BITCODE_EXTENSION)]
+ for p in pathlib.Path(bitcode_base_dir).glob("**/*" + BITCODE_EXTENSION)
+ ]
- return [
- os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths
- ]
+ return [os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths]
-def copy_bitcode(relative_paths: List[str], bitcode_base_dir: str,
- output_dir: str) -> None:
- """Copies bitcode files from the base directory to the output directory.
+def copy_bitcode(
+ relative_paths: List[str], bitcode_base_dir: str, output_dir: str
+) -> None:
+ """Copies bitcode files from the base directory to the output directory.
- Args:
- relative_paths: An array of relative paths to bitcode files that are copied
- over to the output directory, preserving relative location.
- bitcode_base_dir: The base directory where the bitcode is located.
- output_dir: The output directory to place the bitcode in.
- """
- for relative_path in relative_paths:
- base_path = os.path.join(bitcode_base_dir,
- relative_path + BITCODE_EXTENSION)
- destination_path = os.path.join(output_dir,
- relative_path + BITCODE_EXTENSION)
- os.makedirs(os.path.dirname(destination_path), exist_ok=True)
- shutil.copy(base_path, destination_path)
+ Args:
+ relative_paths: An array of relative paths to bitcode files that are copied
+ over to the output directory, preserving relative location.
+ bitcode_base_dir: The base directory where the bitcode is located.
+ output_dir: The output directory to place the bitcode in.
+ """
+ for relative_path in relative_paths:
+ base_path = os.path.join(bitcode_base_dir, relative_path + BITCODE_EXTENSION)
+ destination_path = os.path.join(output_dir, relative_path + BITCODE_EXTENSION)
+ os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+ shutil.copy(base_path, destination_path)
-def write_corpus_manifest(relative_output_paths: List[str],
- output_dir: str,
- default_args: Optional[List[str]] = None) -> None:
- """Creates a corpus manifest describing the bitcode that has been found.
+def write_corpus_manifest(
+ relative_output_paths: List[str],
+ output_dir: str,
+ default_args: Optional[List[str]] = None,
+) -> None:
+ """Creates a corpus manifest describing the bitcode that has been found.
- Args:
- relative_output_paths: A list of paths to each bitcode file relative to the
- output directory.
- outout_dir: The output directory where the corpus is being created.
- default_args: An array of compiler flags that should be used to compile
- the bitcode when using further downstream tooling."""
- if default_args is None:
- default_args = []
- corpus_description = {
- 'global_command_override': default_args,
- 'has_thinlto': False,
- 'modules': [path for path in relative_output_paths if path is not None]
- }
+ Args:
+ relative_output_paths: A list of paths to each bitcode file relative to the
+ output directory.
+ outout_dir: The output directory where the corpus is being created.
+ default_args: An array of compiler flags that should be used to compile
+ the bitcode when using further downstream tooling."""
+ if default_args is None:
+ default_args = []
+ corpus_description = {
+ "global_command_override": default_args,
+ "has_thinlto": False,
+ "modules": [path for path in relative_output_paths if path is not None],
+ }
- with open(
- os.path.join(output_dir, 'corpus_description.json'),
- 'w',
- encoding='utf-8') as description_file:
- json.dump(corpus_description, description_file, indent=2)
+ with open(
+ os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8"
+ ) as description_file:
+ json.dump(corpus_description, description_file, indent=2)
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py
index fcb861ebb91f32..7b5cc954b6d172 100644
--- a/llvm/py/src/mlgo/make_corpus_test.py
+++ b/llvm/py/src/mlgo/make_corpus_test.py
@@ -12,44 +12,43 @@
class MakeCorpusTest(absltest.TestCase):
-
- def test_load_bitcode_from_directory(self):
- outer = self.create_tempdir()
- tempdir = outer.mkdir(dir_path='nested')
- tempdir.create_file('test1.bc')
- tempdir.create_file('test2.bc')
- relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
- relative_paths = sorted(relative_paths)
- self.assertEqual(relative_paths[0], 'nested/test1')
- self.assertEqual(relative_paths[1], 'nested/test2')
-
- def test_copy_bitcode(self):
- build_dir = self.create_tempdir()
- nested_dir = build_dir.mkdir(dir_path='nested')
- nested_dir.create_file('test1.bc')
- nested_dir.create_file('test2.bc')
- relative_paths = ['nested/test1', 'nested/test2']
- corpus_dir = self.create_tempdir()
- make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
- output_files = sorted(os.listdir(os.path.join(corpus_dir, './nested')))
- self.assertEqual(output_files[0], 'test1.bc')
- self.assertEqual(output_files[1], 'test2.bc')
-
- def test_write_corpus_manifest(self):
- relative_output_paths = ['test/test1', 'test/test2']
- output_dir = self.create_tempdir()
- default_args = ['-O3', '-c']
- make_corpus_lib.write_corpus_manifest(relative_output_paths, output_dir,
- default_args)
- with open(
- os.path.join(output_dir, 'corpus_description.json'),
- encoding='utf-8') as corpus_description_file:
- corpus_description = json.load(corpus_description_file)
- self.assertEqual(corpus_description['global_command_override'],
- default_args)
- self.assertEqual(corpus_description['has_thinlto'], False)
- self.assertEqual(corpus_description['modules'], relative_output_paths)
-
-
-if __name__ == '__main__':
- absltest.main()
+ def test_load_bitcode_from_directory(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path="nested")
+ tempdir.create_file("test1.bc")
+ tempdir.create_file("test2.bc")
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
+ relative_paths = sorted(relative_paths)
+ self.assertEqual(relative_paths[0], "nested/test1")
+ self.assertEqual(relative_paths[1], "nested/test2")
+
+ def test_copy_bitcode(self):
+ build_dir = self.create_tempdir()
+ nested_dir = build_dir.mkdir(dir_path="nested")
+ nested_dir.create_file("test1.bc")
+ nested_dir.create_file("test2.bc")
+ relative_paths = ["nested/test1", "nested/test2"]
+ corpus_dir = self.create_tempdir()
+ make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
+ output_files = sorted(os.listdir(os.path.join(corpus_dir, "./nested")))
+ self.assertEqual(output_files[0], "test1.bc")
+ self.assertEqual(output_files[1], "test2.bc")
+
+ def test_write_corpus_manifest(self):
+ relative_output_paths = ["test/test1", "test/test2"]
+ output_dir = self.create_tempdir()
+ default_args = ["-O3", "-c"]
+ make_corpus_lib.write_corpus_manifest(
+ relative_output_paths, output_dir, default_args
+ )
+ with open(
+ os.path.join(output_dir, "corpus_description.json"), encoding="utf-8"
+ ) as corpus_description_file:
+ corpus_description = json.load(corpus_description_file)
+ self.assertEqual(corpus_description["global_command_override"], default_args)
+ self.assertEqual(corpus_description["has_thinlto"], False)
+ self.assertEqual(corpus_description["modules"], relative_output_paths)
+
+
+if __name__ == "__main__":
+ absltest.main()
>From f7c712cf6710c419ac2f98d2dc846995e10c9df5 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 06:03:56 +0000
Subject: [PATCH 4/7] Restructure upstreaming
---
.../mlgo}/mlgo/combine_training_corpus.py | 2 +-
.../mlgo}/mlgo/combine_training_corpus_lib.py | 9 ++++-----
llvm/{py/src => utils/mlgo}/mlgo/extract_ir.py | 2 +-
.../src => utils/mlgo}/mlgo/extract_ir_lib.py | 5 ++---
llvm/{py/src => utils/mlgo}/mlgo/make_corpus.py | 2 +-
.../src => utils/mlgo}/mlgo/make_corpus_lib.py | 0
llvm/utils/mlgo/pyproject.toml | 10 ++++++++++
llvm/utils/mlgo/tests/__init__.py | 16 ++++++++++++++++
.../mlgo/tests}/combine_training_corpus_test.py | 2 +-
.../mlgo => utils/mlgo/tests}/extract_ir_test.py | 2 +-
.../mlgo/tests}/make_corpus_test.py | 2 +-
11 files changed, 38 insertions(+), 14 deletions(-)
rename llvm/{py/src => utils/mlgo}/mlgo/combine_training_corpus.py (95%)
rename llvm/{py/src => utils/mlgo}/mlgo/combine_training_corpus_lib.py (83%)
rename llvm/{py/src => utils/mlgo}/mlgo/extract_ir.py (99%)
rename llvm/{py/src => utils/mlgo}/mlgo/extract_ir_lib.py (99%)
rename llvm/{py/src => utils/mlgo}/mlgo/make_corpus.py (97%)
rename llvm/{py/src => utils/mlgo}/mlgo/make_corpus_lib.py (100%)
create mode 100644 llvm/utils/mlgo/pyproject.toml
create mode 100644 llvm/utils/mlgo/tests/__init__.py
rename llvm/{py/src/mlgo => utils/mlgo/tests}/combine_training_corpus_test.py (98%)
rename llvm/{py/src/mlgo => utils/mlgo/tests}/extract_ir_test.py (99%)
rename llvm/{py/src/mlgo => utils/mlgo/tests}/make_corpus_test.py (97%)
diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/utils/mlgo/mlgo/combine_training_corpus.py
similarity index 95%
rename from llvm/py/src/mlgo/combine_training_corpus.py
rename to llvm/utils/mlgo/mlgo/combine_training_corpus.py
index c14c9381a18a6b..20684b55332d00 100644
--- a/llvm/py/src/mlgo/combine_training_corpus.py
+++ b/llvm/utils/mlgo/mlgo/combine_training_corpus.py
@@ -26,7 +26,7 @@
from absl import app
from absl import flags
-from compiler_opt.tools import combine_training_corpus_lib
+from mlgo import combine_training_corpus_lib
flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/utils/mlgo/mlgo/combine_training_corpus_lib.py
similarity index 83%
rename from llvm/py/src/mlgo/combine_training_corpus_lib.py
rename to llvm/utils/mlgo/mlgo/combine_training_corpus_lib.py
index 1de182e4cb80dd..e2ae8699ec3180 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_lib.py
+++ b/llvm/utils/mlgo/mlgo/combine_training_corpus_lib.py
@@ -5,11 +5,10 @@
import os
import json
+import glob
from absl import logging
-import tensorflow as tf
-
_FILE_NAME = "corpus_description.json"
@@ -18,10 +17,10 @@ def combine_corpus(root_dir: str) -> None:
output_corpus_description = {}
corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
- for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+ for corpus_description_path in glob.glob(corpus_description_glob):
logging.info("processing %s", corpus_description_path)
- with tf.io.gfile.GFile(corpus_description_path, "r") as f:
+ with open(corpus_description_path, encoding="utf-8") as f:
corpus_description = json.load(f)
sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
module_names.extend(
@@ -35,5 +34,5 @@ def combine_corpus(root_dir: str) -> None:
output_corpus_description["modules"] = module_names
- with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), "w") as f:
+ with open(os.path.join(root_dir, _FILE_NAME), "w") as f:
json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/utils/mlgo/mlgo/extract_ir.py
similarity index 99%
rename from llvm/py/src/mlgo/extract_ir.py
rename to llvm/utils/mlgo/mlgo/extract_ir.py
index 395a298ecec81d..ed580dbeefdc45 100644
--- a/llvm/py/src/mlgo/extract_ir.py
+++ b/llvm/utils/mlgo/mlgo/extract_ir.py
@@ -31,7 +31,7 @@
from absl import flags
from absl import logging
-from compiler_opt.tools import extract_ir_lib
+from mlgo import extract_ir_lib
flags.DEFINE_string(
"input",
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/utils/mlgo/mlgo/extract_ir_lib.py
similarity index 99%
rename from llvm/py/src/mlgo/extract_ir_lib.py
rename to llvm/utils/mlgo/mlgo/extract_ir_lib.py
index ce6a4a17a8e6ac..c662d684f603cd 100644
--- a/llvm/py/src/mlgo/extract_ir_lib.py
+++ b/llvm/utils/mlgo/mlgo/extract_ir_lib.py
@@ -16,8 +16,7 @@
from absl import logging
-from compiler_opt.rl import constant
-
+_UNSPECIFIED_OVERRIDE = ['<UNSPECIFIED>']
# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a
# \0 - separated list of strings, to a \n one.
@@ -378,7 +377,7 @@ def write_corpus_manifest(
# This comes first rather than later so global_command_override is at the top
# of the .json after being written
if thinlto_build == "local":
- corpus_description = {"global_command_override": constant.UNSPECIFIED_OVERRIDE}
+ corpus_description = {"global_command_override": _UNSPECIFIED_OVERRIDE}
else:
corpus_description = {}
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/utils/mlgo/mlgo/make_corpus.py
similarity index 97%
rename from llvm/py/src/mlgo/make_corpus.py
rename to llvm/utils/mlgo/mlgo/make_corpus.py
index e6ba013019829e..7b3d85ff8423b3 100644
--- a/llvm/py/src/mlgo/make_corpus.py
+++ b/llvm/utils/mlgo/mlgo/make_corpus.py
@@ -16,7 +16,7 @@
from absl import flags
from absl import logging
-from compiler_opt.tools import make_corpus_lib
+from mlgo import make_corpus_lib
flags.DEFINE_string("input_dir", None, "The input directory.")
flags.DEFINE_string("output_dir", None, "The output directory.")
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/utils/mlgo/mlgo/make_corpus_lib.py
similarity index 100%
rename from llvm/py/src/mlgo/make_corpus_lib.py
rename to llvm/utils/mlgo/mlgo/make_corpus_lib.py
diff --git a/llvm/utils/mlgo/pyproject.toml b/llvm/utils/mlgo/pyproject.toml
new file mode 100644
index 00000000000000..22d3a560aa3c44
--- /dev/null
+++ b/llvm/utils/mlgo/pyproject.toml
@@ -0,0 +1,10 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "mlgo"
+version = "0.0.1"
+description = "A small example package"
+readme = "README.md"
+requires-python = ">=3.8"
diff --git a/llvm/utils/mlgo/tests/__init__.py b/llvm/utils/mlgo/tests/__init__.py
new file mode 100644
index 00000000000000..4e8e26ce2d7fc2
--- /dev/null
+++ b/llvm/utils/mlgo/tests/__init__.py
@@ -0,0 +1,16 @@
+"""Ensure flags are initialized for e.g. pytest harness case."""
+
+import sys
+
+from absl import flags
+
+# When this module is loaded in an app, flags would have been parsed already
+# (assuming the app's main uses directly or indirectly absl.app.main). However,
+# when loaded in a test harness like pytest or unittest (e.g. python -m pytest)
+# that won't happen.
+# While tests shouldn't use the flags directly, some flags - like compilation
+# timeout - have default values that need to be accessible.
+# This makes sure flags are initialized, for this purpose.
+if not flags.FLAGS.is_parsed():
+ flags.FLAGS(sys.argv, known_only=True)
+assert flags.FLAGS.is_parsed()
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/utils/mlgo/tests/combine_training_corpus_test.py
similarity index 98%
rename from llvm/py/src/mlgo/combine_training_corpus_test.py
rename to llvm/utils/mlgo/tests/combine_training_corpus_test.py
index 969d8472964971..0457ae1823db6a 100644
--- a/llvm/py/src/mlgo/combine_training_corpus_test.py
+++ b/llvm/utils/mlgo/tests/combine_training_corpus_test.py
@@ -8,7 +8,7 @@
from absl.testing import absltest
-from compiler_opt.tools import combine_training_corpus_lib
+from mlgo import combine_training_corpus_lib
class CombineTrainingCorpusTest(absltest.TestCase):
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/utils/mlgo/tests/extract_ir_test.py
similarity index 99%
rename from llvm/py/src/mlgo/extract_ir_test.py
rename to llvm/utils/mlgo/tests/extract_ir_test.py
index ae9b3b30f9a5c9..9eecb33b99b10f 100644
--- a/llvm/py/src/mlgo/extract_ir_test.py
+++ b/llvm/utils/mlgo/tests/extract_ir_test.py
@@ -8,7 +8,7 @@
from absl.testing import absltest
-from compiler_opt.tools import extract_ir_lib
+from mlgo import extract_ir_lib
class ExtractIrTest(absltest.TestCase):
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/utils/mlgo/tests/make_corpus_test.py
similarity index 97%
rename from llvm/py/src/mlgo/make_corpus_test.py
rename to llvm/utils/mlgo/tests/make_corpus_test.py
index 7b5cc954b6d172..6ad09eb74571c9 100644
--- a/llvm/py/src/mlgo/make_corpus_test.py
+++ b/llvm/utils/mlgo/tests/make_corpus_test.py
@@ -8,7 +8,7 @@
from absl.testing import absltest
-from compiler_opt.tools import make_corpus_lib
+from mlgo import make_corpus_lib
class MakeCorpusTest(absltest.TestCase):
>From f99e11f67400bf5b03c289703dad5d140922d400 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 06:04:48 +0000
Subject: [PATCH 5/7] Fix formatting/copyright
---
llvm/utils/mlgo/mlgo/extract_ir_lib.py | 3 ++-
llvm/utils/mlgo/tests/__init__.py | 5 ++++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/llvm/utils/mlgo/mlgo/extract_ir_lib.py b/llvm/utils/mlgo/mlgo/extract_ir_lib.py
index c662d684f603cd..9c828ce1eb631f 100644
--- a/llvm/utils/mlgo/mlgo/extract_ir_lib.py
+++ b/llvm/utils/mlgo/mlgo/extract_ir_lib.py
@@ -16,7 +16,8 @@
from absl import logging
-_UNSPECIFIED_OVERRIDE = ['<UNSPECIFIED>']
+_UNSPECIFIED_OVERRIDE = ["<UNSPECIFIED>"]
+
# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a
# \0 - separated list of strings, to a \n one.
diff --git a/llvm/utils/mlgo/tests/__init__.py b/llvm/utils/mlgo/tests/__init__.py
index 4e8e26ce2d7fc2..9e97ceb6bfef6b 100644
--- a/llvm/utils/mlgo/tests/__init__.py
+++ b/llvm/utils/mlgo/tests/__init__.py
@@ -1,3 +1,6 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""Ensure flags are initialized for e.g. pytest harness case."""
import sys
@@ -12,5 +15,5 @@
# timeout - have default values that need to be accessible.
# This makes sure flags are initialized, for this purpose.
if not flags.FLAGS.is_parsed():
- flags.FLAGS(sys.argv, known_only=True)
+ flags.FLAGS(sys.argv, known_only=True)
assert flags.FLAGS.is_parsed()
>From 14286955cc98ffc7a04ed189a57b1b53df4e96d6 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 06:19:19 +0000
Subject: [PATCH 6/7] Get pyproject working
---
llvm/utils/mlgo/mlgo/__init__.py | 6 ++++++
llvm/utils/mlgo/pyproject.toml | 12 +++++++++---
2 files changed, 15 insertions(+), 3 deletions(-)
create mode 100644 llvm/utils/mlgo/mlgo/__init__.py
diff --git a/llvm/utils/mlgo/mlgo/__init__.py b/llvm/utils/mlgo/mlgo/__init__.py
new file mode 100644
index 00000000000000..bcb5de2ff4d575
--- /dev/null
+++ b/llvm/utils/mlgo/mlgo/__init__.py
@@ -0,0 +1,6 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+__versioninfo__ = (18, 0, 0)
+__version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
diff --git a/llvm/utils/mlgo/pyproject.toml b/llvm/utils/mlgo/pyproject.toml
index 22d3a560aa3c44..6bcbfe64e362e4 100644
--- a/llvm/utils/mlgo/pyproject.toml
+++ b/llvm/utils/mlgo/pyproject.toml
@@ -4,7 +4,13 @@ build-backend = "setuptools.build_meta"
[project]
name = "mlgo"
-version = "0.0.1"
-description = "A small example package"
+description = "Tooling for ML in LLVM"
readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.8,<3.11"
+dependencies = [
+ "absl-py>=1.0.0"
+]
+dynamic = ["version"]
+
+[tool.setuptools.dynamic]
+version = {attr = "mlgo.__version__"}
>From 4b8b452161cf029dc14f6070eb07da51d2eae729 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Mon, 15 Jan 2024 06:23:34 +0000
Subject: [PATCH 7/7] Add README
---
llvm/utils/mlgo/README.md | 12 ++++++++++++
1 file changed, 12 insertions(+)
create mode 100644 llvm/utils/mlgo/README.md
diff --git a/llvm/utils/mlgo/README.md b/llvm/utils/mlgo/README.md
new file mode 100644
index 00000000000000..53e616d8c6640e
--- /dev/null
+++ b/llvm/utils/mlgo/README.md
@@ -0,0 +1,12 @@
+# MLGO Python Library
+
+This folder contains the MLGO Python library. This library consists of tooling
+to help enable ML applications within LLVM, particularly tooling to extract
+corpora that can be used in downstream projects to train ML models and perform
+other tasks that benefit from having a large amount of data.
+
+### Python Versioning
+
+Due to type annotations, the MLGO tooling currently only supports Python
+version 3.8 or newer, deviating from the current LLVM project-wide
+minimum supported version of Python 3.6.
More information about the cfe-commits
mailing list