[clang-tools-extra] [llvm] [MLGO] Upstream the corpus extraction tooling (PR #72319)
Aiden Grossman via cfe-commits
cfe-commits at lists.llvm.org
Sun Jan 14 21:03:16 PST 2024
https://github.com/boomanaiden154 updated https://github.com/llvm/llvm-project/pull/72319
>From c3f723c8a975cc5e075d56350645b0be486f3cda Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Tue, 14 Nov 2023 14:20:24 -0800
Subject: [PATCH] [MLGO] Upstream the corpus extraction tooling
---
llvm/py/Pyproject.toml | 1 +
llvm/py/src/mlgo/combine_training_corpus.py | 55 +++
.../src/mlgo/combine_training_corpus_lib.py | 50 +++
.../src/mlgo/combine_training_corpus_test.py | 104 +++++
llvm/py/src/mlgo/extract_ir.py | 142 +++++++
llvm/py/src/mlgo/extract_ir_lib.py | 373 ++++++++++++++++++
llvm/py/src/mlgo/extract_ir_test.py | 231 +++++++++++
llvm/py/src/mlgo/make_corpus.py | 58 +++
llvm/py/src/mlgo/make_corpus_lib.py | 90 +++++
llvm/py/src/mlgo/make_corpus_test.py | 66 ++++
10 files changed, 1170 insertions(+)
create mode 100644 llvm/py/Pyproject.toml
create mode 100644 llvm/py/src/mlgo/combine_training_corpus.py
create mode 100644 llvm/py/src/mlgo/combine_training_corpus_lib.py
create mode 100644 llvm/py/src/mlgo/combine_training_corpus_test.py
create mode 100644 llvm/py/src/mlgo/extract_ir.py
create mode 100644 llvm/py/src/mlgo/extract_ir_lib.py
create mode 100644 llvm/py/src/mlgo/extract_ir_test.py
create mode 100644 llvm/py/src/mlgo/make_corpus.py
create mode 100644 llvm/py/src/mlgo/make_corpus_lib.py
create mode 100644 llvm/py/src/mlgo/make_corpus_test.py
diff --git a/llvm/py/Pyproject.toml b/llvm/py/Pyproject.toml
new file mode 100644
index 00000000000000..dcf2c804da5e19
--- /dev/null
+++ b/llvm/py/Pyproject.toml
@@ -0,0 +1 @@
+# Placeholder
diff --git a/llvm/py/src/mlgo/combine_training_corpus.py b/llvm/py/src/mlgo/combine_training_corpus.py
new file mode 100644
index 00000000000000..94ee1cbac9cea4
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+r"""Combine multiple training corpus into a single training corpus.
+
+Currently only support the case that multiple corpus share the same
+configurables except the "modules" field.
+
+Usage: we'd like to combine training corpus corpus1 and corpus2 into
+combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+compiler_opt/tools/combine_training_corpus.py \
+ --root_dir=$PATH_TO_combinedcorpus
+
+generates a combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
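+
+For illustration (the module names below are made up), the generated
+corpus_description.json might look like:
+
+{
+  "has_thinlto": false,
+  "modules": [
+    "corpus1/foo.o",
+    "corpus2/bar.o"
+  ]
+}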
+"""
+
+from absl import app
+from absl import flags
+
+from compiler_opt.tools import combine_training_corpus_lib
+
+flags.DEFINE_string('root_dir', '', 'root dir of module paths to combine.')
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+ if len(argv) > 1:
+ raise app.UsageError('Too many command-line arguments.')
+
+ combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+
+
+if __name__ == '__main__':
+ app.run(main)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_lib.py b/llvm/py/src/mlgo/combine_training_corpus_lib.py
new file mode 100644
index 00000000000000..0359961266a240
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus_lib.py
@@ -0,0 +1,50 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library for combining training corpora."""
+
+import os
+import json
+
+from absl import logging
+
+import tensorflow as tf
+
+_FILE_NAME = 'corpus_description.json'
+
+
+def combine_corpus(root_dir: str) -> None:
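+  """Combine corpus_description.json files from the subdirectories of root_dir.
+
+  Globs root_dir/*/corpus_description.json, prefixes each module path with its
+  subdirectory name, and writes the merged description to
+  root_dir/corpus_description.json. Raises ValueError if the input corpora
+  differ in anything other than their module lists.
+  """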
+ module_names = []
+ output_corpus_description = {}
+
+ corpus_description_glob = os.path.join(root_dir, '*/' + _FILE_NAME)
+ for corpus_description_path in tf.io.gfile.glob(corpus_description_glob):
+ logging.info('processing %s', corpus_description_path)
+
+ with tf.io.gfile.GFile(corpus_description_path, 'r') as f:
+ corpus_description = json.load(f)
+ sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+ module_names.extend([
+ os.path.join(sub_dir, name) for name in corpus_description['modules']
+ ])
+ del corpus_description['modules']
+ if len(output_corpus_description) == 0:
+ output_corpus_description = corpus_description
+ elif corpus_description != output_corpus_description:
+ raise ValueError('Input corpora differ by more than modules.')
+
+ output_corpus_description['modules'] = module_names
+
+ with tf.io.gfile.GFile(os.path.join(root_dir, _FILE_NAME), 'w') as f:
+ json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/combine_training_corpus_test.py b/llvm/py/src/mlgo/combine_training_corpus_test.py
new file mode 100644
index 00000000000000..47dd602967b68f
--- /dev/null
+++ b/llvm/py/src/mlgo/combine_training_corpus_test.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for combining training corpora."""
+
+import json
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools import combine_training_corpus_lib
+
+
+class CombineTrainingCorpusTest(absltest.TestCase):
+
+ def test_combine_corpus(self):
+ corpus_dir = self.create_tempdir()
+ subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+ subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
+ subcorpus1_description = {
+ 'has_thinlto': False,
+ 'modules': ['test1.o', 'test2.o']
+ }
+ subcorpus2_description = {
+ 'has_thinlto': False,
+ 'modules': ['test3.o', 'test4.o']
+ }
+ subcorpus1_description_file = subcorpus1_dir.create_file(
+ file_path='corpus_description.json')
+ subcorpus2_description_file = subcorpus2_dir.create_file(
+ file_path='corpus_description.json')
+ subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+ subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+ combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+ with open(
+ os.path.join(corpus_dir, 'corpus_description.json'),
+ encoding='utf-8') as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ self.assertEqual(combined_corpus_description['has_thinlto'], False)
+ self.assertLen(combined_corpus_description['modules'], 4)
+ self.assertIn('subcorpus1/test1.o', combined_corpus_description['modules'])
+ self.assertIn('subcorpus1/test2.o', combined_corpus_description['modules'])
+ self.assertIn('subcorpus2/test3.o', combined_corpus_description['modules'])
+ self.assertIn('subcorpus2/test4.o', combined_corpus_description['modules'])
+
+ def test_empty_folder(self):
+ corpus_dir = self.create_tempdir()
+ subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+ _ = corpus_dir.mkdir(dir_path='empty_dir')
+ subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
+ subcorpus1_description_file = subcorpus1_dir.create_file(
+ file_path='corpus_description.json')
+ subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+ combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+ with open(
+ os.path.join(corpus_dir, 'corpus_description.json'),
+ encoding='utf-8') as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ self.assertLen(combined_corpus_description['modules'], 2)
+
+ def test_ignore_extra_file(self):
+ corpus_dir = self.create_tempdir()
+ subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+ _ = corpus_dir.create_file(file_path='empty.log')
+ subcorpus1_description = {'modules': ['test1.o', 'test2.o']}
+ subcorpus1_description_file = subcorpus1_dir.create_file(
+ file_path='corpus_description.json')
+ subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+ combine_training_corpus_lib.combine_corpus(corpus_dir.full_path)
+ with open(
+ os.path.join(corpus_dir, 'corpus_description.json'),
+ encoding='utf-8') as combined_corpus_description_file:
+ combined_corpus_description = json.load(combined_corpus_description_file)
+ self.assertLen(combined_corpus_description['modules'], 2)
+
+ def test_different_corpora(self):
+ corpus_dir = self.create_tempdir()
+ subcorpus1_dir = corpus_dir.mkdir(dir_path='subcorpus1')
+ subcorpus2_dir = corpus_dir.mkdir(dir_path='subcorpus2')
+ subcorpus1_description = {'has_thinlto': False, 'modules': ['test1.o']}
+ subcorpus2_description = {'has_thinlto': True, 'modules': ['test2.o']}
+ subcorpus1_description_file = subcorpus1_dir.create_file(
+ file_path='corpus_description.json')
+ subcorpus2_description_file = subcorpus2_dir.create_file(
+ file_path='corpus_description.json')
+ subcorpus1_description_file.write_text(json.dumps(subcorpus1_description))
+ subcorpus2_description_file.write_text(json.dumps(subcorpus2_description))
+ self.assertRaises(ValueError, combine_training_corpus_lib.combine_corpus,
+ corpus_dir.full_path)
+
+
+if __name__ == '__main__':
+ absltest.main()
diff --git a/llvm/py/src/mlgo/extract_ir.py b/llvm/py/src/mlgo/extract_ir.py
new file mode 100644
index 00000000000000..2a1ef3978888d6
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir.py
@@ -0,0 +1,142 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python compiler_opt/tools/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all).
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files.
+
+To change the logging verbosity, pass an integer representing the desired
+verbosity to the --verbosity flag. Use 0 for all logs, status information,
+and detailed debug information, -1 for solely warnings, and -2 to not produce
+any output.
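+
+For example (paths are illustrative), to extract a corpus from a cmake-generated
+compilation database:
+
+python3 compiler_opt/tools/extract_ir.py \
+  --input=/path/to/compile_commands.json \
+  --input_type=json \
+  --output_dir=/path/to/corpus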
+"""
+
+import json
+import multiprocessing
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from compiler_opt.tools import extract_ir_lib
+
+flags.DEFINE_string(
+ 'input', None,
+    'Input file or directory - either compile_commands.json, a linker parameter '
+ 'list, or a path to a directory containing object files.')
+flags.DEFINE_enum(
+ 'input_type', 'json', ['json', 'params', 'directory'],
+    'Input file type - json, params, or directory. params refers to an lld '
+    'params file.')
+flags.DEFINE_string('output_dir', None, 'Output directory')
+flags.DEFINE_integer(
+ 'num_workers', None,
+ 'Number of parallel workers for objcopy. `None` for maximum available.')
+flags.DEFINE_string('llvm_objcopy_path', 'llvm-objcopy', 'Path to llvm-objcopy')
+flags.DEFINE_string(
+ 'obj_base_dir', '',
+ 'Base directory for object files. Defaults to current working dir.')
+flags.DEFINE_string(
+ 'cmd_filter', None,
+ 'Include only those modules with a command line matching this regexp. '
+    'Set it to None to disable filtering. Note that the regexp is applied '
+    'independently to each separate command line option. For example, ^-Oz$ '
+    'will match modules built with -Oz. Does not work with thinlto_build=lld.')
+flags.DEFINE_enum(
+ 'thinlto_build', None, ['distributed', 'local'],
+ 'Set if the build was performed with either \'distributed\' or '
+ '\'local\' ThinLTO. This ensures the thinlto.bc files are also copied. '
+ 'The build is assumed to have had '
+ '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed '
+ 'case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files '
+ 'passed in the local case.')
+flags.DEFINE_string(
+ 'cmd_section_name', '.llvmcmd',
+ 'The section name passed to llvm-objcopy. For ELF object files, the '
+ 'default .llvmcmd is correct. For Mach-O object files, one should use '
+ 'something like __LLVM,__cmdline')
+flags.DEFINE_string(
+ 'bitcode_section_name', '.llvmbc',
+ 'The section name passed to llvm-objcopy. For ELF object files, the '
+ 'default .llvmbc is correct. For Mach-O object files, one should use '
+ '__LLVM,__bitcode')
+
+flags.mark_flag_as_required('output_dir')
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+ if len(argv) > 1:
+ raise app.UsageError('Too many command-line arguments.')
+
+ objs = []
+ if FLAGS.input is not None and FLAGS.thinlto_build == 'local':
+ raise ValueError('--thinlto_build=local cannot be run with --input')
+ if FLAGS.input is None:
+ if FLAGS.thinlto_build != 'local':
+ raise ValueError('--input or --thinlto_build=local must be provided')
+ objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir,
+ FLAGS.output_dir)
+ elif FLAGS.input_type == 'json':
+ with open(FLAGS.input, encoding='utf-8') as f:
+ objs = extract_ir_lib.load_from_compile_commands(
+ json.load(f), FLAGS.output_dir)
+ elif FLAGS.input_type == 'params':
+ if not FLAGS.obj_base_dir:
+ logging.info(
+        '-obj_base_dir is unspecified, assuming current directory. '
+        'If no objects are found, use this option to specify the root '
+        'directory for the object file paths in the input file.')
+ with open(FLAGS.input, encoding='utf-8') as f:
+ objs = extract_ir_lib.load_from_lld_params(
+ [l.strip() for l in f.readlines()], FLAGS.obj_base_dir,
+ FLAGS.output_dir)
+ elif FLAGS.input_type == 'directory':
+ logging.warning(
+        'Using the directory input is only recommended if the build system '
+        'your project uses does not support any structured output that '
+        'ml-compiler-opt understands. If your build system provides a '
+        'structured compilation database, use that instead.')
+ objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir)
+ else:
+ logging.error('Unknown input type: %s', FLAGS.input_type)
+
+ relative_output_paths = extract_ir_lib.run_extraction(
+ objs, FLAGS.num_workers, FLAGS.llvm_objcopy_path, FLAGS.cmd_filter,
+ FLAGS.thinlto_build, FLAGS.cmd_section_name, FLAGS.bitcode_section_name)
+
+ extract_ir_lib.write_corpus_manifest(FLAGS.thinlto_build,
+ relative_output_paths, FLAGS.output_dir)
+
+ logging.info('Converted %d files out of %d',
+ len(objs) - relative_output_paths.count(None), len(objs))
+
+
+if __name__ == '__main__':
+ multiprocessing.set_start_method('fork')
+ app.run(main)
diff --git a/llvm/py/src/mlgo/extract_ir_lib.py b/llvm/py/src/mlgo/extract_ir_lib.py
new file mode 100644
index 00000000000000..c1d2a54b9a9e7c
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir_lib.py
@@ -0,0 +1,373 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library functions for IR extraction."""
+
+import os
+import pathlib
+import re
+import shutil
+import subprocess
+import multiprocessing
+import functools
+import json
+
+from typing import Dict, List, Optional
+
+from absl import logging
+
+from compiler_opt.rl import constant
+
+
+# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
+# a \0-separated list of strings to a \n-separated one.
+def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool:
+ """Determine if the module should be included."""
+ if match_regexp is None:
+ return True
+ lines = cmdline.split('\0')
+ return any(len(re.findall(match_regexp, l)) for l in lines)
+
+
+def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
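+  """Return the path to the ThinLTO index referenced by the command line.
+
+  Scans the null-separated cmdline for a -fthinlto-index= option and returns
+  its value joined onto basedir, or None if no such option is present.
+  """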
+ opts = cmdline.split('\0')
+ for option in opts:
+ if option.startswith('-fthinlto-index'):
+ return os.path.join(basedir, option.split('=')[1])
+ return None
+
+
+class TrainingIRExtractor:
+ """IR and command line extraction from an object file."""
+
+ def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
+ """Set up a TrainingIRExtractor.
+
+ Args:
+      obj_relative_path: relative path to the input object file. It will also
+        be used to construct the absolute paths of the output IR and cmd files,
+        by appending it to output_base_dir.
+ output_base_dir: the directory under which the output will be produced.
+ obj_base_dir: the base directory for all the input object files.
+ """
+ self._obj_relative_path = obj_relative_path
+ self._output_base_dir = output_base_dir
+ self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''
+
+ def obj_base_dir(self):
+ return self._obj_base_dir
+
+ def output_base_dir(self):
+ return self._output_base_dir
+
+ def relative_output_path(self):
+ return self._obj_relative_path
+
+ def input_obj(self):
+ return os.path.join(self.obj_base_dir(), self._obj_relative_path)
+
+ def lld_src_bc(self):
+ # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+ # IR bitcode saved by lld. It is hardcoded into lld.
+ return os.path.join(self._obj_base_dir,
+ self._obj_relative_path + '.3.import.bc')
+
+ def lld_src_thinlto(self):
+ return os.path.join(self._obj_base_dir,
+ self._obj_relative_path + '.thinlto.bc')
+
+ def dest_dir(self):
+ return os.path.join(self.output_base_dir(),
+ os.path.dirname(self._obj_relative_path))
+
+ def module_name(self):
+ return os.path.basename(self._obj_relative_path)
+
+ def cmd_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + '.cmd')
+
+ def bc_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + '.bc')
+
+ def thinlto_index_file(self):
+ return os.path.join(self.dest_dir(), self.module_name() + '.thinlto.bc')
+
+ def _get_extraction_cmd_command(self, llvm_objcopy_path: str,
+ cmd_section_name: str):
+ """Get llvm-objcopy and process args to a produce a command string that,
+ when invoked, will extract the cmd section info ths self.cmd_file() file.
+ """
+ return [
+ llvm_objcopy_path,
+ '--dump-section=' + cmd_section_name + '=' + self.cmd_file(),
+ self.input_obj(), '/dev/null'
+ ]
+
+ def _get_extraction_bc_command(self, llvm_objcopy_path: str,
+ bitcode_section_name: str):
+ """Gets llvm-objcopy and process args to produce a command string that,
+ when invoked, will extract the bitcode section into the self.bc_file()
+ file.
+ """
+ return [
+ llvm_objcopy_path,
+ '--dump-section=' + bitcode_section_name + '=' + self.bc_file(),
+ self.input_obj(), '/dev/null'
+ ]
+
+ def _extract_clang_artifacts(self, llvm_objcopy_path: str, cmd_filter: str,
+ is_thinlto: bool, cmd_section_name: str,
+ bitcode_section_name: str) -> Optional[str]:
+ """Run llvm-objcopy to extract the .bc and command line."""
+ if not os.path.exists(self.input_obj()):
+ logging.info('%s does not exist.', self.input_obj())
+ return None
+ os.makedirs(self.dest_dir(), exist_ok=True)
+ try:
+ subprocess.check_output(
+ self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+ stderr=subprocess.STDOUT,
+ encoding='utf-8')
+ if cmd_filter is not None or is_thinlto:
+ with open(self.cmd_file(), encoding='utf-8') as f:
+ lines = f.readlines()
+ assert len(lines) == 1
+ cmdline = lines[0]
+ if not should_include_module(cmdline, cmd_filter):
+ logging.info(
+ 'Excluding module %s because it does not match the filter',
+ self.input_obj())
+ os.remove(self.cmd_file())
+ return None
+ if is_thinlto:
+ index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+ shutil.copy(index_file, self.thinlto_index_file())
+
+ subprocess.check_output(
+ self._get_extraction_bc_command(llvm_objcopy_path,
+ bitcode_section_name),
+ stderr=subprocess.STDOUT,
+ encoding='utf-8')
+ except subprocess.CalledProcessError as e:
+      # This may happen if the .o file was built from asm (.S source).
+ logging.warning('%s was not processed: %s', self.input_obj(), e)
+ logging.info(e.output)
+ return None
+ assert (os.path.exists(self.cmd_file()) and
+ os.path.exists(self.bc_file()) and
+ (not is_thinlto or os.path.exists(self.thinlto_index_file())))
+ return self.relative_output_path()
+
+ def _extract_lld_artifacts(self) -> Optional[str]:
+ """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.
+ """
+ if not os.path.exists(self.lld_src_bc()):
+ logging.info('%s does not exist.', self.lld_src_bc())
+ return None
+ if not os.path.exists(self.lld_src_thinlto()):
+ logging.info('%s does not exist.', self.lld_src_thinlto())
+ return None
+ os.makedirs(self.dest_dir(), exist_ok=True)
+
+ # Copy over the files
+ shutil.copy(self.lld_src_bc(), self.bc_file())
+ shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file())
+
+ assert os.path.exists(self.bc_file())
+ assert os.path.exists(self.thinlto_index_file())
+ return self._obj_relative_path
+
+ def extract(self,
+ llvm_objcopy_path: Optional[str] = None,
+ cmd_filter: Optional[str] = None,
+ thinlto_build: Optional[str] = None,
+ cmd_section_name: Optional[str] = '.llvmcmd',
+ bitcode_section_name: Optional[str] = '.llvmbc') -> Optional[str]:
+ if thinlto_build == 'local':
+ return self._extract_lld_artifacts()
+ return self._extract_clang_artifacts(
+ llvm_objcopy_path=llvm_objcopy_path,
+ cmd_filter=cmd_filter,
+ is_thinlto=thinlto_build == 'distributed',
+ cmd_section_name=cmd_section_name,
+ bitcode_section_name=bitcode_section_name)
+
+
+def convert_compile_command_to_objectfile(
+ command: Dict[str, str], output_dir: str) -> Optional[TrainingIRExtractor]:
+ obj_base_dir = command['directory']
+ if 'arguments' in command:
+ cmd_parts = command['arguments']
+ elif 'command' in command:
+ cmd_parts = command['command'].split()
+ else:
+    logging.info('compile_commands element has no command or arguments')
+ return None
+
+ try:
+ obj_index = cmd_parts.index('-o') + 1
+ except ValueError:
+ # This could happen if there are non-clang commands in compile_commands.json
+ logging.info('Command has no -o option: %s', ' '.join(cmd_parts))
+ return None
+ obj_rel_path = cmd_parts[obj_index]
+ # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
+ return TrainingIRExtractor(
+ obj_relative_path=obj_rel_path,
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir)
+
+
+def load_from_compile_commands(json_array: List[Dict[str, str]],
+ output_dir: str) -> List[TrainingIRExtractor]:
+ objs = [
+ convert_compile_command_to_objectfile(cmd, output_dir)
+ for cmd in json_array
+ ]
+ # Filter out None, in case there were non-clang commands in the .json
+ return [obj for obj in objs if obj is not None]
+
+
+def load_from_lld_params(params_array: List[str], obj_base_dir: str,
+ output_dir: str) -> List[TrainingIRExtractor]:
+ """Create an ObjectFile array based on lld's parameters."""
+  # Yank out -o and its argument. After that, anything not starting with '-'
+  # and ending in '.o' is an object file.
+ try:
+ minus_o_idx = params_array.index('-o')
+ del params_array[minus_o_idx:minus_o_idx + 2]
+ just_obj_paths = [
+ o for o in params_array if not o.startswith('-') and o.endswith('.o')
+ ]
+ except ValueError:
+ logging.info('This params file does not have an explicit -o option.')
+ just_obj_paths = params_array
+
+ def make_obj(obj_file: str) -> TrainingIRExtractor:
+ return TrainingIRExtractor(
+ obj_relative_path=obj_file,
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir)
+
+ return [make_obj(obj_file) for obj_file in just_obj_paths]
+
+
+def load_from_directory(obj_base_dir: str,
+ output_dir: str) -> List[TrainingIRExtractor]:
+ """Create an object file array by globbing an entire drectory.
+
+ Args:
+    obj_base_dir: The base build directory. All object file paths are
+      recorded relative to this directory.
+ output_dir: The output directory where extracted .bc and .cmd files should
+ be placed.
+ """
+ paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.o')]
+
+ def make_spec(obj_file: str):
+ return TrainingIRExtractor(
+ obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir),
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir)
+
+ return [make_spec(path) for path in paths]
+
+
+def load_for_lld_thinlto(obj_base_dir: str,
+ output_dir: str) -> List[TrainingIRExtractor]:
+ # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport')
+ # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files
+ # are also emitted next to the postimport bitcode, with the suffix
+  # .thinlto.bc instead.
+ paths = [str(p) for p in pathlib.Path(obj_base_dir).glob('**/*.3.import.bc')]
+
+ def make_spec(obj_file: str):
+ return TrainingIRExtractor(
+ # Cut away .3.import.bc
+ obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12],
+ output_base_dir=output_dir,
+ obj_base_dir=obj_base_dir)
+
+ return [make_spec(path) for path in paths]
+
+
+def run_extraction(objs: List[TrainingIRExtractor], num_workers: int,
+ llvm_objcopy_path: str, cmd_filter: str, thinlto_build: str,
+ cmd_section_name: str, bitcode_section_name: str):
+ """Extracts all specified object files into the corpus directory.
+
+ Args:
+ objs: A list of TrainingIRExtractor Objects that represent the object files
+ to extract bitcode/commands from.
+ num_workers: The number of parallel processes to spawn to run the
+ extraction.
+ llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections.
+ cmd_filter: A regular expression that is used to select for compilations
+ performed with specific flags. If you want to include all compilations,
+ set this to None.
+ thinlto_build: Whether or not this is a ThinLTO build, and if so, the type.
+ Set this to None if the build was not done with ThinLTO.
+ cmd_section_name: The name of the command line section created by the
+ bitcode embedding.
+ bitcode_section_name: The name of the bitcode section created by the
+ bitcode embedding.
+ """
+ extract_artifacts = functools.partial(
+ TrainingIRExtractor.extract,
+ llvm_objcopy_path=llvm_objcopy_path,
+ cmd_filter=cmd_filter,
+ thinlto_build=thinlto_build,
+ cmd_section_name=cmd_section_name,
+ bitcode_section_name=bitcode_section_name)
+
+ with multiprocessing.Pool(num_workers) as pool:
+ relative_output_paths = pool.map(extract_artifacts, objs)
+ pool.close()
+ pool.join()
+ return relative_output_paths
+
+
+def write_corpus_manifest(thinlto_build: str, relative_output_paths: List[str],
+ output_dir: str):
+ """Writes a corpus_manifest.json containing all necessary information about
+ the corpus.
+
+ Args:
+ thinlto_build: Whether or not the build was done with ThinLTO and if so,
+      what kind of ThinLTO. Set this to None if the build was not performed with
+ ThinLTO.
+ relative_output_paths: The relative (to the corpus directory) output paths
+      of all the bitcode files that should be placed in the corpus manifest.
+ output_dir: The corpus directory where the corpus manifest should be
+ placed.
+ """
+ # This comes first rather than later so global_command_override is at the top
+  # of the .json after being written.
+ if thinlto_build == 'local':
+ corpus_description = {
+ 'global_command_override': constant.UNSPECIFIED_OVERRIDE
+ }
+ else:
+ corpus_description = {}
+
+ corpus_description.update({
+ 'has_thinlto': thinlto_build is not None,
+ 'modules': [path for path in relative_output_paths if path is not None]
+ })
+
+ with open(
+ os.path.join(output_dir, 'corpus_description.json'),
+ 'w',
+ encoding='utf-8') as f:
+ json.dump(corpus_description, f, indent=2)
diff --git a/llvm/py/src/mlgo/extract_ir_test.py b/llvm/py/src/mlgo/extract_ir_test.py
new file mode 100644
index 00000000000000..8811134aab4fce
--- /dev/null
+++ b/llvm/py/src/mlgo/extract_ir_test.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for compiler_opt.tools.extract_ir."""
+
+# pylint: disable=protected-access
+import os.path
+
+from absl.testing import absltest
+
+from compiler_opt.tools import extract_ir_lib
+
+
+class ExtractIrTest(absltest.TestCase):
+
+ def test_one_conversion(self):
+ obj = extract_ir_lib.convert_compile_command_to_objectfile(
+ {
+ 'directory': '/output/directory',
+ 'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
+ 'file': '/some/path/lib/foo/bar.cc'
+ }, '/corpus/destination/path')
+ self.assertIsNotNone(obj)
+ # pytype: disable=attribute-error
+ # Pytype complains about obj being None
+ self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
+ self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
+ self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
+ self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+ self.assertEqual(obj.thinlto_index_file(),
+ '/corpus/destination/path/lib/bar.o.thinlto.bc')
+ # pytype: enable=attribute-error
+
+ def test_one_conversion_arguments_style(self):
+ obj = extract_ir_lib.convert_compile_command_to_objectfile(
+ {
+ 'directory': '/output/directory',
+ 'arguments':
+ ['-cc1', '-c', '/some/path/lib/foo/bar.cc', '-o', 'lib/bar.o'],
+ 'file': '/some/path/lib/foo/bar.cc'
+ }, '/corpus/destination/path')
+ self.assertIsNotNone(obj)
+ # pytype: disable=attribute-error
+ # Pytype complains about obj being None
+ self.assertEqual(obj.input_obj(), '/output/directory/lib/bar.o')
+ self.assertEqual(obj.relative_output_path(), 'lib/bar.o')
+ self.assertEqual(obj.cmd_file(), '/corpus/destination/path/lib/bar.o.cmd')
+ self.assertEqual(obj.bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+ self.assertEqual(obj.thinlto_index_file(),
+ '/corpus/destination/path/lib/bar.o.thinlto.bc')
+ # pytype: enable=attribute-error
+
+ def test_arr_conversion(self):
+ res = extract_ir_lib.load_from_compile_commands([{
+ 'directory': '/output/directory',
+ 'command': '-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o',
+ 'file': '/some/path/lib/foo/bar.cc'
+ }, {
+ 'directory': '/output/directory',
+ 'command': '-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o',
+ 'file': '/some/path/lib/foo/baz.cc'
+ }], '/corpus/destination/path')
+ res = list(res)
+ self.assertLen(res, 2)
+ self.assertEqual(res[0].input_obj(), '/output/directory/lib/bar.o')
+ self.assertEqual(res[0].relative_output_path(), 'lib/bar.o')
+ self.assertEqual(res[0].cmd_file(),
+ '/corpus/destination/path/lib/bar.o.cmd')
+ self.assertEqual(res[0].bc_file(), '/corpus/destination/path/lib/bar.o.bc')
+ self.assertEqual(res[0].thinlto_index_file(),
+ '/corpus/destination/path/lib/bar.o.thinlto.bc')
+
+ self.assertEqual(res[1].input_obj(), '/output/directory/lib/other/baz.o')
+ self.assertEqual(res[1].relative_output_path(), 'lib/other/baz.o')
+ self.assertEqual(res[1].cmd_file(),
+ '/corpus/destination/path/lib/other/baz.o.cmd')
+ self.assertEqual(res[1].bc_file(),
+ '/corpus/destination/path/lib/other/baz.o.bc')
+ self.assertEqual(res[1].thinlto_index_file(),
+ '/corpus/destination/path/lib/other/baz.o.thinlto.bc')
+
+ def test_command_extraction(self):
+ obj = extract_ir_lib.TrainingIRExtractor(
+ obj_relative_path='lib/obj_file.o',
+ output_base_dir='/where/corpus/goes',
+ obj_base_dir='/foo/bar')
+ self.assertEqual(
+ obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
+ '/bin/llvm_objcopy_path',
+ '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
+ '/foo/bar/lib/obj_file.o', '/dev/null'
+ ])
+ self.assertEqual(
+ obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
+ '/bin/llvm_objcopy_path',
+ '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
+ '/foo/bar/lib/obj_file.o', '/dev/null'
+ ])
+
+ def test_command_extraction_no_basedir(self):
+ obj = extract_ir_lib.TrainingIRExtractor('lib/obj_file.o',
+ '/where/corpus/goes')
+ self.assertEqual(
+ obj._get_extraction_cmd_command('/bin/llvm_objcopy_path', '.llvmcmd'), [
+ '/bin/llvm_objcopy_path',
+ '--dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd',
+ 'lib/obj_file.o', '/dev/null'
+ ])
+ self.assertEqual(
+ obj._get_extraction_bc_command('/bin/llvm_objcopy_path', '.llvmbc'), [
+ '/bin/llvm_objcopy_path',
+ '--dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc',
+ 'lib/obj_file.o', '/dev/null'
+ ])
+
+ def test_lld_params(self):
+ lld_opts = [
+ '-o', 'output/dir/exe', 'lib/obj1.o', 'somelib.a', '-W,blah',
+ 'lib/dir/obj2.o'
+ ]
+ obj = extract_ir_lib.load_from_lld_params(lld_opts, '/some/path',
+ '/tmp/out')
+ self.assertLen(obj, 2)
+ self.assertEqual(obj[0].input_obj(), '/some/path/lib/obj1.o')
+ self.assertEqual(obj[0].relative_output_path(), 'lib/obj1.o')
+ self.assertEqual(obj[0].cmd_file(), '/tmp/out/lib/obj1.o.cmd')
+ self.assertEqual(obj[0].thinlto_index_file(),
+ '/tmp/out/lib/obj1.o.thinlto.bc')
+ self.assertEqual(obj[1].input_obj(), '/some/path/lib/dir/obj2.o')
+
+ def test_load_from_directory(self):
+ tempdir = self.create_tempdir()
+ subdir = tempdir.mkdir(dir_path='subdir')
+ subdir.create_file(file_path='test1.o')
+ subdir.create_file(file_path='test2.o')
+ outdir = self.create_tempdir()
+ objs = extract_ir_lib.load_from_directory(tempdir.full_path,
+ outdir.full_path)
+ self.assertLen(objs, 2)
+ for index, obj in enumerate(
+ sorted(objs, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(obj._obj_relative_path, f'subdir/test{index + 1:d}.o')
+ self.assertEqual(obj._obj_base_dir, tempdir.full_path)
+ self.assertEqual(obj._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_discovery(self):
+ tempdir = self.create_tempdir()
+ tempdir.create_file(file_path='1.3.import.bc')
+ tempdir.create_file(file_path='2.3.import.bc')
+ tempdir.create_file(file_path='3.3.import.bc')
+ tempdir.create_file(file_path='1.thinlto.bc')
+ tempdir.create_file(file_path='2.thinlto.bc')
+ tempdir.create_file(file_path='3.thinlto.bc')
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(tempdir.full_path,
+ outdir.full_path)
+ self.assertLen(obj, 3)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(o._obj_relative_path, f'{i + 1:d}')
+ self.assertEqual(o._obj_base_dir, tempdir.full_path)
+ self.assertEqual(o._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_discovery_nested(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path='nest')
+ tempdir.create_file(file_path='1.3.import.bc')
+ tempdir.create_file(file_path='2.3.import.bc')
+ tempdir.create_file(file_path='3.3.import.bc')
+ tempdir.create_file(file_path='1.thinlto.bc')
+ tempdir.create_file(file_path='2.thinlto.bc')
+ tempdir.create_file(file_path='3.thinlto.bc')
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+ self.assertLen(obj, 3)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ self.assertEqual(o._obj_relative_path, f'nest/{i + 1:d}')
+ self.assertEqual(o._obj_base_dir, outer.full_path)
+ self.assertEqual(o._output_base_dir, outdir.full_path)
+
+ def test_lld_thinlto_extraction(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path='nest')
+ tempdir.create_file(file_path='1.3.import.bc')
+ tempdir.create_file(file_path='2.3.import.bc')
+ tempdir.create_file(file_path='3.3.import.bc')
+ tempdir.create_file(file_path='1.thinlto.bc')
+ tempdir.create_file(file_path='2.thinlto.bc')
+ tempdir.create_file(file_path='3.thinlto.bc')
+ outdir = self.create_tempdir()
+ obj = extract_ir_lib.load_for_lld_thinlto(outer.full_path, outdir.full_path)
+ for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+ mod_path = o.extract(thinlto_build='local')
+ self.assertEqual(mod_path, f'nest/{i + 1:d}')
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/1.bc')))
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/2.bc')))
+ self.assertTrue(os.path.exists(os.path.join(outdir.full_path, 'nest/3.bc')))
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, 'nest/1.thinlto.bc')))
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, 'nest/2.thinlto.bc')))
+ self.assertTrue(
+ os.path.exists(os.path.join(outdir.full_path, 'nest/3.thinlto.bc')))
+
+ def test_filtering(self):
+ cmdline = '-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o'
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, None))
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, '.*'))
+ self.assertTrue(extract_ir_lib.should_include_module(cmdline, '^-Oz$'))
+ self.assertFalse(extract_ir_lib.should_include_module(cmdline, '^-O3$'))
+
+ def test_thinlto_index_extractor(self):
+ cmdline = ('-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/'
+ 'out.o\0-fthinlto-index=foo/bar.thinlto.bc')
+ self.assertEqual(
+ extract_ir_lib.get_thinlto_index(cmdline, '/the/base/dir'),
+ '/the/base/dir/foo/bar.thinlto.bc')
+
+
+if __name__ == '__main__':
+ absltest.main()
diff --git a/llvm/py/src/mlgo/make_corpus.py b/llvm/py/src/mlgo/make_corpus.py
new file mode 100644
index 00000000000000..24493d894be723
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus.py
@@ -0,0 +1,58 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tool for making a corpus from arbitrary bitcode.
+
+To create a corpus from a set of bitcode files in an input directory, run
+the following command:
+
+PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \
+ --input_dir=<path to input directory> \
+ --output_dir=<path to output directory> \
+ --default_args="<list of space separated flags>"
+"""
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from compiler_opt.tools import make_corpus_lib
+
+flags.DEFINE_string('input_dir', None, 'The input directory.')
+flags.DEFINE_string('output_dir', None, 'The output directory.')
+flags.DEFINE_string(
+ 'default_args', '',
+ 'The compiler flags to compile with when using downstream tooling.')
+
+flags.mark_flag_as_required('input_dir')
+flags.mark_flag_as_required('output_dir')
+
+FLAGS = flags.FLAGS
+
+
+def main(_):
+ logging.warning(
+ 'Using this tool does not guarantee that the bitcode is taken at '
+ 'the correct stage for consumption during model training. Make '
+ 'sure to validate assumptions about where the bitcode is coming '
+ 'from before using it in production.')
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir)
+ make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir,
+ FLAGS.output_dir)
+ make_corpus_lib.write_corpus_manifest(relative_paths, FLAGS.output_dir,
+ FLAGS.default_args.split())
+
+
+if __name__ == '__main__':
+ app.run(main)
diff --git a/llvm/py/src/mlgo/make_corpus_lib.py b/llvm/py/src/mlgo/make_corpus_lib.py
new file mode 100644
index 00000000000000..3598fc12a04d14
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus_lib.py
@@ -0,0 +1,90 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Library functions for making a corpus from arbitrary bitcode."""
+
+import pathlib
+import os
+import shutil
+import json
+
+from typing import List, Optional
+
+BITCODE_EXTENSION = '.bc'
+
+
+def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]:
+ """Finds bitcode files to extract from a given directory.
+
+ Args:
+    bitcode_base_dir: The base directory where the bitcode to be copied
+      is from.
+
+  Returns a list of paths to the bitcode files, relative to the base
+  directory and with the .bc extension stripped.
+ """
+ paths = [
+ str(p)[:-len(BITCODE_EXTENSION)]
+ for p in pathlib.Path(bitcode_base_dir).glob('**/*' + BITCODE_EXTENSION)
+ ]
+
+ return [
+ os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths
+ ]
+
+
+def copy_bitcode(relative_paths: List[str], bitcode_base_dir: str,
+ output_dir: str) -> None:
+ """Copies bitcode files from the base directory to the output directory.
+
+ Args:
+ relative_paths: An array of relative paths to bitcode files that are copied
+ over to the output directory, preserving relative location.
+ bitcode_base_dir: The base directory where the bitcode is located.
+ output_dir: The output directory to place the bitcode in.
+ """
+ for relative_path in relative_paths:
+ base_path = os.path.join(bitcode_base_dir,
+ relative_path + BITCODE_EXTENSION)
+ destination_path = os.path.join(output_dir,
+ relative_path + BITCODE_EXTENSION)
+ os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+ shutil.copy(base_path, destination_path)
+
+
+def write_corpus_manifest(relative_output_paths: List[str],
+ output_dir: str,
+ default_args: Optional[List[str]] = None) -> None:
+ """Creates a corpus manifest describing the bitcode that has been found.
+
+ Args:
+ relative_output_paths: A list of paths to each bitcode file relative to the
+ output directory.
+    output_dir: The output directory where the corpus is being created.
+ default_args: An array of compiler flags that should be used to compile
+ the bitcode when using further downstream tooling."""
+ if default_args is None:
+ default_args = []
+ corpus_description = {
+ 'global_command_override': default_args,
+ 'has_thinlto': False,
+ 'modules': [path for path in relative_output_paths if path is not None]
+ }
+
+ with open(
+ os.path.join(output_dir, 'corpus_description.json'),
+ 'w',
+ encoding='utf-8') as description_file:
+ json.dump(corpus_description, description_file, indent=2)
diff --git a/llvm/py/src/mlgo/make_corpus_test.py b/llvm/py/src/mlgo/make_corpus_test.py
new file mode 100644
index 00000000000000..8ed598695d06ee
--- /dev/null
+++ b/llvm/py/src/mlgo/make_corpus_test.py
@@ -0,0 +1,66 @@
+# coding=utf-8
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test for compiler_opt.tools.make_corpus_lib"""
+
+import json
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools import make_corpus_lib
+
+
+class MakeCorpusTest(absltest.TestCase):
+
+ def test_load_bitcode_from_directory(self):
+ outer = self.create_tempdir()
+ tempdir = outer.mkdir(dir_path='nested')
+ tempdir.create_file('test1.bc')
+ tempdir.create_file('test2.bc')
+ relative_paths = make_corpus_lib.load_bitcode_from_directory(outer)
+ relative_paths = sorted(relative_paths)
+ self.assertEqual(relative_paths[0], 'nested/test1')
+ self.assertEqual(relative_paths[1], 'nested/test2')
+
+ def test_copy_bitcode(self):
+ build_dir = self.create_tempdir()
+ nested_dir = build_dir.mkdir(dir_path='nested')
+ nested_dir.create_file('test1.bc')
+ nested_dir.create_file('test2.bc')
+ relative_paths = ['nested/test1', 'nested/test2']
+ corpus_dir = self.create_tempdir()
+ make_corpus_lib.copy_bitcode(relative_paths, build_dir, corpus_dir)
+ output_files = sorted(os.listdir(os.path.join(corpus_dir, './nested')))
+ self.assertEqual(output_files[0], 'test1.bc')
+ self.assertEqual(output_files[1], 'test2.bc')
+
+ def test_write_corpus_manifest(self):
+ relative_output_paths = ['test/test1', 'test/test2']
+ output_dir = self.create_tempdir()
+ default_args = ['-O3', '-c']
+ make_corpus_lib.write_corpus_manifest(relative_output_paths, output_dir,
+ default_args)
+ with open(
+ os.path.join(output_dir, 'corpus_description.json'),
+ encoding='utf-8') as corpus_description_file:
+ corpus_description = json.load(corpus_description_file)
+ self.assertEqual(corpus_description['global_command_override'],
+ default_args)
+ self.assertEqual(corpus_description['has_thinlto'], False)
+ self.assertEqual(corpus_description['modules'], relative_output_paths)
+
+
+if __name__ == '__main__':
+ absltest.main()