[clang] Add support for dynamic libraries in CLANG_BOLT (PR #127020)
via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 12 23:57:37 PST 2025
https://github.com/serge-sans-paille created https://github.com/llvm/llvm-project/pull/127020
None
>From c2d1352aba4872957e34633b92d87c39d0eb7e45 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Tue, 11 Feb 2025 18:20:15 +0100
Subject: [PATCH 1/2] [clang][cmake] Sanitize CLANG_BOLT values
This avoids failing later in the build process.
---
clang/tools/driver/CMakeLists.txt | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index ad336fcc45b60..5d7962769014a 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -23,10 +23,14 @@ if(CLANG_PLUGIN_SUPPORT)
set(support_plugins SUPPORT_PLUGINS)
endif()
+set(CLANG_BOLT_ALLOWLIST INSTRUMENT PERF LBR)
set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
- May be specified as Instrument or Perf or LBR to use a particular profiling \
+May be specified as one of ${CLANG_BOLT_ALLOWLIST} to use a particular profiling \
mechanism.")
string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
+if (CLANG_BOLT AND NOT CLANG_BOLT IN_LIST CLANG_BOLT_ALLOWLIST)
+ message(FATAL_ERROR "Specified CLANG_BOLT value '${CLANG_BOLT}' is not one of ${CLANG_BOLT_ALLOWLIST}.")
+endif()
if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
set(CLANG_BOLT_DEPS clear-bolt-fdata llvm-bolt llvm-readobj)
>From 4c34b09c96735d3af346c19e05149bc553038a8c Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Thu, 13 Feb 2025 08:54:03 +0100
Subject: [PATCH 2/2] [clang][bolt] Improve CLANG_BOLT setup to support shared
libraries
When clang is linked dynamically against libLLVM and libclang-cpp, BOLT
post-processing only optimizes the clang binary.
This patch makes sure libLLVM and libclang-cpp are instrumented as well;
otherwise, optimizing just the clang binary yields limited benefits.
This currently only works on Linux because it relies on LD_PRELOAD to make
the instrumented binary load the instrumented shared libraries.
---
clang/tools/driver/CMakeLists.txt | 40 ++++--
clang/utils/perf-training/perf-helper.py | 147 ++++++++++++++---------
2 files changed, 123 insertions(+), 64 deletions(-)
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index 5d7962769014a..10ea5de387220 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -168,6 +168,28 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
)
set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")
+ set(CLANG_BOLT_INPUTS $<TARGET_FILE:clang>)
+ set(CLANG_INSTRUMENTED_OUTPUTS ${CLANG_INSTRUMENTED})
+
+ # Add in dynamically linked libraries, if needs be. Currently only supported
+ # on Linux because it relies on LD_PRELOAD for instrumentation.
+ if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ if (CLANG_LINK_CLANG_DYLIB)
+ set(CLANG_CPP_BOLT_INSTRUMENTED "clang-cxx-bolt.inst" CACHE STRING
+ "Name of BOLT-instrumented Clang library")
+ set(CLANG_CPP_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_CPP_BOLT_INSTRUMENTED})
+ list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:clang-cpp>)
+ list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${CLANG_CPP_INSTRUMENTED})
+ endif()
+ if (LLVM_LINK_LLVM_DYLIB)
+ set(LLVM_BOLT_INSTRUMENTED "LLVM-bolt.inst" CACHE STRING
+ "Name of BOLT-instrumented LLVM library")
+ set(LLVM_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLVM_BOLT_INSTRUMENTED})
+ list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:LLVM>)
+ list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${LLVM_INSTRUMENTED})
+ endif()
+ endif()
+
# This POST_BUILD command is executed unconditionally even if the clang target
# is already built. We need to wrap the whole bolt optimization process in
# a single python wrapper, so that we can first check if the binary has
@@ -176,15 +198,15 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
TARGET clang POST_BUILD
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
bolt-optimize
- --method ${CLANG_BOLT}
- --input $<TARGET_FILE:clang>
- --instrumented-output ${CLANG_INSTRUMENTED}
- --fdata ${BOLT_FDATA}
- --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
- --readelf $<TARGET_FILE:llvm-readobj>
- --bolt $<TARGET_FILE:llvm-bolt>
- --lit "${LIT_COMMAND}"
- --merge-fdata $<TARGET_FILE:merge-fdata>
+ --method ${CLANG_BOLT}
+ --input "${CLANG_BOLT_INPUTS}"
+ --instrumented-output "${CLANG_INSTRUMENTED_OUTPUTS}"
+ --fdata ${BOLT_FDATA}
+ --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
+ --readelf $<TARGET_FILE:llvm-readobj>
+ --bolt $<TARGET_FILE:llvm-bolt>
+ --lit "${LIT_COMMAND}"
+ --merge-fdata $<TARGET_FILE:merge-fdata>
COMMENT "Optimizing Clang with BOLT"
USES_TERMINAL
VERBATIM
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index 55c5160a71c4f..ea32ef216bcaa 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -559,6 +559,22 @@ def genOrderFile(args):
return 0
+def filter_bolt_optimized(inputs, instrumented_outputs, readelf):
+    """Keep only the binaries (and matching outputs) not yet processed by BOLT."""
+    new_inputs = []
+    new_instrumented_outputs = []
+    for input, instrumented_output in zip(inputs, instrumented_outputs):
+        output = subprocess.check_output(
+            [readelf, "-WS", input], universal_newlines=True
+        )
+        # This binary has already been bolt-optimized, so skip further processing.
+        if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+            print(f"Skipping {input}, it's already instrumented")
+        else:
+            new_inputs.append(input)
+            new_instrumented_outputs.append(instrumented_output)
+    return new_inputs, new_instrumented_outputs
+
 def bolt_optimize(args):
     parser = argparse.ArgumentParser("%prog [options] ")
@@ -574,47 +590,66 @@ def bolt_optimize(args):
     opts = parser.parse_args(args)
-    output = subprocess.check_output(
-        [opts.readelf, "-WS", opts.input], universal_newlines=True
-    )
+    inputs = opts.input.split(';')
+    instrumented_outputs = opts.instrumented_output.split(';')
+    assert len(inputs) == len(instrumented_outputs), "inconsistent --input / --instrumented-output arguments"
-    # This binary has already been bolt-optimized, so skip further processing.
-    if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+    inputs, instrumented_outputs = filter_bolt_optimized(inputs, instrumented_outputs, opts.readelf)
+    if not inputs:
         return 0
+ environ = os.environ.copy()
if opts.method == "INSTRUMENT":
- process = subprocess.run(
- [
- opts.bolt,
- opts.input,
- "-o",
- opts.instrumented_output,
- "-instrument",
- "--instrumentation-file-append-pid",
- f"--instrumentation-file={opts.fdata}",
- ],
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- )
+ preloads = []
+ for input, instrumented_output in zip(inputs, instrumented_outputs):
+ args = [
+ opts.bolt,
+ input,
+ "-o",
+ instrumented_output,
+ "-instrument",
+ "--instrumentation-file-append-pid",
+ f"--instrumentation-file={opts.fdata}",
+ ]
+ print("Running: " + " ".join(args))
+ process = subprocess.run(
+ args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ )
- print(process.args)
- for line in process.stdout:
- sys.stdout.write(line)
- process.check_returncode()
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
- process = subprocess.run(
- [
+ output = subprocess.check_output(
+ [opts.readelf, "--file-header", input], universal_newlines=True
+ )
+ if re.search(r"Type:\s*((Shared)|(DYN))", output):
+ # force using the instrumented version
+ preloads.append(instrumented_output)
+
+ if preloads:
+ print("Patching execution environment for dynamic library")
+ environ["LD_PRELOAD"] = os.pathsep.join(preloads)
+
+
+ args = [
sys.executable,
opts.lit,
- os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
- ],
+ "-v",
+ os.path.join(opts.perf_training_binary_dir, f"bolt-fdata"),
+ ]
+ print("Running: " + " ".join(args))
+ process = subprocess.run(
+ args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
+ env=environ,
)
- print(process.args)
for line in process.stdout:
sys.stdout.write(line)
process.check_returncode()
@@ -624,35 +659,37 @@ def bolt_optimize(args):
merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
- shutil.copy(opts.input, f"{opts.input}-prebolt")
+ for input in inputs:
+ shutil.copy(input, f"{input}-prebolt")
- process = subprocess.run(
- [
- opts.bolt,
- f"{opts.input}-prebolt",
- "-o",
- opts.input,
- "-data",
- opts.fdata,
- "-reorder-blocks=ext-tsp",
- "-reorder-functions=cdsort",
- "-split-functions",
- "-split-all-cold",
- "-split-eh",
- "-dyno-stats",
- "-use-gnu-stack",
- "-update-debug-sections",
- "-nl" if opts.method == "PERF" else "",
- ],
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- )
+ args = [
+ opts.bolt,
+ f"{input}-prebolt",
+ "-o",
+ input,
+ "-data",
+ opts.fdata,
+ "-reorder-blocks=ext-tsp",
+ "-reorder-functions=cdsort",
+ "-split-functions",
+ "-split-all-cold",
+ "-split-eh",
+ "-dyno-stats",
+ "-use-gnu-stack",
+ "-update-debug-sections",
+ "-nl" if opts.method == "PERF" else "",
+ ]
+ print("Running: " + " ".join(args))
+ process = subprocess.run(
+ args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ )
- print(process.args)
- for line in process.stdout:
- sys.stdout.write(line)
- process.check_returncode()
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
commands = {
More information about the cfe-commits
mailing list