[clang] Add support for dynamic libraries in CLANG_BOLT (PR #127020)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Feb 17 06:35:58 PST 2025
https://github.com/serge-sans-paille updated https://github.com/llvm/llvm-project/pull/127020
>From a701851462a19b4708599bd1bfcdd154b5b95573 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Thu, 13 Feb 2025 08:54:03 +0100
Subject: [PATCH] [clang][bolt] Improve CLANG_BOLT setup to support shared
libraries
When clang is linked dynamically against libLLVM and libclang-cpp, BOLT
post-processing only optimizes the clang binary.
This patch makes sure BOLT also instruments libLLVM and libclang-cpp;
otherwise, optimizing just the clang binary yields limited benefits.
This currently only works on Linux because it relies on LD_PRELOAD to make
the instrumented binary load the instrumented shared libraries.
---
clang/cmake/caches/BOLT.cmake | 1 +
clang/tools/driver/CMakeLists.txt | 40 ++++++--
clang/utils/perf-training/perf-helper.py | 115 +++++++++++++++--------
3 files changed, 109 insertions(+), 47 deletions(-)
diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake
index eba2346b2f4ca..1956c10463148 100644
--- a/clang/cmake/caches/BOLT.cmake
+++ b/clang/cmake/caches/BOLT.cmake
@@ -1,6 +1,7 @@
set(CMAKE_BUILD_TYPE Release CACHE STRING "")
set(CLANG_BOLT "INSTRUMENT" CACHE STRING "")
set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
+set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index 5d7962769014a..10ea5de387220 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -168,6 +168,28 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
)
set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")
+ set(CLANG_BOLT_INPUTS $<TARGET_FILE:clang>)
+ set(CLANG_INSTRUMENTED_OUTPUTS ${CLANG_INSTRUMENTED})
+
+ # Add in dynamically linked libraries, if needs be. Currently only supported
+ # on Linux because it relies on LD_PRELOAD for instrumentation.
+ if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ if (CLANG_LINK_CLANG_DYLIB)
+ set(CLANG_CPP_BOLT_INSTRUMENTED "clang-cxx-bolt.inst" CACHE STRING
+ "Name of BOLT-instrumented Clang library")
+ set(CLANG_CPP_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_CPP_BOLT_INSTRUMENTED})
+ list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:clang-cpp>)
+ list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${CLANG_CPP_INSTRUMENTED})
+ endif()
+ if (LLVM_LINK_LLVM_DYLIB)
+ set(LLVM_BOLT_INSTRUMENTED "LLVM-bolt.inst" CACHE STRING
+ "Name of BOLT-instrumented LLVM library")
+ set(LLVM_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLVM_BOLT_INSTRUMENTED})
+ list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:LLVM>)
+ list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${LLVM_INSTRUMENTED})
+ endif()
+ endif()
+
# This POST_BUILD command is executed unconditionally even if the clang target
# is already built. We need to wrap the whole bolt optimization process in
# a single python wrapper, so that we can first check if the binary has
@@ -176,15 +198,15 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
TARGET clang POST_BUILD
COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
bolt-optimize
- --method ${CLANG_BOLT}
- --input $<TARGET_FILE:clang>
- --instrumented-output ${CLANG_INSTRUMENTED}
- --fdata ${BOLT_FDATA}
- --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
- --readelf $<TARGET_FILE:llvm-readobj>
- --bolt $<TARGET_FILE:llvm-bolt>
- --lit "${LIT_COMMAND}"
- --merge-fdata $<TARGET_FILE:merge-fdata>
+ --method ${CLANG_BOLT}
+ --input "${CLANG_BOLT_INPUTS}"
+ --instrumented-output "${CLANG_INSTRUMENTED_OUTPUTS}"
+ --fdata ${BOLT_FDATA}
+ --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
+ --readelf $<TARGET_FILE:llvm-readobj>
+ --bolt $<TARGET_FILE:llvm-bolt>
+ --lit "${LIT_COMMAND}"
+ --merge-fdata $<TARGET_FILE:merge-fdata>
COMMENT "Optimizing Clang with BOLT"
USES_TERMINAL
VERBATIM
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index 55c5160a71c4f..cdb6c39f6c50e 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -560,6 +560,23 @@ def genOrderFile(args):
return 0
+def filter_bolt_optimized(inputs, instrumented_outputs):
+ new_inputs = []
+ new_instrumented_ouputs = []
+ for input, instrumented_output in zip(inputs, instrumented_outputs):
+ output = subprocess.check_output(
+ [opts.readelf, "-WS", input], universal_newlines=True
+ )
+
+ # This binary has already been bolt-optimized, so skip further processing.
+ if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+ print(f"Skipping {input}, it's already instrumented")
+ else:
+ new_inputs.append(input)
+ new_instrumented_ouputs.append(instrumented_output)
+ return new_inputs, new_instrumented_ouputs
+
+
def bolt_optimize(args):
parser = argparse.ArgumentParser("%prog [options] ")
parser.add_argument("--method", choices=["INSTRUMENT", "PERF", "LBR"])
@@ -574,47 +591,67 @@ def bolt_optimize(args):
opts = parser.parse_args(args)
- output = subprocess.check_output(
- [opts.readelf, "-WS", opts.input], universal_newlines=True
- )
+ inputs = opts.input.split(";")
+ instrumented_outputs = opts.instrumented_output.split(";")
+ assert len(inputs) == len(
+ instrumented_outputs
+ ), "inconsistent --input / --instrumented-output arguments"
- # This binary has already been bolt-optimized, so skip further processing.
- if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+ inputs, instrumented_outputs = filter_bolt_optimized(inputs, instrumented_outputs)
+ if not inputs:
return 0
+ environ = os.environ.copy()
if opts.method == "INSTRUMENT":
- process = subprocess.run(
- [
+ preloads = []
+ for input, instrumented_output in zip(inputs, instrumented_outputs):
+ args = [
opts.bolt,
- opts.input,
+ input,
"-o",
- opts.instrumented_output,
+ instrumented_output,
"-instrument",
"--instrumentation-file-append-pid",
f"--instrumentation-file={opts.fdata}",
- ],
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- )
+ ]
+ print("Running: " + " ".join(args))
+ process = subprocess.run(
+ args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ )
- print(process.args)
- for line in process.stdout:
- sys.stdout.write(line)
- process.check_returncode()
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
+ output = subprocess.check_output(
+ [opts.readelf, "--file-header", input], universal_newlines=True
+ )
+ if re.search(r"Type:\s*((Shared)|(DYN))", output):
+ # force using the instrumented version
+ preloads.append(instrumented_output)
+
+ if preloads:
+ print("Patching execution environment for dynamic library")
+ environ["LD_PRELOAD"] = os.pathsep.join(preloads)
+
+ args = [
+ sys.executable,
+ opts.lit,
+ "-v",
+ os.path.join(opts.perf_training_binary_dir, f"bolt-fdata"),
+ ]
+ print("Running: " + " ".join(args))
process = subprocess.run(
- [
- sys.executable,
- opts.lit,
- os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
- ],
+ args,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
+ env=environ,
)
- print(process.args)
for line in process.stdout:
sys.stdout.write(line)
process.check_returncode()
@@ -624,14 +661,14 @@ def bolt_optimize(args):
merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
- shutil.copy(opts.input, f"{opts.input}-prebolt")
+ for input in inputs:
+ shutil.copy(input, f"{input}-prebolt")
- process = subprocess.run(
- [
+ args = [
opts.bolt,
- f"{opts.input}-prebolt",
+ f"{input}-prebolt",
"-o",
- opts.input,
+ input,
"-data",
opts.fdata,
"-reorder-blocks=ext-tsp",
@@ -643,16 +680,18 @@ def bolt_optimize(args):
"-use-gnu-stack",
"-update-debug-sections",
"-nl" if opts.method == "PERF" else "",
- ],
- stdout=subprocess.PIPE,
- stderr=subprocess.STDOUT,
- text=True,
- )
+ ]
+ print("Running: " + " ".join(args))
+ process = subprocess.run(
+ args,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ )
- print(process.args)
- for line in process.stdout:
- sys.stdout.write(line)
- process.check_returncode()
+ for line in process.stdout:
+ sys.stdout.write(line)
+ process.check_returncode()
commands = {
More information about the cfe-commits
mailing list