[clang] Add support for dynamic libraries in CLANG_BOLT (PR #127020)

via cfe-commits cfe-commits at lists.llvm.org
Thu Feb 13 01:26:53 PST 2025


https://github.com/serge-sans-paille updated https://github.com/llvm/llvm-project/pull/127020

>From c2d1352aba4872957e34633b92d87c39d0eb7e45 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Tue, 11 Feb 2025 18:20:15 +0100
Subject: [PATCH 1/2] [clang][cmake] Sanitize CLANG_BOLT values

Reject unsupported CLANG_BOLT values at configure time. This avoids
failing later in the build process.
---
 clang/tools/driver/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index ad336fcc45b60..5d7962769014a 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -23,10 +23,14 @@ if(CLANG_PLUGIN_SUPPORT)
   set(support_plugins SUPPORT_PLUGINS)
 endif()
 
+set(CLANG_BOLT_ALLOWLIST INSTRUMENT PERF LBR)
 set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \
-  May be specified as Instrument or Perf or LBR to use a particular profiling \
+May be specified as one of ${CLANG_BOLT_ALLOWLIST} to use a particular profiling \
   mechanism.")
 string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT)
+if (CLANG_BOLT AND NOT CLANG_BOLT IN_LIST CLANG_BOLT_ALLOWLIST)
+    message(FATAL_ERROR "Specified CLANG_BOLT value '${CLANG_BOLT}' is not one of ${CLANG_BOLT_ALLOWLIST}.")
+endif()
 
 if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
   set(CLANG_BOLT_DEPS clear-bolt-fdata llvm-bolt llvm-readobj)

>From c3a958828947b3f310270d8d8f3545b8d8624dc7 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Thu, 13 Feb 2025 08:54:03 +0100
Subject: [PATCH 2/2] [clang][bolt] Improve CLANG_BOLT setup to support shared
 libraries

When clang is linked dynamically against libLLVM and libclang-cpp, BOLT
post-processing only optimizes the clang binary.

This patch makes sure libLLVM and libclang-cpp are also instrumented and
optimized; otherwise, optimizing just the clang binary yields limited
benefits.

This currently only works on Linux due to reliance on LD_PRELOAD to have
the instrumented binary use the instrumented shared libraries.
---
 clang/cmake/caches/BOLT.cmake            |   1 +
 clang/tools/driver/CMakeLists.txt        |  40 ++++++--
 clang/utils/perf-training/perf-helper.py | 115 +++++++++++++++--------
 3 files changed, 109 insertions(+), 47 deletions(-)

diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake
index eba2346b2f4ca..1956c10463148 100644
--- a/clang/cmake/caches/BOLT.cmake
+++ b/clang/cmake/caches/BOLT.cmake
@@ -1,6 +1,7 @@
 set(CMAKE_BUILD_TYPE Release CACHE STRING "")
 set(CLANG_BOLT "INSTRUMENT" CACHE STRING "")
 set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
+set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
 
 set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
 set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt
index 5d7962769014a..10ea5de387220 100644
--- a/clang/tools/driver/CMakeLists.txt
+++ b/clang/tools/driver/CMakeLists.txt
@@ -168,6 +168,28 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
   )
   set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}")
 
+  set(CLANG_BOLT_INPUTS $<TARGET_FILE:clang>)
+  set(CLANG_INSTRUMENTED_OUTPUTS ${CLANG_INSTRUMENTED})
+
+  # Add in dynamically linked libraries, if needs be. Currently only supported
+  # on Linux because it relies on LD_PRELOAD for instrumentation.
+  if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    if (CLANG_LINK_CLANG_DYLIB)
+     set(CLANG_CPP_BOLT_INSTRUMENTED "clang-cxx-bolt.inst" CACHE STRING
+       "Name of BOLT-instrumented Clang library")
+     set(CLANG_CPP_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_CPP_BOLT_INSTRUMENTED})
+     list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:clang-cpp>)
+     list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${CLANG_CPP_INSTRUMENTED})
+    endif()
+    if (LLVM_LINK_LLVM_DYLIB)
+      set(LLVM_BOLT_INSTRUMENTED "LLVM-bolt.inst" CACHE STRING
+        "Name of BOLT-instrumented LLVM library")
+      set(LLVM_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${LLVM_BOLT_INSTRUMENTED})
+      list(APPEND CLANG_BOLT_INPUTS $<TARGET_FILE:LLVM>)
+      list(APPEND CLANG_INSTRUMENTED_OUTPUTS ${LLVM_INSTRUMENTED})
+    endif()
+  endif()
+
   # This POST_BUILD command is executed unconditionally even if the clang target
   # is already built.  We need to wrap the whole bolt optimization process in
   # a single python wrapper, so that we can first check if the binary has
@@ -176,15 +198,15 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED)
     TARGET clang POST_BUILD
     COMMAND  "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py
              bolt-optimize
-	     --method ${CLANG_BOLT}
-	     --input $<TARGET_FILE:clang>
-	     --instrumented-output ${CLANG_INSTRUMENTED}
-	     --fdata ${BOLT_FDATA}
-	     --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
-	     --readelf $<TARGET_FILE:llvm-readobj>
-	     --bolt $<TARGET_FILE:llvm-bolt>
-	     --lit "${LIT_COMMAND}"
-	     --merge-fdata $<TARGET_FILE:merge-fdata>
+             --method ${CLANG_BOLT}
+             --input "${CLANG_BOLT_INPUTS}"
+             --instrumented-output "${CLANG_INSTRUMENTED_OUTPUTS}"
+             --fdata ${BOLT_FDATA}
+             --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR}
+             --readelf $<TARGET_FILE:llvm-readobj>
+             --bolt $<TARGET_FILE:llvm-bolt>
+             --lit "${LIT_COMMAND}"
+             --merge-fdata $<TARGET_FILE:merge-fdata>
     COMMENT "Optimizing Clang with BOLT"
     USES_TERMINAL
     VERBATIM
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index 55c5160a71c4f..cdb6c39f6c50e 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -560,6 +560,23 @@ def genOrderFile(args):
     return 0
 
 
+def filter_bolt_optimized(inputs, instrumented_outputs):
+    new_inputs = []
+    new_instrumented_ouputs = []
+    for input, instrumented_output in zip(inputs, instrumented_outputs):
+        output = subprocess.check_output(
+            [opts.readelf, "-WS", input], universal_newlines=True
+        )
+
+        # This binary has already been bolt-optimized, so skip further processing.
+        if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+            print(f"Skipping {input}, it's already instrumented")
+        else:
+            new_inputs.append(input)
+            new_instrumented_ouputs.append(instrumented_output)
+    return new_inputs, new_instrumented_ouputs
+
+
 def bolt_optimize(args):
     parser = argparse.ArgumentParser("%prog  [options] ")
     parser.add_argument("--method", choices=["INSTRUMENT", "PERF", "LBR"])
@@ -574,47 +591,67 @@ def bolt_optimize(args):
 
     opts = parser.parse_args(args)
 
-    output = subprocess.check_output(
-        [opts.readelf, "-WS", opts.input], universal_newlines=True
-    )
+    inputs = opts.input.split(";")
+    instrumented_outputs = opts.instrumented_output.split(";")
+    assert len(inputs) == len(
+        instrumented_outputs
+    ), "inconsistent --input / --instrumented-output arguments"
 
-    # This binary has already been bolt-optimized, so skip further processing.
-    if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE):
+    inputs, instrumented_outputs = filter_bolt_optimized(inputs, instrumented_outputs)
+    if not inputs:
         return 0
 
+    environ = os.environ.copy()
     if opts.method == "INSTRUMENT":
-        process = subprocess.run(
-            [
+        preloads = []
+        for input, instrumented_output in zip(inputs, instrumented_outputs):
+            args = [
                 opts.bolt,
-                opts.input,
+                input,
                 "-o",
-                opts.instrumented_output,
+                instrumented_output,
                 "-instrument",
                 "--instrumentation-file-append-pid",
                 f"--instrumentation-file={opts.fdata}",
-            ],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            text=True,
-        )
+            ]
+            print("Running: " + " ".join(args))
+            process = subprocess.run(
+                args,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                text=True,
+            )
 
-        print(process.args)
-        for line in process.stdout:
-            sys.stdout.write(line)
-        process.check_returncode()
+            for line in process.stdout:
+                sys.stdout.write(line)
+            process.check_returncode()
 
+            output = subprocess.check_output(
+                [opts.readelf, "--file-header", input], universal_newlines=True
+            )
+            if re.search(r"Type:\s*((Shared)|(DYN))", output):
+                # force using the instrumented version
+                preloads.append(instrumented_output)
+
+        if preloads:
+            print("Patching execution environment for dynamic library")
+            environ["LD_PRELOAD"] = os.pathsep.join(preloads)
+
+    args = [
+        sys.executable,
+        opts.lit,
+        "-v",
+        os.path.join(opts.perf_training_binary_dir, f"bolt-fdata"),
+    ]
+    print("Running: " + " ".join(args))
     process = subprocess.run(
-        [
-            sys.executable,
-            opts.lit,
-            os.path.join(opts.perf_training_binary_dir, "bolt-fdata"),
-        ],
+        args,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
         text=True,
+        env=environ,
     )
 
-    print(process.args)
     for line in process.stdout:
         sys.stdout.write(line)
     process.check_returncode()
@@ -624,14 +661,14 @@ def bolt_optimize(args):
 
     merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir])
 
-    shutil.copy(opts.input, f"{opts.input}-prebolt")
+    for input in inputs:
+        shutil.copy(input, f"{input}-prebolt")
 
-    process = subprocess.run(
-        [
+        args = [
             opts.bolt,
-            f"{opts.input}-prebolt",
+            f"{input}-prebolt",
             "-o",
-            opts.input,
+            input,
             "-data",
             opts.fdata,
             "-reorder-blocks=ext-tsp",
@@ -643,16 +680,18 @@ def bolt_optimize(args):
             "-use-gnu-stack",
             "-update-debug-sections",
             "-nl" if opts.method == "PERF" else "",
-        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-    )
+        ]
+        print("Running: " + " ".join(args))
+        process = subprocess.run(
+            args,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
 
-    print(process.args)
-    for line in process.stdout:
-        sys.stdout.write(line)
-    process.check_returncode()
+        for line in process.stdout:
+            sys.stdout.write(line)
+        process.check_returncode()
 
 
 commands = {



More information about the cfe-commits mailing list