[clang] 3dab7fe - [CMake] Add clang-bolt target

Amir Ayupov via cfe-commits cfe-commits at lists.llvm.org
Fri Sep 23 01:10:41 PDT 2022


Author: Amir Ayupov
Date: 2022-09-23T10:10:31+02:00
New Revision: 3dab7fede2019c399d793c43ca9ea5a4f2d5031f

URL: https://github.com/llvm/llvm-project/commit/3dab7fede2019c399d793c43ca9ea5a4f2d5031f
DIFF: https://github.com/llvm/llvm-project/commit/3dab7fede2019c399d793c43ca9ea5a4f2d5031f.diff

LOG: [CMake] Add clang-bolt target

This patch adds a `CLANG_BOLT_INSTRUMENT` option that applies BOLT instrumentation
to Clang, performs a bootstrap build with the resulting Clang, merges the resulting
fdata files into a single profile file, and uses it to perform BOLT optimization
on the original Clang binary.

The projects and targets used for bootstrap/profile collection are configurable via
`CLANG_BOLT_INSTRUMENT_PROJECTS` and `CLANG_BOLT_INSTRUMENT_TARGETS`.
The defaults are "llvm" and "count" respectively, which results in a profile with
~5.3B dynamically executed instructions.

The intended use of the functionality is through the BOLT CMake cache file, similar
to the PGO 2-stage build:
```
cmake <llvm-project>/llvm -C <llvm-project>/clang/cmake/caches/BOLT.cmake
ninja clang++-bolt # pulls clang-bolt
```

Stats with a recent checkout (clang-16), pre-built BOLT and Clang, 72vCPU/224G
| CMake configure with host Clang + BOLT.cmake | 1m6.592s
| Instrumenting Clang with BOLT | 2m50.508s
| CMake configure `llvm` with instrumented Clang | 5m46.364s (~5x slowdown)
| CMake build `not` with instrumented Clang | 0m6.456s
| Merging fdata files | 0m9.439s
| Optimizing Clang with BOLT | 0m39.201s

Building Clang:
```
cmake ../llvm-project/llvm -DCMAKE_C_COMPILER=... -DCMAKE_CXX_COMPILER=...
  -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS=clang
  -DLLVM_TARGETS_TO_BUILD=Native -GNinja
```

| | Release | BOLT-optimized
| cmake | 0m24.016s | 0m22.333s
| ninja clang | 5m55.692s | 4m35.122s

I know it's not rigorous, but it shows a ballpark figure.

Reviewed By: phosek

Differential Revision: https://reviews.llvm.org/D132975

Added: 
    clang/cmake/caches/BOLT.cmake

Modified: 
    clang/CMakeLists.txt
    clang/utils/perf-training/perf-helper.py

Removed: 
    


################################################################################
diff  --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 02ce5de4652de..22b5118c83eda 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -443,7 +443,7 @@ CMAKE_DEPENDENT_OPTION(CLANG_PLUGIN_SUPPORT
   "HAVE_CLANG_PLUGIN_SUPPORT" OFF)
 
 # If libstdc++ is statically linked, clang-repl needs to statically link libstdc++
-# itself, which is not possible in many platforms because of current limitations in 
+# itself, which is not possible in many platforms because of current limitations in
 # JIT stack. (more platforms need to be supported by JITLink)
 if(NOT LLVM_STATIC_LINK_CXX_STDLIB)
   set(HAVE_CLANG_REPL_SUPPORT ON)
@@ -881,6 +881,118 @@ if (CLANG_ENABLE_BOOTSTRAP)
   endforeach()
 endif()
 
+if (CLANG_BOLT_INSTRUMENT)
+  set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang)
+  set(CLANGXX_PATH ${CLANG_PATH}++)
+  set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst)
+  set(CLANGXX_INSTRUMENTED ${CLANGXX_PATH}-bolt.inst)
+  set(CLANG_OPTIMIZED ${CLANG_PATH}-bolt)
+  set(CLANGXX_OPTIMIZED ${CLANGXX_PATH}-bolt)
+
+  # Instrument clang with BOLT
+  add_custom_target(clang-instrumented
+    DEPENDS ${CLANG_INSTRUMENTED}
+  )
+  add_custom_command(OUTPUT ${CLANG_INSTRUMENTED}
+    DEPENDS clang llvm-bolt
+    COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED}
+      -instrument --instrumentation-file-append-pid
+      --instrumentation-file=${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+    COMMENT "Instrumenting clang binary with BOLT"
+    VERBATIM
+  )
+
+  # Make a symlink from clang-bolt.inst to clang++-bolt.inst
+  add_custom_target(clang++-instrumented
+    DEPENDS ${CLANGXX_INSTRUMENTED}
+  )
+  add_custom_command(OUTPUT ${CLANGXX_INSTRUMENTED}
+    DEPENDS clang-instrumented
+    COMMAND ${CMAKE_COMMAND} -E create_symlink
+      ${CLANG_INSTRUMENTED}
+      ${CLANGXX_INSTRUMENTED}
+    COMMENT "Creating symlink from BOLT instrumented clang to clang++"
+    VERBATIM
+  )
+
+  # Build specified targets with instrumented Clang to collect the profile
+  set(STAMP_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-stamps/)
+  set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/bolt-instrumented-clang-bins/)
+  set(build_configuration "$<CONFIG>")
+  include(ExternalProject)
+  ExternalProject_Add(bolt-instrumentation-profile
+    DEPENDS clang++-instrumented
+    PREFIX bolt-instrumentation-profile
+    SOURCE_DIR ${CMAKE_SOURCE_DIR}
+    STAMP_DIR ${STAMP_DIR}
+    BINARY_DIR ${BINARY_DIR}
+    EXCLUDE_FROM_ALL 1
+    CMAKE_ARGS
+                ${CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS}
+                # We shouldn't need to set this here, but INSTALL_DIR doesn't
+                # seem to work, so instead I'm passing this through
+                -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}
+                -DCMAKE_C_COMPILER=${CLANG_INSTRUMENTED}
+                -DCMAKE_CXX_COMPILER=${CLANGXX_INSTRUMENTED}
+                -DCMAKE_ASM_COMPILER=${CLANG_INSTRUMENTED}
+                -DCMAKE_ASM_COMPILER_ID=Clang
+                -DCMAKE_BUILD_TYPE=Release
+                -DLLVM_ENABLE_PROJECTS=${CLANG_BOLT_INSTRUMENT_PROJECTS}
+                -DLLVM_TARGETS_TO_BUILD=${LLVM_TARGETS_TO_BUILD}
+    BUILD_COMMAND ${CMAKE_COMMAND} --build ${BINARY_DIR}
+                                   --config ${build_configuration}
+                                   --target ${CLANG_BOLT_INSTRUMENT_TARGETS}
+    INSTALL_COMMAND ""
+    STEP_TARGETS configure build
+    USES_TERMINAL_CONFIGURE 1
+    USES_TERMINAL_BUILD 1
+    USES_TERMINAL_INSTALL 1
+  )
+
+  # Merge profiles into one using merge-fdata
+  add_custom_target(clang-bolt-profile
+    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+  )
+  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+    DEPENDS merge-fdata bolt-instrumentation-profile-build
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    COMMAND ${Python3_EXECUTABLE}
+      ${CMAKE_CURRENT_SOURCE_DIR}/utils/perf-training/perf-helper.py merge-fdata
+      $<TARGET_FILE:merge-fdata> ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+      ${CMAKE_CURRENT_BINARY_DIR}
+    COMMENT "Preparing BOLT profile"
+    VERBATIM
+  )
+
+  # Optimize original (pre-bolt) Clang using the collected profile
+  add_custom_target(clang-bolt
+    DEPENDS ${CLANG_OPTIMIZED}
+  )
+  add_custom_command(OUTPUT ${CLANG_OPTIMIZED}
+    DEPENDS clang-bolt-profile
+    COMMAND llvm-bolt ${CLANG_PATH}
+      -o ${CLANG_OPTIMIZED}
+      -data ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata
+      -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions
+      -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack
+    COMMENT "Optimizing Clang with BOLT"
+    VERBATIM
+  )
+
+  # Make a symlink from clang-bolt to clang++-bolt
+  add_custom_target(clang++-bolt
+    DEPENDS ${CLANGXX_OPTIMIZED}
+  )
+  add_custom_command(OUTPUT ${CLANGXX_OPTIMIZED}
+    DEPENDS clang-bolt
+    COMMAND ${CMAKE_COMMAND} -E create_symlink
+      ${CLANG_OPTIMIZED}
+      ${CLANGXX_OPTIMIZED}
+    COMMENT "Creating symlink from BOLT optimized clang to clang++"
+    VERBATIM
+  )
+endif()
+
 if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION)
   add_subdirectory(utils/ClangVisualizers)
 endif()

diff  --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake
new file mode 100644
index 0000000000000..65444c8044c3b
--- /dev/null
+++ b/clang/cmake/caches/BOLT.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_BUILD_TYPE Release CACHE STRING "")
+set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "")
+set(CLANG_BOLT_INSTRUMENT_PROJECTS "llvm" CACHE STRING "")
+set(CLANG_BOLT_INSTRUMENT_TARGETS "count" CACHE STRING "")
+set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "")
+set(CLANG_BOLT_INSTRUMENT_EXTRA_CMAKE_FLAGS "" CACHE STRING "")
+
+set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "")
+set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
+
+# Disable function splitting enabled by default in GCC8+
+if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-reorder-blocks-and-partition")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-reorder-blocks-and-partition")
+endif()

diff  --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index 88708a92712a9..c6a815e654736 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -38,7 +38,7 @@ def clean(args):
 
 def merge(args):
   if len(args) != 3:
-    print('Usage: %s clean <llvm-profdata> <output> <path>\n' % __file__ +
+    print('Usage: %s merge <llvm-profdata> <output> <path>\n' % __file__ +
       '\tMerges all profraw files from path into output.')
     return 1
   cmd = [args[0], 'merge', '-o', args[1]]
@@ -46,6 +46,16 @@ def merge(args):
   subprocess.check_call(cmd)
   return 0
 
+def merge_fdata(args):
+  if len(args) != 3:
+    print('Usage: %s merge-fdata <merge-fdata> <output> <path>\n' % __file__ +
+      '\tMerges all fdata files from path into output.')
+    return 1
+  cmd = [args[0], '-o', args[1]]
+  cmd.extend(findFilesWithExtension(args[2], "fdata"))
+  subprocess.check_call(cmd)
+  return 0
+
 def dtrace(args):
   parser = argparse.ArgumentParser(prog='perf-helper dtrace',
     description='dtrace wrapper for order file generation')
@@ -395,10 +405,12 @@ def genOrderFile(args):
   return 0
 
 commands = {'clean' : clean,
-  'merge' : merge, 
+  'merge' : merge,
   'dtrace' : dtrace,
   'cc1' : cc1,
-  'gen-order-file' : genOrderFile}
+  'gen-order-file' : genOrderFile,
+  'merge-fdata' : merge_fdata,
+  }
 
 def main():
   f = commands[sys.argv[1]]


        


More information about the cfe-commits mailing list