[libc-commits] [libc] 6d0e137 - [libc] Remove OpenMP and build the GPU libc directly

Thu Feb 2 07:47:10 PST 2023

Author: Joseph Huber
Date: 2023-02-02T09:47:03-06:00
New Revision: 6d0e1373589a105696854d66d9146b555b791a5f

URL: https://github.com/llvm/llvm-project/commit/6d0e1373589a105696854d66d9146b555b791a5f
DIFF: https://github.com/llvm/llvm-project/commit/6d0e1373589a105696854d66d9146b555b791a5f.diff

LOG: [libc] Remove OpenMP and build the GPU libc directly

The current `libcgpu.a` is actually an archive of fatbinaries. The host
file contains nothing but a section called `LLVM_OFFLOADING` that
contains embedded device code. This used to be handled implicitly by
borrowing the OpenMP toolchain, which did this packaging internally.
However, passing the OpenMP flags causes problems when trying to move to testing.
This patch pulls this logic out into the CMake and handles it manually.

This patch is a lot of noise, but it fundamentally comes down to the
following changes:
1. Build the source for every GPU architecture (GPU architectures are
   generally not backwards compatible).
2. Combine all of these files into a single binary blob.
3. Embed that binary blob into a host file.
4. Package these host files into a `.a` archive.
5. The device code will be extracted and managed by the offloading
   linker.

Another important point: right now we maintain a deliberate distinction
in the GPU build. When we build the exported library we build for many
GPU architectures. However, the internal version is only built for a
single GPU architecture, one that was found on the user's system. This
is intended to be used for internal testing, very similar to the current
path where `libc` is compiled for a single target triple.

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D143089

Added: 
    

Modified: 
    libc/cmake/modules/LLVMLibCObjectRules.cmake
    libc/cmake/modules/prepare_libc_gpu_build.cmake
    libc/src/__support/common.h

Removed: 
    


################################################################################
diff  --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 24fd36080814e..4ac540a4c550f 100644

--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -52,19 +52,149 @@ function(_get_common_compile_options output_var flags)
     endif()
   endif()
   if (LIBC_TARGET_ARCHITECTURE_IS_GPU)
-    list(APPEND compile_options "-fopenmp")
-    list(APPEND compile_options "-fopenmp-cuda-mode")
-    foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
-      list(APPEND compile_options "--offload-arch=${gpu_arch}")
-    endforeach()
     list(APPEND compile_options "-nogpulib")
-    list(APPEND compile_options "-nogpuinc")
     list(APPEND compile_options "-fvisibility=hidden")
-    list(APPEND compile_options "-foffload-lto")
   endif()
   set(${output_var} ${compile_options} PARENT_SCOPE)
 endfunction()
 
+# Builds the entrypoint target for the GPU.
+# Usage:
+#     _build_gpu_entrypoint_objects(
+#       <target_name>
+#       SRCS <list of .cpp files>
+#       HDRS <list of .h files>
+#       DEPENDS <list of dependencies>
+#       COMPILE_OPTIONS <optional list of special compile options for this target>
+#       FLAGS <optional list of flags>
+#     )
+function(_build_gpu_entrypoint_objects fq_target_name)
+  cmake_parse_arguments(
+    "ADD_GPU_ENTRYPOINT_OBJ"
+    "" # No optional arguments
+    "NAME;CXX_STANDARD" # Single value arguments
+    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;FLAGS"  # Multi value arguments
+    ${ARGN}
+  )
+
+  # The packaged version will be built for every target GPU architecture. We do
+  # this so we can support multiple accelerators on the same machine.
+  foreach(gpu_arch ${all_gpu_architectures})
+    set(gpu_target_name ${fq_target_name}.${gpu_arch})
+    set(compile_options ${ADD_GPU_ENTRYPOINT_OBJ_COMPILE_OPTIONS})
+    # Derive the triple from the specified architecture.
+    if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
+      set(gpu_target_triple "amdgcn-amd-amdhsa")
+      list(APPEND compile_options "-mcpu=${gpu_arch}")
+    elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
+      set(gpu_target_triple "nvptx64-nvidia-cuda")
+      list(APPEND compile_options "-march=${gpu_arch}")
+    else()
+      message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
+    endif()
+    list(APPEND compile_options "--target=${gpu_target_triple}")
+    list(APPEND compile_options "-emit-llvm")
+
+    # Build the library for this target architecture. We always emit LLVM-IR for
+    # packaged GPU binaries.
+    add_library(${gpu_target_name}
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_GPU_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_GPU_ENTRYPOINT_OBJ_HDRS}
+    )
+
+    target_compile_options(${gpu_target_name} PRIVATE ${compile_options})
+    target_include_directories(${gpu_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${gpu_target_name} ${ADD_GPU_ENTRYPOINT_OBJ_DEPENDS})
+    target_compile_definitions(${gpu_target_name} PRIVATE LLVM_LIBC_PUBLIC_PACKAGING)
+
+    # Append this target to a list of images to package into a single binary.
+    set(input_file $<TARGET_OBJECTS:${gpu_target_name}>)
+    list(APPEND packager_images
+         --image=file=${input_file},arch=${gpu_arch},triple=${gpu_target_triple})
+    list(APPEND gpu_target_names ${gpu_target_name})
+  endforeach()
+
+  # After building the target for the desired GPUs we must package the output
+  # into a fatbinary, see https://clang.llvm.org/docs/OffloadingDesign.html for
+  # more information.
+  set(packaged_target_name ${fq_target_name}.__gpu__)
+  set(packaged_output_name ${CMAKE_CURRENT_BINARY_DIR}/${fq_target_name}.gpubin)
+
+  add_custom_command(OUTPUT ${packaged_output_name}
+                     COMMAND ${LIBC_CLANG_OFFLOAD_PACKAGER}
+                             ${packager_images} -o ${packaged_output_name}
+                     DEPENDS ${gpu_target_names}
+                     COMMENT "Packaging LLVM offloading binary")
+  add_custom_target(${packaged_target_name} DEPENDS ${packaged_output_name})
+
+  # We create an empty 'stub' file for the host to contain the embedded device
+  # code. This will be packaged into 'libcgpu.a'.
+  # TODO: In the future we will want to combine every architecture for a target
+  #       into a single bitcode file and use that. For now we simply build for
+  #       every single one and let the offloading linker handle it.
+  get_filename_component(stub_filename ${ADD_GPU_ENTRYPOINT_OBJ_SRCS} NAME)
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${stub_filename} "// Empty file.\n")
+  add_library(
+    ${fq_target_name}
+    # We want an object library as the objects will eventually get packaged into
+    # an archive (like libcgpu.a).
+    EXCLUDE_FROM_ALL
+    OBJECT
+    "${CMAKE_CURRENT_BINARY_DIR}/${stub_filename}"
+  )
+  target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options}
+                         -DLLVM_LIBC_PUBLIC_PACKAGING
+                         -nostdlib -Xclang -fembed-offload-object=${packaged_output_name})
+  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
+  add_dependencies(${fq_target_name} ${full_deps_list} ${packaged_target_name})
+
+  set_target_properties(
+    ${fq_target_name}
+    PROPERTIES
+      ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
+      TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+      OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
+      CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+      DEPS "${fq_deps_list}"
+      FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+  )
+
+  # We only build the internal target for a single supported architecture.
+  set(internal_target_name ${fq_target_name}.__internal__)
+  set(include_dirs ${LIBC_BUILD_DIR}/include ${LIBC_SOURCE_DIR} ${LIBC_BUILD_DIR})
+  if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU OR
+     LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+    add_library(
+      ${internal_target_name}
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${internal_target_name} BEFORE PRIVATE
+                           ${common_compile_options} --target=${LIBC_GPU_TARGET_TRIPLE})
+    if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
+      target_compile_options(${internal_target_name} PRIVATE -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE})
+    elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
+      target_compile_options(${internal_target_name} PRIVATE -march=${LIBC_GPU_TARGET_ARCHITECTURE})
+    endif()
+    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${internal_target_name} ${full_deps_list})
+    set_target_properties(
+      ${internal_target_name}
+      PROPERTIES
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
+    set_target_properties(
+      ${fq_target_name}
+      PROPERTIES OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
+    )
+  endif()
+endfunction()
+
 # Rule which is essentially a wrapper over add_library to compile a set of
 # sources to object files.
 # Usage:
@@ -127,7 +257,6 @@ function(create_object_library fq_target_name)
   if(NOT ADD_OBJECT_CXX_STANDARD)
     set(ADD_OBJECT_CXX_STANDARD ${CMAKE_CXX_STANDARD})
   endif()
-  
   set_target_properties(
     ${fq_target_name}
     PROPERTIES
@@ -350,53 +479,67 @@ function(create_entrypoint_object fq_target_name)
     endif()
   endif()
 
-  add_library(
-    ${internal_target_name}
-    # TODO: We don't need an object library for internal consumption.
-    # A future change should switch this to a normal static library.
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_ENTRYPOINT_OBJ_SRCS}
-    ${ADD_ENTRYPOINT_OBJ_HDRS}
-  )
-  target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
-  target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
-  add_dependencies(${internal_target_name} ${full_deps_list})
-  set_target_properties(
-    ${internal_target_name}
-    PROPERTIES
+  # GPU builds require special handling for the objects because we want to
+  # export several different targets at once, e.g. for both Nvidia and AMD.
+  if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
+    _build_gpu_entrypoint_objects(
+      ${fq_target_name}
+      SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
+      HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
+      COMPILE_OPTIONS ${common_compile_options}
+      DEPENDS ${full_deps_list}
       CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
       FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-  )
+    )
+  else()
+    add_library(
+      ${internal_target_name}
+      # TODO: We don't need an object library for internal consumption.
+      # A future change should switch this to a normal static library.
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
+    target_include_directories(${internal_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${internal_target_name} ${full_deps_list})
+    set_target_properties(
+      ${internal_target_name}
+      PROPERTIES
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
 
-  add_library(
-    ${fq_target_name}
-    # We want an object library as the objects will eventually get packaged into
-    # an archive (like libc.a).
-    EXCLUDE_FROM_ALL
-    OBJECT
-    ${ADD_ENTRYPOINT_OBJ_SRCS}
-    ${ADD_ENTRYPOINT_OBJ_HDRS}
-  )
-  target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
-  target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
-  add_dependencies(${fq_target_name} ${full_deps_list})
+    add_library(
+      ${fq_target_name}
+      # We want an object library as the objects will eventually get packaged into
+      # an archive (like libc.a).
+      EXCLUDE_FROM_ALL
+      OBJECT
+      ${ADD_ENTRYPOINT_OBJ_SRCS}
+      ${ADD_ENTRYPOINT_OBJ_HDRS}
+    )
+    target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLLVM_LIBC_PUBLIC_PACKAGING)
+    target_include_directories(${fq_target_name} PRIVATE ${include_dirs})
+    add_dependencies(${fq_target_name} ${full_deps_list})
 
-  set_target_properties(
-    ${fq_target_name}
-    PROPERTIES
-      ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
-      TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
-      OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
-      # TODO: We don't need to list internal object files if the internal
-      # target is a normal static library.
-      OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
-      CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
-      DEPS "${fq_deps_list}"
-      FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
-  )
+    set_target_properties(
+      ${fq_target_name}
+      PROPERTIES
+        ENTRYPOINT_NAME ${ADD_ENTRYPOINT_OBJ_NAME}
+        TARGET_TYPE ${ENTRYPOINT_OBJ_TARGET_TYPE}
+        OBJECT_FILE "$<TARGET_OBJECTS:${fq_target_name}>"
+        # TODO: We don't need to list internal object files if the internal
+        # target is a normal static library.
+        OBJECT_FILE_RAW "$<TARGET_OBJECTS:${internal_target_name}>"
+        CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
+        DEPS "${fq_deps_list}"
+        FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
+    )
+  endif()
 
-  if(LLVM_LIBC_ENABLE_LINTING)
+  if(LLVM_LIBC_ENABLE_LINTING AND TARGET ${internal_target_name})
     if(NOT LLVM_LIBC_CLANG_TIDY)
       message(FATAL_ERROR "Something is wrong!  LLVM_LIBC_ENABLE_LINTING is "
               "ON but LLVM_LIBC_CLANG_TIDY is not set.")

diff  --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index 8dfa0fa5d4c6f..68f30f9d5ce5e 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -4,12 +4,14 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
 endif()
 
 # Set up the target architectures to build the GPU libc for.
-set(all_gpu_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62;"
-                          "sm_70;sm_72;sm_75;sm_80;sm_86;gfx700;gfx701;gfx801;"
-                          "gfx803;gfx900;gfx902;gfx906;gfx908;gfx90a;gfx90c;"
-                          "gfx940;gfx1010;gfx1030;gfx1031;gfx1032;gfx1033;"
-                          "gfx1034;gfx1035;gfx1036;gfx1100;gfx1101;gfx1102;"
-                          "gfx1103")
+set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906;"
+                             "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030;"
+                             "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036;"
+                             "gfx1100;gfx1101;gfx1102;gfx1103")
+set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62;"
+                            "sm_70;sm_72;sm_75;sm_80;sm_86")
+set(all_gpu_architectures
+    "${all_amdgpu_architectures};${all_nvptx_architectures}")
 set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures} CACHE STRING
     "List of GPU architectures to build the libc for.")
 if(LIBC_GPU_ARCHITECTURES STREQUAL "all")
@@ -29,6 +31,15 @@ if(NOT LLVM_LIBC_FULL_BUILD)
                       "GPU.")
 endif()
 
+# Identify the program used to package multiple images into a single binary.
+find_program(LIBC_CLANG_OFFLOAD_PACKAGER
+             NAMES clang-offload-packager
+             PATHS ${LLVM_BINARY_DIR}/bin)
+if(NOT LIBC_CLANG_OFFLOAD_PACKAGER)
+  message(FATAL_ERROR "Cannot find the 'clang-offload-packager' for the GPU "
+                      "build")
+endif()
+
 # Identify any locally installed AMD GPUs on the system to use for testing.
 find_program(LIBC_AMDGPU_ARCH
              NAMES amdgpu-arch

diff  --git a/libc/src/__support/common.h b/libc/src/__support/common.h
index 25bc77978cba2..2aaa8f804c9a9 100644
--- a/libc/src/__support/common.h
+++ b/libc/src/__support/common.h
@@ -29,17 +29,15 @@
 #define LIBC_INLINE inline
 #endif
 
-// We use OpenMP to declare these functions on the device.
-#define STR(X) #X
-#define LLVM_LIBC_DECLARE_DEVICE(name)                                         \
-  _Pragma(STR(omp declare target to(name) device_type(nohost)))
+#if defined(__AMDGPU__) || defined(__NVPTX__)
+#define PACKAGE_FOR_GPU
+#endif
 
-// GPU targets do not support aliasing and must be declared on the device.
-#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(_OPENMP)
+// GPU targets do not support aliasing.
+#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && defined(PACKAGE_FOR_GPU)
 #define LLVM_LIBC_FUNCTION(type, name, arglist)                                \
   LLVM_LIBC_FUNCTION_ATTR decltype(__llvm_libc::name)                          \
       __##name##_impl__ __asm__(#name);                                        \
-  LLVM_LIBC_DECLARE_DEVICE(__##name##_impl__)                                  \
   type __##name##_impl__ arglist
 // MacOS needs to be excluded because it does not support aliasing.
 #elif defined(LLVM_LIBC_PUBLIC_PACKAGING) && (!defined(__APPLE__))