[libc] [llvm] [openmp] [libc] Rework the GPU build to only target a single architecture (PR #81644)

Tue Feb 13 10:45:54 PST 2024

https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/81644

>From 9600b8b9e25a53e30e6b0d01fb64aa34ef04861e Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 13 Feb 2024 12:24:31 -0600
Subject: [PATCH] [libc] Rework the GPU build to only target a single
 architecture

Summary:
GPUs generally have issues related to compatibility between generations.
The GPU libc is provided as an LLVM-IR file, so that gives us some
control over the compatibility. Previous changes have removed all the
target-divergent things, so we can now replace this with a singular
target build.

This gets rid of the `LIBC_GPU_ARCHITECTURES` variable and instead just
builds for the NVPTX target and AMDGPU target all the time. This will be
massively reworked in the future, but for now we want to test it using
this build configuration.
---
 libc/CMakeLists.txt                           |  4 +-
 .../cmake/modules/LLVMLibCArchitectures.cmake |  2 +-
 libc/cmake/modules/LLVMLibCObjectRules.cmake  | 89 ++++++-------------
 libc/cmake/modules/LLVMLibCTestRules.cmake    |  8 +-
 .../modules/prepare_libc_gpu_build.cmake      | 28 ++----
 libc/docs/gpu/using.rst                       |  7 +-
 llvm/runtimes/CMakeLists.txt                  |  6 +-
 openmp/libomptarget/CMakeLists.txt            |  3 +-
 8 files changed, 46 insertions(+), 101 deletions(-)

diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 3d775736616745..88f586344fed11 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -43,7 +43,7 @@ set(LIBC_NAMESPACE "__llvm_libc_${LLVM_VERSION_MAJOR}_${LLVM_VERSION_MINOR}_${LL
   CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'."
 )
 
-if(LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+if(LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD)
   if(NOT LIBC_HDRGEN_EXE)
     # We need to set up hdrgen first since other targets depend on it.
     add_subdirectory(utils/LibcTableGenUtil)
@@ -65,7 +65,7 @@ if(("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND NOT LLVM_RUNTIMES_BUILD) OR
   # to build libc-hdrgen and return.
 
   # Always make the RPC server availible to other projects for GPU mode.
-  if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+  if(LIBC_GPU_BUILD)
     add_subdirectory(utils/gpu/server)
   endif()
   return()
diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake
index 623ed774be7270..79020ee3a65bc6 100644
--- a/libc/cmake/modules/LLVMLibCArchitectures.cmake
+++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake
@@ -6,7 +6,7 @@
 # platform.
 # ------------------------------------------------------------------------------
 
-if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+if(LIBC_GPU_BUILD)
   # We set the generic target and OS to "gpu" here. More specific defintions
   # for the exact target GPU are set up in prepare_libc_gpu_build.cmake.
   set(LIBC_TARGET_OS "gpu")
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index ef1f24863f61ab..9d524d55033a61 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -100,44 +100,11 @@ endfunction()
 # for the features we wish to use on that target. The minimum PTX features used
 # here roughly corresponds to the CUDA 9.0 release.
 # Adjust as needed for desired PTX features.
-function(get_nvptx_compile_options output_var gpu_arch)
+function(get_nvptx_compile_options output_var)
   set(nvptx_options "")
-  list(APPEND nvptx_options "-march=${gpu_arch}")
   list(APPEND nvptx_options "-Wno-unknown-cuda-version")
   list(APPEND nvptx_options "SHELL:-mllvm -nvptx-emit-init-fini-kernel=false")
-  if(${gpu_arch} STREQUAL "sm_35")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_37")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_50")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_52")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_53")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_60")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_61")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_62")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_70")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_72")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_75")
-    list(APPEND nvptx_options "--cuda-feature=+ptx63")
-  elseif(${gpu_arch} STREQUAL "sm_80")
-    list(APPEND nvptx_options "--cuda-feature=+ptx72")
-  elseif(${gpu_arch} STREQUAL "sm_86")
-    list(APPEND nvptx_options "--cuda-feature=+ptx72")
-  elseif(${gpu_arch} STREQUAL "sm_89")
-    list(APPEND nvptx_options "--cuda-feature=+ptx72")
-  elseif(${gpu_arch} STREQUAL "sm_90")
-    list(APPEND nvptx_options "--cuda-feature=+ptx72")
-  else()
-    message(FATAL_ERROR "Unknown Nvidia GPU architecture '${gpu_arch}'")
-  endif()
+  list(APPEND nvptx_options "--cuda-feature=+ptx63")
 
   if(LIBC_CUDA_ROOT)
     list(APPEND nvptx_options "--cuda-path=${LIBC_CUDA_ROOT}")
@@ -147,16 +114,16 @@ endfunction()
 
 # Build the object target for a single GPU arch.
 # Usage:
-#     _build_gpu_object_for_single_arch(
+#     _build_gpu_object_for_single_target(
 #       <target_name>
-#       <gpu_arch>
+#       <gpu_target>
 #       SRCS <list of .cpp files>
 #       HDRS <list of .h files>
 #       DEPENDS <list of dependencies>
 #       COMPILE_OPTIONS <optional list of special compile options for this target>
 #       FLAGS <optional list of flags>
 #     )
-function(_build_gpu_object_for_single_arch fq_target_name gpu_arch)
+function(_build_gpu_object_for_single_target fq_target_name gpu_target)
   cmake_parse_arguments(
     "ADD_GPU_OBJ"
     "" # No optional arguments
@@ -170,20 +137,16 @@ function(_build_gpu_object_for_single_arch fq_target_name gpu_arch)
   endif()
 
   set(compile_options ${ADD_GPU_OBJ_COMPILE_OPTIONS})
-  # Derive the triple from the specified architecture.
-  if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
-    set(gpu_target_triple ${AMDGPU_TARGET_TRIPLE})
-    list(APPEND compile_options "-mcpu=${gpu_arch}")
+  if("${gpu_target}" STREQUAL ${AMDGPU_TARGET_TRIPLE})
     list(APPEND compile_options "SHELL:-Xclang -mcode-object-version=none")
     list(APPEND compile_options "-emit-llvm")
-  elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
-    set(gpu_target_triple ${NVPTX_TARGET_TRIPLE})
-    get_nvptx_compile_options(nvptx_options ${gpu_arch})
+  elseif("${gpu_target}" STREQUAL ${NVPTX_TARGET_TRIPLE})
+    get_nvptx_compile_options(nvptx_options)
     list(APPEND compile_options "${nvptx_options}")
   else()
-    message(FATAL_ERROR "Unknown GPU architecture '${gpu_arch}'")
+    message(FATAL_ERROR "Unknown GPU architecture '${gpu_target}'")
   endif()
-  list(APPEND compile_options "--target=${gpu_target_triple}")
+  list(APPEND compile_options "--target=${gpu_target}")
 
   # Build the library for this target architecture. We always emit LLVM-IR for
   # packaged GPU binaries.
@@ -202,7 +165,7 @@ function(_build_gpu_object_for_single_arch fq_target_name gpu_arch)
     add_dependencies(${fq_target_name} ${ADD_GPU_OBJ_DEPENDS})
     set_target_properties(${fq_target_name} PROPERTIES DEPS "${ADD_GPU_OBJ_DEPENDS}")
   endif()
-endfunction(_build_gpu_object_for_single_arch)
+endfunction(_build_gpu_object_for_single_target)
 
 # Build the object target for the GPU.
 # This compiles the target for all supported architectures and embeds it into
@@ -232,13 +195,13 @@ function(_build_gpu_object_bundle fq_target_name)
   foreach(add_gpu_obj_src ${ADD_GPU_OBJ_SRCS})
     # The packaged version will be built for every target GPU architecture. We do
     # this so we can support multiple accelerators on the same machine.
-    foreach(gpu_arch ${LIBC_GPU_ARCHITECTURES})
+    foreach(gpu_target ${NVPTX_TARGET_TRIPLE} ${AMDGPU_TARGET_TRIPLE})
       get_filename_component(src_name ${add_gpu_obj_src} NAME)
-      set(gpu_target_name ${fq_target_name}.${src_name}.${gpu_arch})
+      set(gpu_target_name ${fq_target_name}.${src_name}.${gpu_target})
 
-      _build_gpu_object_for_single_arch(
+      _build_gpu_object_for_single_target(
         ${gpu_target_name}
-        ${gpu_arch}
+        ${gpu_target}
         CXX_STANDARD ${ADD_GPU_OBJ_CXX_STANDARD}
         HDRS ${ADD_GPU_OBJ_HDRS}
         SRCS ${add_gpu_obj_src}
@@ -249,15 +212,15 @@ function(_build_gpu_object_bundle fq_target_name)
       )
       # Append this target to a list of images to package into a single binary.
       set(input_file $<TARGET_OBJECTS:${gpu_target_name}>)
-      if("${gpu_arch}" IN_LIST all_nvptx_architectures)
+      if("${gpu_target}" STREQUAL "${NVPTX_TARGET_TRIPLE}")
         get_nvptx_compile_options(nvptx_options ${gpu_arch})
         string(REGEX MATCH "\\+ptx[0-9]+" nvptx_ptx_feature ${nvptx_options})
         list(APPEND packager_images
-             --image=file=${input_file},arch=${gpu_arch},triple=${NVPTX_TARGET_TRIPLE},feature=${nvptx_ptx_feature})
+             --image=file=${input_file},arch=generic,triple=${NVPTX_TARGET_TRIPLE},feature=${nvptx_ptx_feature})
       else()
         list(APPEND packager_images
-             --image=file=${input_file},arch=${gpu_arch},triple=${AMDGPU_TARGET_TRIPLE})
-       endif()
+             --image=file=${input_file},arch=generic,triple=${AMDGPU_TARGET_TRIPLE})
+      endif()
       list(APPEND gpu_target_objects ${input_file})
     endforeach()
 
@@ -386,13 +349,14 @@ function(create_object_library fq_target_name)
     endif()
     # When the target for GPU is not bundled, internal_target_name is the same
     # as fq_targetname
-    _build_gpu_object_for_single_arch(
+    _build_gpu_object_for_single_target(
       ${internal_target_name}
-      ${LIBC_GPU_TARGET_ARCHITECTURE}
+      ${LIBC_GPU_TARGET_TRIPLE}
       SRCS ${ADD_OBJECT_SRCS}
       HDRS ${ADD_OBJECT_HDRS}
       CXX_STANDARD ${ADD_OBJECT_CXX_STANDARD}
-      COMPILE_OPTIONS ${compile_options} ${public_packaging_for_internal}
+      COMPILE_OPTIONS ${compile_options} -march=${LIBC_GPU_TARGET_ARCHITECTURE}
+                      ${public_packaging_for_internal}
       DEPENDS ${fq_deps_list}
     )
   else()
@@ -598,12 +562,13 @@ function(create_entrypoint_object fq_target_name)
       DEPENDS ${full_deps_list}
       FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
     )
-    _build_gpu_object_for_single_arch(
+    _build_gpu_object_for_single_target(
       ${internal_target_name}
-      ${LIBC_GPU_TARGET_ARCHITECTURE}
+      ${LIBC_GPU_TARGET_TRIPLE}
       SRCS ${ADD_ENTRYPOINT_OBJ_SRCS}
       HDRS ${ADD_ENTRYPOINT_OBJ_HDRS}
-      COMPILE_OPTIONS ${common_compile_options}
+      COMPILE_OPTIONS -march=${LIBC_GPU_TARGET_ARCHITECTURE}
+                      ${common_compile_options}
       CXX_STANDARD ${ADD_ENTRYPOINT_OBJ_CXX_STANDARD}
       DEPENDS ${full_deps_list}
       FLAGS "${ADD_ENTRYPOINT_OBJ_FLAGS}"
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 5b96c5e9f8c801..e30cc512a98852 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -477,8 +477,9 @@ function(add_integration_test test_name)
                            -flto --target=${LIBC_GPU_TARGET_TRIPLE}
                            -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION})
   elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
-    get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
+    get_nvptx_compile_options(nvptx_options)
     target_compile_options(${fq_build_target_name} PRIVATE
+                           -march=${LIBC_GPU_TARGET_ARCHITECTURE}
                            -nogpulib ${nvptx_options} -fno-use-cxa-atexit
                            --target=${LIBC_GPU_TARGET_TRIPLE})
   endif()
@@ -539,9 +540,10 @@ if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
        --target=${LIBC_GPU_TARGET_TRIPLE}
        -mcode-object-version=${LIBC_GPU_CODE_OBJECT_VERSION})
 elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
-  get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
+  get_nvptx_compile_options(nvptx_options)
   list(APPEND LIBC_HERMETIC_TEST_COMPILE_OPTIONS
-       -nogpulib ${nvptx_options} -fno-use-cxa-atexit --target=${LIBC_GPU_TARGET_TRIPLE})
+       -nogpulib ${nvptx_options} -fno-use-cxa-atexit 
+       -march=${LIBC_GPU_TARGET_ARCHITECTURE} --target=${LIBC_GPU_TARGET_TRIPLE})
 endif()
 
 # Rule to add a hermetic test. A hermetic test is one whose executable is fully
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index 2086175bae6c72..4fcfce7036719f 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -13,8 +13,6 @@ set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
                             "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90")
 set(all_gpu_architectures
     "${all_amdgpu_architectures};${all_nvptx_architectures}")
-set(LIBC_GPU_ARCHITECTURES "all" CACHE STRING
-    "List of GPU architectures to build the libc for.")
 set(AMDGPU_TARGET_TRIPLE "amdgcn-amd-amdhsa")
 set(NVPTX_TARGET_TRIPLE "nvptx64-nvidia-cuda")
 
@@ -54,17 +52,6 @@ foreach(arch_tool ${LIBC_NVPTX_ARCH} ${LIBC_AMDGPU_ARCH})
 endforeach()
 list(REMOVE_DUPLICATES detected_gpu_architectures)
 
-if(LIBC_GPU_ARCHITECTURES STREQUAL "all")
-  set(LIBC_GPU_ARCHITECTURES ${all_gpu_architectures})
-elseif(LIBC_GPU_ARCHITECTURES STREQUAL "native")
-  if(NOT detected_gpu_architectures)
-    message(FATAL_ERROR "No GPUs found on the system when using 'native'")
-  endif()
-  set(LIBC_GPU_ARCHITECTURES ${detected_gpu_architectures})
-endif()
-message(STATUS "Building libc for the following GPU architecture(s): "
-               "${LIBC_GPU_ARCHITECTURES}")
-
 # Identify the program used to package multiple images into a single binary.
 find_program(LIBC_CLANG_OFFLOAD_PACKAGER
              NAMES clang-offload-packager NO_DEFAULT_PATH
@@ -98,16 +85,11 @@ elseif(detected_gpu_architectures)
   message(STATUS "Using GPU architecture detected on the system for testing: "
                  "'${gpu_test_architecture}'")
 else()
-  list(LENGTH LIBC_GPU_ARCHITECTURES n_gpu_archs)
-  if (${n_gpu_archs} EQUAL 1)
-    set(gpu_test_architecture ${LIBC_GPU_ARCHITECTURES})
-    message(STATUS "Using user-specified GPU architecture for testing: "
-                  "'${gpu_test_architecture}'")
-  else()
-    message(STATUS "No GPU architecture set for testing. GPU tests will not be "
-                  "availibe. Set 'LIBC_GPU_TEST_ARCHITECTURE' to override.")
-    return()
-  endif()
+  # FIXME: This logic is broken, just default to some value for now so it
+  #        builds correctly. This will be reworked in the future.
+  list(GET all_gpu_architectures 0 gpu_test_architecture)
+  message(STATUS "No GPU architecture set for testing. GPU tests will not be "
+          "availibe. Set 'LIBC_GPU_TEST_ARCHITECTURE' to override.")
 endif()
 
 if("${gpu_test_architecture}" IN_LIST all_amdgpu_architectures)
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
index 71f5e7ba203930..f4760f97afb967 100644
--- a/libc/docs/gpu/using.rst
+++ b/libc/docs/gpu/using.rst
@@ -16,10 +16,8 @@ LLVM's libc GPU support *must* be built with an up-to-date ``clang`` compiler
 due to heavy reliance on ``clang``'s GPU support. This can be done automatically
 using the ``LLVM_ENABLE_RUNTIMES=libc`` option. To enable libc for the GPU,
 enable the ``LIBC_GPU_BUILD`` option. By default, ``libcgpu.a`` will be built
-using every supported GPU architecture. To restrict the number of architectures
-build, either set ``LIBC_GPU_ARCHITECTURES`` to the list of desired
-architectures manually or use ``native`` to detect the GPUs on your system. A
-typical ``cmake`` configuration will look like this:
+targeting the NVPTX and AMDGPU implementations. A typical ``cmake``
+configuration will look like this:
 
 .. code-block:: sh
 
@@ -31,7 +29,6 @@ typical ``cmake`` configuration will look like this:
      -DLLVM_ENABLE_RUNTIMES="libc;openmp"                  \
      -DCMAKE_BUILD_TYPE=<Debug|Release>   \ # Select build type
      -DLIBC_GPU_BUILD=ON                  \ # Build in GPU mode
-     -DLIBC_GPU_ARCHITECTURES=all         \ # Build all supported architectures
      -DCMAKE_INSTALL_PREFIX=<PATH>        \ # Where 'libcgpu.a' will live
   $> ninja install
 
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 8c48d85a4346f4..188755b466b3f6 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -199,7 +199,7 @@ foreach(entry ${runtimes})
     list(APPEND prefixes "LLVM_LIBC")
     list(APPEND prefixes "LIBC_")
     # The `libc` project may require '-DCUDAToolkit_ROOT' in GPU mode.
-    if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+    if(LIBC_GPU_BUILD)
       list(APPEND prefixes "CUDA")
     endif()
   endif()
@@ -424,7 +424,7 @@ if(runtimes)
     endforeach()
   endif()
   if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND
-      (LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES))
+      (LLVM_LIBC_FULL_BUILD OR LIBC_GPU_BUILD))
     if(LIBC_HDRGEN_EXE)
       set(hdrgen_exe ${LIBC_HDRGEN_EXE})
     else()
@@ -441,7 +441,7 @@ if(runtimes)
     set(libc_cmake_args "-DLIBC_HDRGEN_EXE=${hdrgen_exe}"
                         "-DLLVM_LIBC_FULL_BUILD=ON")
     list(APPEND extra_deps ${hdrgen_deps})
-    if(LIBC_GPU_BUILD OR LIBC_GPU_ARCHITECTURES)
+    if(LIBC_GPU_BUILD)
       foreach(dep clang-offload-packager nvptx-arch amdgpu-arch)
         if(TARGET ${dep})
           list(APPEND extra_deps ${dep})
diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt
index 17e61d0bc47dce..8e51e189ab9531 100644
--- a/openmp/libomptarget/CMakeLists.txt
+++ b/openmp/libomptarget/CMakeLists.txt
@@ -121,8 +121,7 @@ pythonize_bool(LIBOMPTARGET_OMPT_SUPPORT)
 
 # Check if this build supports the GPU libc.
 set(LIBC_GPU_SUPPORT FALSE)
-if("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND (LIBC_GPU_BUILD OR
-                                            LIBC_GPU_ARCHITECTURES))
+if("libc" IN_LIST LLVM_ENABLE_RUNTIMES AND LIBC_GPU_BUILD)
   set(LIBC_GPU_SUPPORT TRUE)
 endif()