[libc-commits] [clang] [libc] [llvm] [libc][GPU] Use CMAKE_CROSSCOMPILING_EMULATOR instead of custom GPU loader (PR #189417)

Mon Mar 30 08:53:42 PDT 2026

https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/189417

## Summary

- Pass `-DCMAKE_CROSSCOMPILING_EMULATOR=.../llvm-gpu-loader` from the runtimes build for GPU targets (`amdgcn`, `nvptx64`) instead of relying on a custom `libc.utils.gpu.loader` target and `find_program()` within the libc sub-build.
- Remove the `LIBC_GPU_LOADER_EXECUTABLE` cache variable and the `libc.utils.gpu.loader` custom target from `prepare_libc_gpu_build.cmake`.
- Remove GPU-loader-specific conditional logic from the integration and hermetic test commands in `LLVMLibCTestRules.cmake`, relying on the already-present `${CMAKE_CROSSCOMPILING_EMULATOR}` expansion.
- Update lit configuration to use `CMAKE_CROSSCOMPILING_EMULATOR` instead of `LIBC_GPU_LOADER_EXECUTABLE`.

This works because libc uses `add_custom_target`/`add_custom_command` for test execution (not `add_test()`), so `CMAKE_CROSSCOMPILING_EMULATOR` is expanded as a plain CMake variable regardless of whether `CMAKE_CROSSCOMPILING` is true.

## Test plan

- [ ] Verify GPU libc tests still run correctly on an AMDGPU target
- [ ] Verify GPU libc tests still run correctly on an NVPTX target
- [ ] Verify non-GPU libc builds are unaffected (emulator variable is empty)
- [ ] Verify `RUNTIMES_<triple>_CMAKE_CROSSCOMPILING_EMULATOR` can override the loader path


Made with [Cursor](https://cursor.com)

>From 597fe6a875d69af493393396cb2224ac1faf34e5 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 30 Mar 2026 09:03:14 -0500
Subject: [PATCH 1/2] [Clang] Fix constant bit widths in gpuintrin.h

Summary:
The `ull` suffix can mean 128 bits on some architectures. Replace this
with the `stdint.h` constructor to be certain.
---
 clang/lib/Headers/gpuintrin.h              | 25 +++++++++++-----------
 clang/lib/Headers/nvptxintrin.h            |  2 +-
 clang/test/Headers/Inputs/include/stdint.h |  3 +++
 clang/test/Headers/gpuintrin_lang.c        |  1 +
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ef1446a3ac77b..12176847776be 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -147,11 +147,10 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
 // Copies the value from the first active thread to the rest.
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
-  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
-  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
-         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
-          0xFFFFFFFFull);
+  uint32_t __hi = (uint32_t)(__x >> 32);
+  uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+  return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32) |
+         ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 0xFFFFFFFF);
 }
 
 // Gets the first floating point value from the active lanes.
@@ -174,11 +173,10 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
 _DEFAULT_FN_ATTRS static __inline__ uint64_t
 __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
                       uint32_t __width) {
-  uint32_t __hi = (uint32_t)(__x >> 32ull);
+  uint32_t __hi = (uint32_t)(__x >> 32);
   uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
   uint32_t __mask = (uint32_t)__lane_mask;
-  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
-          << 32ull) |
+  return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) << 32) |
          ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
 }
 
@@ -211,7 +209,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
   _DEFAULT_FN_ATTRS static __inline__ __type                                   \
   __gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,              \
                                             __type __x) {                      \
-    uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id());               \
+    uint64_t __above = __lane_mask & -(UINT64_C(2) << __gpu_lane_id());        \
     for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
       uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id();    \
       __type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x,  \
@@ -226,7 +224,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
   _DEFAULT_FN_ATTRS static __inline__ __type                                   \
   __gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask,              \
                                             __type __x) {                      \
-    uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1);          \
+    uint64_t __below = __lane_mask & ((UINT64_C(1) << __gpu_lane_id()) - 1);   \
     for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
       uint32_t __src =                                                         \
           __below ? (63 - __builtin_clzg(__below)) : __gpu_lane_id();          \
@@ -234,7 +232,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                                                      __gpu_num_lanes());       \
       __x = __op(__x, __below ? __result : (__type)__identity);                \
       for (uint32_t __i = 0; __i < __step; ++__i)                              \
-        __below ^= (1ull << (63 - __builtin_clzg(__below, 0))) & __below;      \
+        __below ^=                                                             \
+            (UINT64_C(1) << (63 - __builtin_clzg(__below, 0))) & __below;      \
     }                                                                          \
     return __x;                                                                \
   }                                                                            \
@@ -338,7 +337,7 @@ __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
   uint32_t __first = __gpu_shuffle_idx_u32(
       __lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  return __ballot == __lane_mask ? __lane_mask : 0ull;
+  return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
 }
 
 // Returns the current lane mask if every lane contains __x.
@@ -347,7 +346,7 @@ __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
   uint64_t __first = __gpu_shuffle_idx_u64(
       __lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
   uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
-  return __ballot == __lane_mask ? __lane_mask : 0ull;
+  return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
 }
 
 _Pragma("omp end declare variant");
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index b2e538580ba10..57a6a2cd08633 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -137,7 +137,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                       uint32_t __width) {
   // Mask out inactive lanes to match AMDGPU behavior.
   uint32_t __mask = (uint32_t)__lane_mask;
-  bool __bitmask = (1ull << __idx) & __lane_mask;
+  bool __bitmask = (UINT64_C(1) << __idx) & __lane_mask;
   return -__bitmask &
          __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
                                   ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
diff --git a/clang/test/Headers/Inputs/include/stdint.h b/clang/test/Headers/Inputs/include/stdint.h
index c4836441096b2..a44ebff2d69fe 100644
--- a/clang/test/Headers/Inputs/include/stdint.h
+++ b/clang/test/Headers/Inputs/include/stdint.h
@@ -39,4 +39,7 @@ typedef unsigned __INTPTR_TYPE__ uintptr_t;
 #define UINT32_MAX  __UINT32_C(4294967295)
 #define UINT64_MAX  __UINT64_C(18446744073709551615)
 
+#define UINT32_C __UINT32_C
+#define UINT64_C __UINT64_C
+
 #endif /* STDINT_H */
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 433d24b18d654..80e73b8c5647e 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -40,6 +40,7 @@ __device__ int foo() { return __gpu_thread_id_x(); }
 #elif defined(SYCL)
 extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
 #else
+//
 // CUDA-LABEL: define dso_local i32 @foo(
 // CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
 // CUDA-NEXT:  [[ENTRY:.*:]]

>From 3ef29025240c3f75954d57af41daedf4afc97168 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 30 Mar 2026 10:52:51 -0500
Subject: [PATCH 2/2] [libc][GPU] Use CMAKE_CROSSCOMPILING_EMULATOR instead of
 custom GPU loader

Replace the custom `libc.utils.gpu.loader` target and
`LIBC_GPU_LOADER_EXECUTABLE` cache variable with the standard CMake
`CMAKE_CROSSCOMPILING_EMULATOR` mechanism.

For GPU runtime targets (amdgcn, nvptx64), the runtimes build now
automatically passes `-DCMAKE_CROSSCOMPILING_EMULATOR=.../llvm-gpu-loader`
to the sub-build. The libc test rules already expanded
`${CMAKE_CROSSCOMPILING_EMULATOR}` in their command lines, so the GPU
loader is picked up without any GPU-specific conditional logic.

This works because libc uses `add_custom_target`/`add_custom_command`
for test execution rather than `add_test()`, so the variable is a plain
string expansion that does not depend on `CMAKE_CROSSCOMPILING` being
true.

Made-with: Cursor
---
 libc/cmake/modules/LLVMLibCTestRules.cmake     | 15 +--------------
 .../cmake/modules/prepare_libc_gpu_build.cmake | 18 ------------------
 libc/docs/gpu/building.rst                     |  7 ++++---
 libc/docs/gpu/testing.rst                      |  2 +-
 libc/test/CMakeLists.txt                       |  9 ++-------
 libc/test/lit.site.cfg.py.in                   |  8 ++++----
 llvm/runtimes/CMakeLists.txt                   | 10 +++++++++-
 7 files changed, 21 insertions(+), 48 deletions(-)

diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index e25f739408b99..6d381194d5ed9 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -625,12 +625,6 @@ function(add_integration_test test_name)
                    libc.test.IntegrationTest.test
                    ${INTEGRATION_TEST_DEPENDS})
 
-  # Tests on the GPU require an external loader utility to launch the kernel.
-  if(TARGET libc.utils.gpu.loader)
-    add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
-    get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
-  endif()
-
   # We have to use a separate var to store the command as a list because
   # the COMMAND option of `add_custom_target` cannot handle empty vars in the
   # command. For example, if INTEGRATION_TEST_ENV is empty, the actual
@@ -640,7 +634,6 @@ function(add_integration_test test_name)
   set(test_cmd
       ${INTEGRATION_TEST_ENV}
       $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:LIBOMPTARGET_STACK_SIZE=3072>
-      $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
       ${CMAKE_CROSSCOMPILING_EMULATOR}
       ${INTEGRATION_TEST_LOADER_ARGS}
       $<TARGET_FILE:${fq_build_target_name}> ${INTEGRATION_TEST_ARGS})
@@ -878,12 +871,6 @@ function(add_libc_hermetic test_name)
     )
   endif()
 
-  # Tests on the GPU require an external loader utility to launch the kernel.
-  if(TARGET libc.utils.gpu.loader)
-    add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
-    get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
-  endif()
-
   if(NOT HERMETIC_TEST_NO_RUN_POSTBUILD)
     if (LIBC_TEST_CMD)
       # In the form of "<command> binary=@BINARY@", e.g. "qemu-system-arm -loader$<COMMA>file=@BINARY@"
@@ -892,7 +879,7 @@ function(add_libc_hermetic test_name)
     else()
       set(test_cmd ${HERMETIC_TEST_ENV}
         $<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:LIBOMPTARGET_STACK_SIZE=3072>
-        $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
+        ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
         $<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
     endif()
 
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index fcf0e38db81af..c87a1df926c85 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -66,24 +66,6 @@ else()
 endif()
 set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
 
-# Identify the GPU loader utility used to run tests.
-set(LIBC_GPU_LOADER_EXECUTABLE "" CACHE STRING "Executable for the GPU loader.")
-if(LIBC_GPU_LOADER_EXECUTABLE)
-  set(gpu_loader_executable ${LIBC_GPU_LOADER_EXECUTABLE})
-else()
-  find_program(gpu_loader_executable
-               NAMES llvm-gpu-loader NO_DEFAULT_PATH
-               PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path})
-endif()
-if(NOT TARGET libc.utils.gpu.loader AND gpu_loader_executable)
-  add_custom_target(libc.utils.gpu.loader)
-  set_target_properties(
-    libc.utils.gpu.loader
-    PROPERTIES
-      EXECUTABLE "${gpu_loader_executable}"
-  )
-endif()
-
 # The AMDGPU environment uses different code objects to encode the ABI for
 # kernel calls and intrinsic functions. We want to expose this to conform to
 # whatever the test suite was built to handle.
diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst
index d80ad0c0ff540..b8f5e79a430e3 100644
--- a/libc/docs/gpu/building.rst
+++ b/libc/docs/gpu/building.rst
@@ -195,6 +195,7 @@ standard runtime build.
   commonly run out of resources if this is not constrained so it is recommended
   to keep it low. The default value is a single thread.
 
-**LIBC_GPU_LOADER_EXECUTABLE**:STRING
-  Overrides the default loader used for running GPU tests. If this is not
-  provided the standard one will be built.
+**CMAKE_CROSSCOMPILING_EMULATOR**:STRING
+  Overrides the default loader used for running GPU tests. This is set
+  automatically to ``llvm-gpu-loader`` for GPU runtime targets when building
+  via the runtimes build.
diff --git a/libc/docs/gpu/testing.rst b/libc/docs/gpu/testing.rst
index 4115e68e5225a..c34467346b798 100644
--- a/libc/docs/gpu/testing.rst
+++ b/libc/docs/gpu/testing.rst
@@ -94,7 +94,7 @@ Running tests
 
 Tests will only be built and run if a GPU target architecture is set and the
 corresponding loader utility was built. These can be overridden with the
-``LIBC_GPU_TEST_ARCHITECTURE`` and ``LIBC_GPU_LOADER_EXECUTABLE`` :ref:`CMake
+``LIBC_GPU_TEST_ARCHITECTURE`` and ``CMAKE_CROSSCOMPILING_EMULATOR`` :ref:`CMake
 options <gpu_cmake_options>`. Once built, they can be run like any other tests.
 The CMake target depends on how the library was built.
 
diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt
index 60478739aced4..374756e77260e 100644
--- a/libc/test/CMakeLists.txt
+++ b/libc/test/CMakeLists.txt
@@ -16,11 +16,6 @@ add_custom_target(libc-hermetic-tests-build)
 add_custom_target(libc-integration-tests-build)
 add_custom_target(libc_include_tests-build)
 
-# Resolve the GPU loader executable path for the lit site config.
-if(TARGET libc.utils.gpu.loader)
-  get_target_property(LIBC_GPU_LOADER_EXECUTABLE libc.utils.gpu.loader "EXECUTABLE")
-endif()
-
 # Configure the site config file for lit
 configure_lit_site_cfg(
   ${LIBC_SOURCE_DIR}/test/lit.site.cfg.py.in
@@ -34,7 +29,7 @@ configure_lit_site_cfg(
   "LLVM_LIBS_DIR"
   "LIBC_SOURCE_DIR"
   "LIBC_BUILD_DIR"
-  "LIBC_GPU_LOADER_EXECUTABLE"
+  "CMAKE_CROSSCOMPILING_EMULATOR"
 )
 
 add_lit_testsuite(check-libc-lit
@@ -46,7 +41,7 @@ add_lit_testsuite(check-libc-lit
 add_subdirectory(UnitTest)
 
 if(LIBC_TARGET_OS_IS_GPU)
-  if(NOT TARGET libc.utils.gpu.loader)
+  if(NOT CMAKE_CROSSCOMPILING_EMULATOR)
     message(WARNING "Cannot build libc GPU tests, missing loader.")
     return()
   elseif(LIBC_GPU_TESTS_DISABLED)
diff --git a/libc/test/lit.site.cfg.py.in b/libc/test/lit.site.cfg.py.in
index 7773bdfdf0e9c..7727f16956d73 100644
--- a/libc/test/lit.site.cfg.py.in
+++ b/libc/test/lit.site.cfg.py.in
@@ -8,11 +8,11 @@ config.llvm_tools_dir = lit_config.substitute(path(r"@LLVM_TOOLS_DIR@"))
 config.libc_src_root = path(r"@LIBC_SOURCE_DIR@")
 config.libc_obj_root = path(r"@LIBC_BUILD_DIR@")
 config.libc_test_cmd = "@LIBC_TEST_CMD@"
-config.libc_gpu_loader = path(r"@LIBC_GPU_LOADER_EXECUTABLE@")
+config.libc_crosscompiling_emulator = path(r"@CMAKE_CROSSCOMPILING_EMULATOR@")
 
-# If running GPU tests and no explicit test command is set, use the GPU loader.
-if not config.libc_test_cmd and config.libc_gpu_loader:
-    config.libc_test_cmd = config.libc_gpu_loader + " @BINARY@"
+# If no explicit test command is set, use the cross-compiling emulator.
+if not config.libc_test_cmd and config.libc_crosscompiling_emulator:
+    config.libc_test_cmd = config.libc_crosscompiling_emulator + " @BINARY@"
 
 # Add libc's utils directory to the path so we can import the test format.
 site.addsitedir(os.path.join(config.libc_src_root, "utils"))
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index f22b551b89aca..a6e209a7ab204 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -674,9 +674,17 @@ if(build_runtimes)
 
       check_apple_target(${name} runtime)
 
+      set(per_target_cmake_args)
+      if(LLVM_LIBC_GPU_BUILD AND TARGET llvm-gpu-loader
+         AND "${name}" MATCHES "^(amdgcn|nvptx64)")
+        list(APPEND per_target_cmake_args
+          "-DCMAKE_CROSSCOMPILING_EMULATOR=${LLVM_TOOLS_BINARY_DIR}/llvm-gpu-loader${CMAKE_EXECUTABLE_SUFFIX}")
+      endif()
+
       runtime_register_target(${name}
         DEPENDS ${builtins_dep_name} ${extra_deps}
-        CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name} ${extra_cmake_args}
+        CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name}
+                   ${extra_cmake_args} ${per_target_cmake_args}
         EXTRA_ARGS TARGET_TRIPLE ${name} ${extra_args})
     endforeach()