[libc-commits] [clang] [libc] [llvm] [libc][GPU] Use CMAKE_CROSSCOMPILING_EMULATOR instead of custom GPU loader (PR #189417)
via libc-commits
libc-commits at lists.llvm.org
Mon Mar 30 08:54:31 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libc
@llvm/pr-subscribers-backend-x86
Author: Joseph Huber (jhuber6)
<details>
<summary>Changes</summary>
## Summary
- Pass `-DCMAKE_CROSSCOMPILING_EMULATOR=.../llvm-gpu-loader` from the runtimes build for GPU targets (`amdgcn`, `nvptx64`) instead of relying on a custom `libc.utils.gpu.loader` target and `find_program()` within the libc sub-build.
- Remove the `LIBC_GPU_LOADER_EXECUTABLE` cache variable and the `libc.utils.gpu.loader` custom target from `prepare_libc_gpu_build.cmake`.
- Remove GPU-loader-specific conditional logic from the integration and hermetic test commands in `LLVMLibCTestRules.cmake`, relying on the already-present `${CMAKE_CROSSCOMPILING_EMULATOR}` expansion.
- Update lit configuration to use `CMAKE_CROSSCOMPILING_EMULATOR` instead of `LIBC_GPU_LOADER_EXECUTABLE`.
This works because libc uses `add_custom_target`/`add_custom_command` for test execution (not `add_test()`), so `CMAKE_CROSSCOMPILING_EMULATOR` is expanded as a plain CMake variable regardless of whether `CMAKE_CROSSCOMPILING` is true.
## Test plan
- [ ] Verify GPU libc tests still run correctly on an AMDGPU target
- [ ] Verify GPU libc tests still run correctly on an NVPTX target
- [ ] Verify non-GPU libc builds are unaffected (emulator variable is empty)
- [ ] Verify `RUNTIMES_<triple>_CMAKE_CROSSCOMPILING_EMULATOR` can override the loader path
Made with [Cursor](https://cursor.com)
---
Full diff: https://github.com/llvm/llvm-project/pull/189417.diff
11 Files Affected:
- (modified) clang/lib/Headers/gpuintrin.h (+12-13)
- (modified) clang/lib/Headers/nvptxintrin.h (+1-1)
- (modified) clang/test/Headers/Inputs/include/stdint.h (+3)
- (modified) clang/test/Headers/gpuintrin_lang.c (+1)
- (modified) libc/cmake/modules/LLVMLibCTestRules.cmake (+1-14)
- (modified) libc/cmake/modules/prepare_libc_gpu_build.cmake (-18)
- (modified) libc/docs/gpu/building.rst (+4-3)
- (modified) libc/docs/gpu/testing.rst (+1-1)
- (modified) libc/test/CMakeLists.txt (+2-7)
- (modified) libc/test/lit.site.cfg.py.in (+4-4)
- (modified) llvm/runtimes/CMakeLists.txt (+9-1)
``````````diff
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ef1446a3ac77b..12176847776be 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -147,11 +147,10 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
- uint32_t __hi = (uint32_t)(__x >> 32ull);
- uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
- return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
- ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) &
- 0xFFFFFFFFull);
+ uint32_t __hi = (uint32_t)(__x >> 32);
+ uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
+ return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32) |
+ ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 0xFFFFFFFF);
}
// Gets the first floating point value from the active lanes.
@@ -174,11 +173,10 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
uint32_t __width) {
- uint32_t __hi = (uint32_t)(__x >> 32ull);
+ uint32_t __hi = (uint32_t)(__x >> 32);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
uint32_t __mask = (uint32_t)__lane_mask;
- return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
- << 32ull) |
+ return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width) << 32) |
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
@@ -211,7 +209,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
_DEFAULT_FN_ATTRS static __inline__ __type \
__gpu_suffix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
__type __x) { \
- uint64_t __above = __lane_mask & -(2ull << __gpu_lane_id()); \
+ uint64_t __above = __lane_mask & -(UINT64_C(2) << __gpu_lane_id()); \
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
uint32_t __src = __above ? __builtin_ctzg(__above) : __gpu_lane_id(); \
__type __result = __gpu_shuffle_idx_##__suffix(__lane_mask, __src, __x, \
@@ -226,7 +224,7 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
_DEFAULT_FN_ATTRS static __inline__ __type \
__gpu_prefix_scan_##__prefix##_##__suffix(uint64_t __lane_mask, \
__type __x) { \
- uint64_t __below = __lane_mask & ((1ull << __gpu_lane_id()) - 1); \
+ uint64_t __below = __lane_mask & ((UINT64_C(1) << __gpu_lane_id()) - 1); \
for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
uint32_t __src = \
__below ? (63 - __builtin_clzg(__below)) : __gpu_lane_id(); \
@@ -234,7 +232,8 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__gpu_num_lanes()); \
__x = __op(__x, __below ? __result : (__type)__identity); \
for (uint32_t __i = 0; __i < __step; ++__i) \
- __below ^= (1ull << (63 - __builtin_clzg(__below, 0))) & __below; \
+ __below ^= \
+ (UINT64_C(1) << (63 - __builtin_clzg(__below, 0))) & __below; \
} \
return __x; \
} \
@@ -338,7 +337,7 @@ __gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
uint32_t __first = __gpu_shuffle_idx_u32(
__lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
- return __ballot == __lane_mask ? __lane_mask : 0ull;
+ return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
}
// Returns the current lane mask if every lane contains __x.
@@ -347,7 +346,7 @@ __gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
uint64_t __first = __gpu_shuffle_idx_u64(
__lane_mask, __builtin_ctzg(__lane_mask), __x, __gpu_num_lanes());
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
- return __ballot == __lane_mask ? __lane_mask : 0ull;
+ return __ballot == __lane_mask ? __lane_mask : UINT64_C(0);
}
_Pragma("omp end declare variant");
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index b2e538580ba10..57a6a2cd08633 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -137,7 +137,7 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
uint32_t __width) {
// Mask out inactive lanes to match AMDGPU behavior.
uint32_t __mask = (uint32_t)__lane_mask;
- bool __bitmask = (1ull << __idx) & __lane_mask;
+ bool __bitmask = (UINT64_C(1) << __idx) & __lane_mask;
return -__bitmask &
__nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
((__gpu_num_lanes() - __width) << 8u) | 0x1f);
diff --git a/clang/test/Headers/Inputs/include/stdint.h b/clang/test/Headers/Inputs/include/stdint.h
index c4836441096b2..a44ebff2d69fe 100644
--- a/clang/test/Headers/Inputs/include/stdint.h
+++ b/clang/test/Headers/Inputs/include/stdint.h
@@ -39,4 +39,7 @@ typedef unsigned __INTPTR_TYPE__ uintptr_t;
#define UINT32_MAX __UINT32_C(4294967295)
#define UINT64_MAX __UINT64_C(18446744073709551615)
+#define UINT32_C __UINT32_C
+#define UINT64_C __UINT64_C
+
#endif /* STDINT_H */
diff --git a/clang/test/Headers/gpuintrin_lang.c b/clang/test/Headers/gpuintrin_lang.c
index 433d24b18d654..80e73b8c5647e 100644
--- a/clang/test/Headers/gpuintrin_lang.c
+++ b/clang/test/Headers/gpuintrin_lang.c
@@ -40,6 +40,7 @@ __device__ int foo() { return __gpu_thread_id_x(); }
#elif defined(SYCL)
extern "C" [[clang::sycl_external]] int foo() { return __gpu_thread_id_x(); }
#else
+//
// CUDA-LABEL: define dso_local i32 @foo(
// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
// CUDA-NEXT: [[ENTRY:.*:]]
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index e25f739408b99..6d381194d5ed9 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -625,12 +625,6 @@ function(add_integration_test test_name)
libc.test.IntegrationTest.test
${INTEGRATION_TEST_DEPENDS})
- # Tests on the GPU require an external loader utility to launch the kernel.
- if(TARGET libc.utils.gpu.loader)
- add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
- get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
- endif()
-
# We have to use a separate var to store the command as a list because
# the COMMAND option of `add_custom_target` cannot handle empty vars in the
# command. For example, if INTEGRATION_TEST_ENV is empty, the actual
@@ -640,7 +634,6 @@ function(add_integration_test test_name)
set(test_cmd
${INTEGRATION_TEST_ENV}
$<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:LIBOMPTARGET_STACK_SIZE=3072>
- $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}>
${CMAKE_CROSSCOMPILING_EMULATOR}
${INTEGRATION_TEST_LOADER_ARGS}
$<TARGET_FILE:${fq_build_target_name}> ${INTEGRATION_TEST_ARGS})
@@ -878,12 +871,6 @@ function(add_libc_hermetic test_name)
)
endif()
- # Tests on the GPU require an external loader utility to launch the kernel.
- if(TARGET libc.utils.gpu.loader)
- add_dependencies(${fq_build_target_name} libc.utils.gpu.loader)
- get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE")
- endif()
-
if(NOT HERMETIC_TEST_NO_RUN_POSTBUILD)
if (LIBC_TEST_CMD)
# In the form of "<command> binary=@BINARY@", e.g. "qemu-system-arm -loader$<COMMA>file=@BINARY@"
@@ -892,7 +879,7 @@ function(add_libc_hermetic test_name)
else()
set(test_cmd ${HERMETIC_TEST_ENV}
$<$<BOOL:${LIBC_TARGET_ARCHITECTURE_IS_NVPTX}>:LIBOMPTARGET_STACK_SIZE=3072>
- $<$<BOOL:${LIBC_TARGET_OS_IS_GPU}>:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
+ ${CMAKE_CROSSCOMPILING_EMULATOR} ${HERMETIC_TEST_LOADER_ARGS}
$<TARGET_FILE:${fq_build_target_name}> ${HERMETIC_TEST_ARGS})
endif()
diff --git a/libc/cmake/modules/prepare_libc_gpu_build.cmake b/libc/cmake/modules/prepare_libc_gpu_build.cmake
index fcf0e38db81af..c87a1df926c85 100644
--- a/libc/cmake/modules/prepare_libc_gpu_build.cmake
+++ b/libc/cmake/modules/prepare_libc_gpu_build.cmake
@@ -66,24 +66,6 @@ else()
endif()
set(LIBC_GPU_TARGET_ARCHITECTURE "${gpu_test_architecture}")
-# Identify the GPU loader utility used to run tests.
-set(LIBC_GPU_LOADER_EXECUTABLE "" CACHE STRING "Executable for the GPU loader.")
-if(LIBC_GPU_LOADER_EXECUTABLE)
- set(gpu_loader_executable ${LIBC_GPU_LOADER_EXECUTABLE})
-else()
- find_program(gpu_loader_executable
- NAMES llvm-gpu-loader NO_DEFAULT_PATH
- PATHS ${LLVM_BINARY_DIR}/bin ${compiler_path})
-endif()
-if(NOT TARGET libc.utils.gpu.loader AND gpu_loader_executable)
- add_custom_target(libc.utils.gpu.loader)
- set_target_properties(
- libc.utils.gpu.loader
- PROPERTIES
- EXECUTABLE "${gpu_loader_executable}"
- )
-endif()
-
# The AMDGPU environment uses different code objects to encode the ABI for
# kernel calls and intrinsic functions. We want to expose this to conform to
# whatever the test suite was built to handle.
diff --git a/libc/docs/gpu/building.rst b/libc/docs/gpu/building.rst
index d80ad0c0ff540..b8f5e79a430e3 100644
--- a/libc/docs/gpu/building.rst
+++ b/libc/docs/gpu/building.rst
@@ -195,6 +195,7 @@ standard runtime build.
commonly run out of resources if this is not constrained so it is recommended
to keep it low. The default value is a single thread.
-**LIBC_GPU_LOADER_EXECUTABLE**:STRING
- Overrides the default loader used for running GPU tests. If this is not
- provided the standard one will be built.
+**CMAKE_CROSSCOMPILING_EMULATOR**:STRING
+ Overrides the default loader used for running GPU tests. This is set
+ automatically to ``llvm-gpu-loader`` for GPU runtime targets when building
+ via the runtimes build.
diff --git a/libc/docs/gpu/testing.rst b/libc/docs/gpu/testing.rst
index 4115e68e5225a..c34467346b798 100644
--- a/libc/docs/gpu/testing.rst
+++ b/libc/docs/gpu/testing.rst
@@ -94,7 +94,7 @@ Running tests
Tests will only be built and run if a GPU target architecture is set and the
corresponding loader utility was built. These can be overridden with the
-``LIBC_GPU_TEST_ARCHITECTURE`` and ``LIBC_GPU_LOADER_EXECUTABLE`` :ref:`CMake
+``LIBC_GPU_TEST_ARCHITECTURE`` and ``CMAKE_CROSSCOMPILING_EMULATOR`` :ref:`CMake
options <gpu_cmake_options>`. Once built, they can be run like any other tests.
The CMake target depends on how the library was built.
diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt
index 60478739aced4..374756e77260e 100644
--- a/libc/test/CMakeLists.txt
+++ b/libc/test/CMakeLists.txt
@@ -16,11 +16,6 @@ add_custom_target(libc-hermetic-tests-build)
add_custom_target(libc-integration-tests-build)
add_custom_target(libc_include_tests-build)
-# Resolve the GPU loader executable path for the lit site config.
-if(TARGET libc.utils.gpu.loader)
- get_target_property(LIBC_GPU_LOADER_EXECUTABLE libc.utils.gpu.loader "EXECUTABLE")
-endif()
-
# Configure the site config file for lit
configure_lit_site_cfg(
${LIBC_SOURCE_DIR}/test/lit.site.cfg.py.in
@@ -34,7 +29,7 @@ configure_lit_site_cfg(
"LLVM_LIBS_DIR"
"LIBC_SOURCE_DIR"
"LIBC_BUILD_DIR"
- "LIBC_GPU_LOADER_EXECUTABLE"
+ "CMAKE_CROSSCOMPILING_EMULATOR"
)
add_lit_testsuite(check-libc-lit
@@ -46,7 +41,7 @@ add_lit_testsuite(check-libc-lit
add_subdirectory(UnitTest)
if(LIBC_TARGET_OS_IS_GPU)
- if(NOT TARGET libc.utils.gpu.loader)
+ if(NOT CMAKE_CROSSCOMPILING_EMULATOR)
message(WARNING "Cannot build libc GPU tests, missing loader.")
return()
elseif(LIBC_GPU_TESTS_DISABLED)
diff --git a/libc/test/lit.site.cfg.py.in b/libc/test/lit.site.cfg.py.in
index 7773bdfdf0e9c..7727f16956d73 100644
--- a/libc/test/lit.site.cfg.py.in
+++ b/libc/test/lit.site.cfg.py.in
@@ -8,11 +8,11 @@ config.llvm_tools_dir = lit_config.substitute(path(r"@LLVM_TOOLS_DIR@"))
config.libc_src_root = path(r"@LIBC_SOURCE_DIR@")
config.libc_obj_root = path(r"@LIBC_BUILD_DIR@")
config.libc_test_cmd = "@LIBC_TEST_CMD@"
-config.libc_gpu_loader = path(r"@LIBC_GPU_LOADER_EXECUTABLE@")
+config.libc_crosscompiling_emulator = path(r"@CMAKE_CROSSCOMPILING_EMULATOR@")
-# If running GPU tests and no explicit test command is set, use the GPU loader.
-if not config.libc_test_cmd and config.libc_gpu_loader:
- config.libc_test_cmd = config.libc_gpu_loader + " @BINARY@"
+# If no explicit test command is set, use the cross-compiling emulator.
+if not config.libc_test_cmd and config.libc_crosscompiling_emulator:
+ config.libc_test_cmd = config.libc_crosscompiling_emulator + " @BINARY@"
# Add libc's utils directory to the path so we can import the test format.
site.addsitedir(os.path.join(config.libc_src_root, "utils"))
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index f22b551b89aca..a6e209a7ab204 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -674,9 +674,17 @@ if(build_runtimes)
check_apple_target(${name} runtime)
+ set(per_target_cmake_args)
+ if(LLVM_LIBC_GPU_BUILD AND TARGET llvm-gpu-loader
+ AND "${name}" MATCHES "^(amdgcn|nvptx64)")
+ list(APPEND per_target_cmake_args
+ "-DCMAKE_CROSSCOMPILING_EMULATOR=${LLVM_TOOLS_BINARY_DIR}/llvm-gpu-loader${CMAKE_EXECUTABLE_SUFFIX}")
+ endif()
+
runtime_register_target(${name}
DEPENDS ${builtins_dep_name} ${extra_deps}
- CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name} ${extra_cmake_args}
+ CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name}
+ ${extra_cmake_args} ${per_target_cmake_args}
EXTRA_ARGS TARGET_TRIPLE ${name} ${extra_args})
endforeach()
``````````
</details>
https://github.com/llvm/llvm-project/pull/189417
More information about the libc-commits mailing list