[libc-commits] [clang] [libc] [Clang] Add handlers for 'match_any' and 'match_all' to `gpuintrin.h` (PR #127504)
via libc-commits
libc-commits at lists.llvm.org
Mon Feb 17 07:09:29 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Joseph Huber (jhuber6)
Summary:
These helpers are very useful but currently absent. They allow the user
to get a bitmask of the lanes in the warp whose value matches their own.
I have made an executive decision to drop the `predicate` return from
`match_all`, since it is easily recovered by checking
`match_all() == __activemask()`.
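
For callers that still want the CUDA-style `predicate`, it can be recovered from the returned mask. Below is a minimal sketch with a hypothetical caller (`warp_uniform` is not part of this patch); it only assumes the new `__gpu_*` helpers and that the full active-lane mask is passed in:

```cpp
#include <gpuintrin.h>
#include <stdint.h>

// Hypothetical caller: recover the dropped 'predicate' output.
static inline bool warp_uniform(uint32_t key) {
  uint64_t mask = __gpu_lane_mask(); // all currently active lanes
  // __gpu_match_all_u32 returns the active-lane mask when every lane holds
  // the same value and 0 otherwise, so the old 'predicate' is this test.
  return __gpu_match_all_u32(mask, key) == mask;
}
```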
---
Full diff: https://github.com/llvm/llvm-project/pull/127504.diff
5 Files Affected:
- (modified) clang/lib/Headers/amdgpuintrin.h (+56)
- (modified) clang/lib/Headers/nvptxintrin.h (+74)
- (modified) libc/src/__support/GPU/utils.h (+8)
- (modified) libc/test/integration/src/__support/GPU/CMakeLists.txt (+9)
- (added) libc/test/integration/src/__support/GPU/match.cpp (+32)
``````````diff
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 9dad99ffe9439..355e75d0b2d42 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ uint64_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ __gpu_sync_lane(__lane_mask);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ __gpu_sync_lane(__lane_mask);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 40fa2edebe975..f857a87b5f4c7 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -13,6 +13,10 @@
#error "This file is intended for NVPTX targets or offloading to NVPTX"
#endif
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 0
+#endif
+
#include <stdint.h>
#if !defined(__cplusplus)
@@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_any_sync_i32(__lane_mask, __x);
+
+ uint32_t __match_mask = 0;
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_any_sync_i64(__lane_mask, __x);
+
+ uint64_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ int predicate;
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ int predicate;
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
// Returns true if the flat pointer points to CUDA 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __nvvm_isspacep_shared(ptr);
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 323c003f1ff07..0fd3a6498b865 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}
+LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
+ return __gpu_match_any_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
+ return __gpu_match_all_u32(lane_mask, x);
+}
+
[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 68bbc3849bc7e..e066830f6cc0d 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -18,3 +18,12 @@ add_integration_test(
LOADER_ARGS
--threads 64
)
+
+add_integration_test(
+ match_test
+ SUITE libc-support-gpu-tests
+ SRCS
+ match.cpp
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/match.cpp b/libc/test/integration/src/__support/GPU/match.cpp
new file mode 100644
index 0000000000000..225078022cdc3
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/match.cpp
@@ -0,0 +1,32 @@
+//===-- Test for the match operations on the GPU --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to ensure that match any / match all work.
+static void test_match() {
+ uint64_t mask = gpu::get_lane_mask();
+ EXPECT_EQ(1ull << gpu::get_lane_id(), gpu::match_any(mask, gpu::get_lane_id()));
+ EXPECT_EQ(mask, gpu::match_any(mask, 1));
+ uint64_t expected = gpu::get_lane_id() < 16 ? 0xffffull : mask & ~0xffffull;
+ EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16));
+ EXPECT_EQ(mask, gpu::match_all(mask, 1));
+ EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id()));
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+ if (gpu::get_thread_id() >= gpu::get_lane_size())
+ return 0;
+
+ test_match();
+
+ return 0;
+}
``````````
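
As a usage illustration (not part of this patch), the new `gpu::match_any` wrapper makes it easy to elect one leader lane per group of lanes holding the same key. The sketch below is hypothetical (`is_group_leader` is an invented helper) and only assumes the wrappers added in `libc/src/__support/GPU/utils.h` plus `__builtin_ctzll`:

```cpp
#include "src/__support/GPU/utils.h"

using namespace LIBC_NAMESPACE;

// Hypothetical helper: true on exactly one lane of each group of lanes
// whose 'key' values compare equal.
static bool is_group_leader(uint64_t lane_mask, uint32_t key) {
  // All lanes in 'lane_mask' holding the same 'key' as the calling lane;
  // the calling lane's own bit is always set, so 'peers' is non-zero.
  uint64_t peers = gpu::match_any(lane_mask, key);
  // Elect the lowest-numbered matching lane as the leader.
  return gpu::get_lane_id() == static_cast<uint32_t>(__builtin_ctzll(peers));
}
```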
https://github.com/llvm/llvm-project/pull/127504
More information about the libc-commits mailing list