[clang] [Headers][NFC] Deduplicate gpu_match_ between targets via inlining (PR #131141)
Jon Chesterfield via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 13 08:29:08 PDT 2025
https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/131141
>From fbeb177a750ca671a9cff9f37f57e58c6900e7fd Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Thu, 13 Mar 2025 13:23:38 +0000
Subject: [PATCH] [Headers][NFC] Deduplicate gpu_match_ between targets via
inlining
---
clang/lib/Headers/amdgpuintrin.h | 44 ++---------------
clang/lib/Headers/gpuintrin.h | 82 +++++++++++++++++++++++++++++++-
clang/lib/Headers/nvptxintrin.h | 48 ++++---------------
3 files changed, 93 insertions(+), 81 deletions(-)
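As a usage reference (not part of the patch), here is a minimal sketch of calling the portable wrapper that this change routes through the shared *_impl functions. It assumes <gpuintrin.h> is on the include path; the kernel name and __gpu_thread_id_x() are assumptions for illustration, while __gpu_kernel, __gpu_lane_mask() and __gpu_match_any_u32() are taken from the headers touched here.

#include <gpuintrin.h>
#include <stdint.h>

// Hypothetical kernel: for each thread, count how many lanes in the
// warp/wavefront hold the same input value.
__gpu_kernel void count_matching_lanes(uint32_t *out, const uint32_t *in) {
  uint32_t tid = __gpu_thread_id_x(); // assumed to be provided by gpuintrin.h
  uint32_t x = in[tid];
  uint64_t mask = __gpu_lane_mask();  // mask of currently active lanes
  // Lanes holding equal values of x receive identical bitmasks, whether the
  // target lowers this to __nvvm_match_any_sync_i32 or to the generic ballot
  // loop now living in gpuintrin.h.
  uint64_t peers = __gpu_match_any_u32(mask, x);
  out[tid] = (uint32_t)__builtin_popcountll(peers);
}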
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 56748f6c3e818..f7fb8e2814180 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -30,10 +30,6 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
// Returns the number of workgroups in the 'x' dimension of the grid.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -146,57 +142,25 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
- uint32_t __match_mask = 0;
-
- bool __done = 0;
- while (__gpu_ballot(__lane_mask, !__done)) {
- if (!__done) {
- uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
- if (__first == __x) {
- __match_mask = __gpu_lane_mask();
- __done = 1;
- }
- }
- }
- __gpu_sync_lane(__lane_mask);
- return __match_mask;
+ return __gpu_match_any_u32_impl(__lane_mask, __x);
}
// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
- uint64_t __match_mask = 0;
-
- bool __done = 0;
- while (__gpu_ballot(__lane_mask, !__done)) {
- if (!__done) {
- uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
- if (__first == __x) {
- __match_mask = __gpu_lane_mask();
- __done = 1;
- }
- }
- }
- __gpu_sync_lane(__lane_mask);
- return __match_mask;
+ return __gpu_match_any_u64_impl(__lane_mask, __x);
}
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
- uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
- uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
- __gpu_sync_lane(__lane_mask);
- return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+ return __gpu_match_all_u32_impl(__lane_mask, __x);
}
// Returns the current lane mask if every lane contains __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
- uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
- uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
- __gpu_sync_lane(__lane_mask);
- return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+ return __gpu_match_all_u64_impl(__lane_mask, __x);
}
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index ac79d685337c5..0fb3916acac61 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -32,6 +32,30 @@ _Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif
+_Pragma("omp begin declare target device_type(nohost)");
+_Pragma("omp begin declare variant match(device = {kind(gpu)})");
+
+// Forward declare a few functions for the implementation header.
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x);
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x);
+
+_Pragma("omp end declare variant");
+_Pragma("omp end declare target");
+
#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
@@ -115,7 +139,7 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
}
-// Copies the value from the first active thread in the wavefront to the rest.
+// Copies the value from the first active thread to the rest.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
@@ -234,6 +258,62 @@ __DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
#undef __DO_LANE_SUM
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32_impl(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64_impl(uint64_t __lane_mask, uint64_t __x) {
+ uint64_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32_impl(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ __gpu_sync_lane(__lane_mask);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64_impl(uint64_t __lane_mask, uint64_t __x) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ __gpu_sync_lane(__lane_mask);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
_Pragma("omp end declare variant");
_Pragma("omp end declare target");
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 10ad7a682d4cd..fb811d0d58394 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -34,10 +34,6 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
// Attribute to declare a function as a kernel.
#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
-// Defined in gpuintrin.h, used later in this file.
-_DEFAULT_FN_ATTRS static __inline__ uint64_t
-__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
-
// Returns the number of CUDA blocks in the 'x' dimension.
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
return __nvvm_read_ptx_sreg_nctaid_x();
@@ -156,20 +152,9 @@ __gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
// Newer targets can use the dedicated CUDA support.
#if __CUDA_ARCH__ >= 700
return __nvvm_match_any_sync_i32(__lane_mask, __x);
+#else
+ return __gpu_match_any_u32_impl(__lane_mask, __x);
#endif
-
- uint32_t __match_mask = 0;
- bool __done = 0;
- while (__gpu_ballot(__lane_mask, !__done)) {
- if (!__done) {
- uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
- if (__first == __x) {
- __match_mask = __gpu_lane_mask();
- __done = 1;
- }
- }
- }
- return __match_mask;
}
// Returns a bitmask marking all lanes that have the same value of __x.
@@ -178,22 +163,9 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
// Newer targets can use the dedicated CUDA support.
#if __CUDA_ARCH__ >= 700
return __nvvm_match_any_sync_i64(__lane_mask, __x);
+#else
+ return __gpu_match_any_u64_impl(__lane_mask, __x);
#endif
-
- uint64_t __match_mask = 0;
-
- bool __done = 0;
- while (__gpu_ballot(__lane_mask, !__done)) {
- if (!__done) {
- uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
- if (__first == __x) {
- __match_mask = __gpu_lane_mask();
- __done = 1;
- }
- }
- }
- __gpu_sync_lane(__lane_mask);
- return __match_mask;
}
// Returns the current lane mask if every lane contains __x.
@@ -203,11 +175,9 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
#if __CUDA_ARCH__ >= 700
int predicate;
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+#else
+ return __gpu_match_all_u32_impl(__lane_mask, __x);
#endif
-
- uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
- uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
- return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}
// Returns the current lane mask if every lane contains __x.
@@ -217,11 +187,9 @@ __gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
#if __CUDA_ARCH__ >= 700
int predicate;
return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+#else
+ return __gpu_match_all_u64_impl(__lane_mask, __x);
#endif
-
- uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
- uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
- return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}
// Returns true if the flat pointer points to CUDA 'shared' memory.
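A note on the resulting layout (not part of the diff): gpuintrin.h now forward-declares the *_impl functions before including the target header and defines them afterwards, so a target header only carries its target-specific fast paths and forwards the portable fallback. A hypothetical additional backend would need nothing more than wrappers of this shape (pattern copied from the amdgpuintrin.h hunk above):

_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
  // No per-target loop any more; reuse the generic ballot-based implementation.
  return __gpu_match_any_u32_impl(__lane_mask, __x);
}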