[Openmp-commits] [llvm] [openmp] [OpenMP][OMPX] Add ballot_sync (PR #91297)
via Openmp-commits
openmp-commits at lists.llvm.org
Mon May 6 21:19:10 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-offload
Author: Shilei Tian (shiltian)
<details>
<summary>Changes</summary>
- **[NFC][OpenMP][OMPX] Use `__attribute__((__always_inline__))` instead of `inline`**
- **[OpenMP][OMPX] Add `ballot_sync`**
---
Full diff: https://github.com/llvm/llvm-project/pull/91297.diff
5 Files Affected:
- (modified) offload/DeviceRTL/include/Utils.h (+2)
- (modified) offload/DeviceRTL/src/Mapping.cpp (+4)
- (modified) offload/DeviceRTL/src/Utils.cpp (+12)
- (added) offload/test/offloading/ompx_bare_ballot_sync.c (+38)
- (modified) openmp/runtime/src/include/ompx.h.var (+25-11)
``````````diff
diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h
index 4ab0aea46eea122..5048345fdbc1131 100644
--- a/offload/DeviceRTL/include/Utils.h
+++ b/offload/DeviceRTL/include/Utils.h
@@ -25,6 +25,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
+uint32_t ballotSync(uint32_t Mask, int32_t Pred);
+
/// Return \p LowBits and \p HighBits packed into a single 64 bit value.
uint64_t pack(uint32_t LowBits, uint32_t HighBits);
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index b2028a8fb4f5069..2894a4885292ecd 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -364,4 +364,8 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
_TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
_TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
+extern "C" unsigned ompx_ballot_sync(unsigned mask, int pred) {
+ return utils::ballotSync(mask, pred);
+}
+
#pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp
index d07ac0fb499c941..70df7d52822e0b7 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/Utils.cpp
@@ -37,6 +37,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
int32_t Width);
+uint32_t ballotSync(uint32_t Mask, int32_t Pred);
+
/// AMDGCN Implementation
///
///{
@@ -57,6 +59,12 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
+uint32_t ballotSync(uint32_t Mask, int32_t Pred) {
+ if (__AMDGCN_WAVEFRONT_SIZE == 32)
+ return __builtin_amdgcn_ballot_w32(Pred);
+ return __builtin_amdgcn_ballot_w64(Pred);
+}
+
bool isSharedMemPtr(const void *Ptr) {
return __builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void *)Ptr);
@@ -80,6 +88,10 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}
+uint32_t ballotSync(uint32_t Mask, int32_t Pred) {
+ return __nvvm_vote_ballot_sync(Mask, Pred);
+}
+
bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
#pragma omp end declare variant
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
new file mode 100644
index 000000000000000..0bd6355d9ef6721
--- /dev/null
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -0,0 +1,38 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <assert.h>
+#include <ompx.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) {
+ const int num_blocks = 1;
+ const int block_size = 64;
+ const int N = num_blocks * block_size;
+ unsigned *data = (int *)malloc(N * sizeof(unsigned));
+
+ for (int i = 0; i < N; ++i)
+ data[i] = i & 0x1;
+
+#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(tofrom: data[0:N])
+ {
+ int tid = ompx_thread_id_x();
+ unsigned mask = ompx_ballot_sync(0xffffffff, data[tid]);
+ data[tid] += mask;
+ }
+
+ for (int i = 0; i < N; ++i)
+ assert(data[i] == ((i & 0x1) + 0xaaaaaaaa));
+
+ // CHECK: PASS
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 579d31aa98c54e6..827a28284c31366 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -9,6 +9,8 @@
#ifndef __OMPX_H
#define __OMPX_H
+#define INLINE [[clang::always_inline]]
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -57,7 +59,7 @@ enum {
/// ompx_{thread,block}_{id,dim}
///{
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(NAME, VALUE) \
- static inline int ompx_##NAME(int Dim) { return VALUE; }
+ static INLINE int ompx_##NAME(int Dim) { return VALUE; }
_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(thread_id,
omp_get_ancestor_thread_num(Dim + 1))
@@ -70,7 +72,7 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(grid_dim, 1)
/// ompx_{sync_block}_{,divergent}
///{
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(RETTY, NAME, ARGS, BODY) \
- static inline RETTY ompx_##NAME(ARGS) { BODY; }
+ static INLINE RETTY ompx_##NAME(ARGS) { BODY; }
_TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block, int Ordering,
_Pragma("omp barrier"));
@@ -81,11 +83,15 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C
///}
+static INLINE unsigned ompx_ballot_sync(unsigned mask, int pred) {
+ __builtin_trap();
+}
+
#pragma omp end declare variant
/// ompx_{sync_block}_{,divergent}
///{
-#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS) \
+#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS) \
RETTY ompx_##NAME(ARGS);
_TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block, int Ordering);
@@ -98,9 +104,9 @@ _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_divergent, int Ordering);
///{
#define _TGT_KERNEL_LANGUAGE_DECL_GRID_C(NAME) \
int ompx_##NAME(int Dim); \
- static inline int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); } \
- static inline int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); } \
- static inline int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); }
+ static INLINE int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); } \
+ static INLINE int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); } \
+ static INLINE int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); }
_TGT_KERNEL_LANGUAGE_DECL_GRID_C(thread_id)
_TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_dim)
@@ -109,6 +115,8 @@ _TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim)
#undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C
///}
+unsigned ompx_ballot_sync(unsigned mask, int pred);
+
#ifdef __cplusplus
}
#endif
@@ -134,10 +142,10 @@ enum {
/// ompx::{thread,block}_{id,dim}_{,x,y,z}
///{
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(NAME) \
- static inline int NAME(int Dim) noexcept { return ompx_##NAME(Dim); } \
- static inline int NAME##_x() noexcept { return NAME(ompx_dim_x); } \
- static inline int NAME##_y() noexcept { return NAME(ompx_dim_y); } \
- static inline int NAME##_z() noexcept { return NAME(ompx_dim_z); }
+ static INLINE int NAME(int Dim) noexcept { return ompx_##NAME(Dim); } \
+ static INLINE int NAME##_x() noexcept { return NAME(ompx_dim_x); } \
+ static INLINE int NAME##_y() noexcept { return NAME(ompx_dim_y); } \
+ static INLINE int NAME##_z() noexcept { return NAME(ompx_dim_z); }
_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(thread_id)
_TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_dim)
@@ -149,7 +157,7 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(grid_dim)
/// ompx_{sync_block}_{,divergent}
///{
#define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(RETTY, NAME, ARGS, CALL_ARGS) \
- static inline RETTY NAME(ARGS) { \
+ static INLINE RETTY NAME(ARGS) { \
return ompx_##NAME(CALL_ARGS); \
}
@@ -160,9 +168,15 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent,
#undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX
///}
+static INLINE unsigned ballot_sync(unsigned mask, int pred) {
+ return ompx_ballot_sync(mask, pred);
+}
+
} // namespace ompx
#endif
///}
+#undef INLINE
+
#endif /* __OMPX_H */
``````````
</details>
https://github.com/llvm/llvm-project/pull/91297
More information about the Openmp-commits
mailing list