[Openmp-commits] [llvm] [openmp] [OpenMP][OMPX] Add ballot_sync (PR #91297)

Shilei Tian via Openmp-commits openmp-commits at lists.llvm.org
Tue May 21 21:49:35 PDT 2024


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/91297

>From 4c62bca156c20d78615a2c24964866c81bf1c65c Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 22 May 2024 00:33:00 -0400
Subject: [PATCH 1/2] [NFC][OpenMP][OMPX] Use
 `__attribute__((__always_inline__))` instead of `inline`

---
 openmp/runtime/src/include/ompx.h.var | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index 579d31aa98c54..b878f19a42960 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -9,6 +9,8 @@
 #ifndef __OMPX_H
 #define __OMPX_H
 
+#define INLINE [[clang::always_inline]]
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -57,7 +59,7 @@ enum {
 /// ompx_{thread,block}_{id,dim}
 ///{
 #define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(NAME, VALUE)                     \
-  static inline int ompx_##NAME(int Dim) { return VALUE; }
+  static INLINE int ompx_##NAME(int Dim) { return VALUE; }
 
 _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(thread_id,
                                       omp_get_ancestor_thread_num(Dim + 1))
@@ -70,7 +72,7 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_C(grid_dim, 1)
 /// ompx_{sync_block}_{,divergent}
 ///{
 #define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(RETTY, NAME, ARGS, BODY)         \
-  static inline RETTY ompx_##NAME(ARGS) { BODY; }
+  static INLINE RETTY ompx_##NAME(ARGS) { BODY; }
 
 _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block, int Ordering,
                                       _Pragma("omp barrier"));
@@ -85,7 +87,7 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
 
 /// ompx_{sync_block}_{,divergent}
 ///{
-#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS)         \
+#define _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(RETTY, NAME, ARGS)                    \
   RETTY ompx_##NAME(ARGS);
 
 _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block, int Ordering);
@@ -98,9 +100,9 @@ _TGT_KERNEL_LANGUAGE_DECL_SYNC_C(void, sync_block_divergent, int Ordering);
 ///{
 #define _TGT_KERNEL_LANGUAGE_DECL_GRID_C(NAME)                                 \
   int ompx_##NAME(int Dim);                                                    \
-  static inline int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); }      \
-  static inline int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); }      \
-  static inline int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); }
+  static INLINE int ompx_##NAME##_x() { return ompx_##NAME(ompx_dim_x); }      \
+  static INLINE int ompx_##NAME##_y() { return ompx_##NAME(ompx_dim_y); }      \
+  static INLINE int ompx_##NAME##_z() { return ompx_##NAME(ompx_dim_z); }
 
 _TGT_KERNEL_LANGUAGE_DECL_GRID_C(thread_id)
 _TGT_KERNEL_LANGUAGE_DECL_GRID_C(block_dim)
@@ -134,10 +136,10 @@ enum {
 /// ompx::{thread,block}_{id,dim}_{,x,y,z}
 ///{
 #define _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(NAME)                          \
-  static inline int NAME(int Dim) noexcept { return ompx_##NAME(Dim); }        \
-  static inline int NAME##_x() noexcept { return NAME(ompx_dim_x); }           \
-  static inline int NAME##_y() noexcept { return NAME(ompx_dim_y); }           \
-  static inline int NAME##_z() noexcept { return NAME(ompx_dim_z); }
+  static INLINE int NAME(int Dim) noexcept { return ompx_##NAME(Dim); }        \
+  static INLINE int NAME##_x() noexcept { return NAME(ompx_dim_x); }           \
+  static INLINE int NAME##_y() noexcept { return NAME(ompx_dim_y); }           \
+  static INLINE int NAME##_z() noexcept { return NAME(ompx_dim_z); }
 
 _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(thread_id)
 _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(block_dim)
@@ -149,7 +151,7 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_GRID_CXX(grid_dim)
 /// ompx_{sync_block}_{,divergent}
 ///{
 #define _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(RETTY, NAME, ARGS, CALL_ARGS)  \
-  static inline RETTY NAME(ARGS) {               \
+  static INLINE RETTY NAME(ARGS) {                                             \
     return ompx_##NAME(CALL_ARGS);                                             \
   }
 
@@ -165,4 +167,6 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent,
 
 ///}
 
+#undef INLINE
+
 #endif /* __OMPX_H */

>From 1a906703be643158da91866969bfc5d2d5f34fb2 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Wed, 22 May 2024 00:49:21 -0400
Subject: [PATCH 2/2] [OpenMP][OMPX] Add `ballot_sync`

---
 offload/DeviceRTL/include/Utils.h             |  2 +
 offload/DeviceRTL/src/Mapping.cpp             |  4 ++
 offload/DeviceRTL/src/Utils.cpp               | 12 ++++++
 .../test/offloading/ompx_bare_ballot_sync.c   | 39 +++++++++++++++++++
 openmp/runtime/src/include/ompx.h.var         | 12 ++++++
 5 files changed, 69 insertions(+)
 create mode 100644 offload/test/offloading/ompx_bare_ballot_sync.c

diff --git a/offload/DeviceRTL/include/Utils.h b/offload/DeviceRTL/include/Utils.h
index 4ab0aea46eea1..5048345fdbc11 100644
--- a/offload/DeviceRTL/include/Utils.h
+++ b/offload/DeviceRTL/include/Utils.h
@@ -25,6 +25,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
 
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width);
 
+uint32_t ballotSync(uint32_t Mask, int32_t Pred);
+
 /// Return \p LowBits and \p HighBits packed into a single 64 bit value.
 uint64_t pack(uint32_t LowBits, uint32_t HighBits);
 
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index b2028a8fb4f50..4f39d2a299ee6 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -364,4 +364,8 @@ _TGT_KERNEL_LANGUAGE(block_id, getBlockIdInKernel)
 _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock)
 _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel)
 
+extern "C" uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
+  return utils::ballotSync(mask, pred);
+}
+
 #pragma omp end declare target
diff --git a/offload/DeviceRTL/src/Utils.cpp b/offload/DeviceRTL/src/Utils.cpp
index d07ac0fb499c9..be0432d0f5be8 100644
--- a/offload/DeviceRTL/src/Utils.cpp
+++ b/offload/DeviceRTL/src/Utils.cpp
@@ -37,6 +37,8 @@ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane);
 int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
                     int32_t Width);
 
+uint64_t ballotSync(uint64_t Mask, int32_t Pred);
+
 /// AMDGCN Implementation
 ///
 ///{
@@ -57,6 +59,12 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta,
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
 }
 
+uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
+  return Mask &
+         (__AMDGCN_WAVEFRONT_SIZE == 32 ? __builtin_amdgcn_ballot_w32(Pred)
+                                        : __builtin_amdgcn_ballot_w64(Pred));
+}
+
 bool isSharedMemPtr(const void *Ptr) {
   return __builtin_amdgcn_is_shared(
       (const __attribute__((address_space(0))) void *)Ptr);
@@ -80,6 +88,10 @@ int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
 }
 
+uint64_t ballotSync(uint64_t Mask, int32_t Pred) {
+  return __nvvm_vote_ballot_sync(static_cast<uint32_t>(Mask), Pred);
+}
+
 bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }
 
 #pragma omp end declare variant
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
new file mode 100644
index 0000000000000..f22c868ec7720
--- /dev/null
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -0,0 +1,39 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+//
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: s390x-ibm-linux-gnu
+// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
+
+#include <assert.h>
+#include <ompx.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char *argv[]) {
+  const int num_blocks = 1;
+  const int block_size = 64;
+  const int N = num_blocks * block_size;
+  uint64_t *data = (int *)malloc(N * sizeof(uint64_t));
+
+  for (int i = 0; i < N; ++i)
+    data[i] = i & 0x1;
+
+#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) map(tofrom: data[0:N])
+  {
+    int tid = ompx_thread_id_x();
+    uint64_t mask = ompx_ballot_sync(~0U, data[tid]);
+    data[tid] += mask;
+  }
+
+  for (int i = 0; i < N; ++i)
+    assert(data[i] == ((i & 0x1) + 0xaaaaaaaa));
+
+  // CHECK: PASS
+  printf("PASS\n");
+
+  return 0;
+}
diff --git a/openmp/runtime/src/include/ompx.h.var b/openmp/runtime/src/include/ompx.h.var
index b878f19a42960..675c29820d4fd 100644
--- a/openmp/runtime/src/include/ompx.h.var
+++ b/openmp/runtime/src/include/ompx.h.var
@@ -11,6 +11,8 @@
 
 #define INLINE [[clang::always_inline]]
 
+typedef unsigned long uint64_t;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -83,6 +85,10 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C(void, sync_block_divergent, int Ordering,
 #undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_C
 ///}
 
+static INLINE uint64_t ompx_ballot_sync(uint64_t mask, int pred) {
+  __builtin_trap();
+}
+
 #pragma omp end declare variant
 
 /// ompx_{sync_block}_{,divergent}
@@ -111,6 +117,8 @@ _TGT_KERNEL_LANGUAGE_DECL_GRID_C(grid_dim)
 #undef _TGT_KERNEL_LANGUAGE_DECL_GRID_C
 ///}
 
+uint64_t ompx_ballot_sync(uint64_t mask, int pred);
+
 #ifdef __cplusplus
 }
 #endif
@@ -162,6 +170,10 @@ _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX(void, sync_block_divergent,
 #undef _TGT_KERNEL_LANGUAGE_HOST_IMPL_SYNC_CXX
 ///}
 
+static INLINE uint64_t ballot_sync(uint64_t mask, int pred) {
+  return ompx_ballot_sync(mask, pred);
+}
+
 } // namespace ompx
 #endif
 



More information about the Openmp-commits mailing list