[Openmp-commits] [openmp] 8b6cd15 - [libomptarget][amdgcn] Implement partial barrier

Mon Oct 12 13:27:45 PDT 2020

Author: JonChesterfield
Date: 2020-10-12T21:27:32+01:00
New Revision: 8b6cd15242673c04618fb0aafc07d5de9e0bbe1e

URL: https://github.com/llvm/llvm-project/commit/8b6cd15242673c04618fb0aafc07d5de9e0bbe1e
DIFF: https://github.com/llvm/llvm-project/commit/8b6cd15242673c04618fb0aafc07d5de9e0bbe1e.diff

LOG: [libomptarget][amdgcn] Implement partial barrier

[libomptarget][amdgcn] Implement partial barrier

named_sync is used to coordinate non-SPMD kernels. On nvptx it is implemented
with bar.sync. There is no corresponding ISA support on amdgcn, so there it is
implemented using one word of shared memory, initialized to zero.

Each wave increments the variable by one. Whichever wave arrives last is
responsible for resetting the wave count to zero, at which point it and the
other waves continue.

A wave can reach the barrier again before another wave has noticed that it was
released from the previous pass. This race is handled with a generation
counter, packed into the same word.

Uses a shared variable that is not needed on nvptx. Introduces a new hook,
__kmpc_impl_target_init, to allow different targets to do extra initialization.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D88602

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
    openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index 8afc5e77996a..34794587e0fe 100644

--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -109,11 +109,11 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
   // AMDGCN doesn't need to sync threads in a warp
 }
 
-INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
-  (void)num_threads;
-  // TODO: Implement on top of __SHARED__
-  __builtin_amdgcn_s_barrier();
-}
+// AMDGCN specific kernel initialization
+DEVICE void __kmpc_impl_target_init();
+
+// Equivalent to ptx bar.sync 1. Barrier until num_threads arrive.
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
 
 INLINE void __kmpc_impl_threadfence() {
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 9807483d4c42..4c3d421c78cc 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -62,6 +62,59 @@ DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
   return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
 
+static DEVICE SHARED uint32_t L1_Barrier;
+
+DEVICE void __kmpc_impl_target_init() {
+  // Don't have global ctors, and shared memory is not zero init
+  __atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);
+}
+
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
+  __atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+  uint32_t num_waves = num_threads / WARPSIZE;
+
+  // Partial barrier implementation for amdgcn.
+  // Uses two 16 bit unsigned counters. One for the number of waves to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of waves, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  assert(num_waves != 0);
+  assert(num_waves * WARPSIZE == num_threads);
+  assert(num_waves < 0xffffu);
+
+  // Increment the low 16 bits once, using the lowest active thread.
+  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
+  bool isLowest = GetLaneId() == lowestActiveThread;
+
+  if (isLowest) {
+    uint32_t load =
+        __atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative
+
+    // Record the number of times the barrier has been passed
+    uint32_t generation = load & 0xffff0000u;
+
+    if ((load & 0x0000ffffu) == (num_waves - 1)) {
+      // Reached num_waves in low bits so this is the last wave.
+      // Set low bits to zero and increment high bits
+      load += 0x00010000u; // wrap is safe
+      load &= 0xffff0000u; // because bits zeroed second
+
+      // Reset the wave counter and release the waiting waves
+      __atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);
+    } else {
+      // more waves still to go, spin until generation counter changes
+      do {
+        __builtin_amdgcn_s_sleep(0);
+        load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
+      } while ((load & 0xffff0000u) == generation);
+    }
+  }
+  __atomic_thread_fence(__ATOMIC_RELEASE);
+}
+
 EXTERN uint64_t __ockl_get_local_size(uint32_t);
 EXTERN uint64_t __ockl_get_num_groups(uint32_t);
 DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); }

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index 5ccc84539400..d9ee95e4f423 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -63,6 +63,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
   nThreads = GetNumberOfThreadsInBlock();
   threadLimit = ThreadLimit;
+  __kmpc_impl_target_init();
 }
 
 EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index f7bc7e14c528..46ce751c44c4 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -183,6 +183,11 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #endif // CUDA_VERSION
 }
 
+// NVPTX specific kernel initialization
+INLINE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
+}
+
+// Barrier until num_threads arrive.
 INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
   // The named barrier for active parallel threads of a team in an L1 parallel
   // region to synchronize with each other.