[Openmp-commits] [openmp] 44ee974 - [libomptarget][nfc] Refactor amdgpu partial barrier to simplify adding a second one

Jon Chesterfield via Openmp-commits openmp-commits at lists.llvm.org
Thu May 6 15:52:54 PDT 2021


Author: Jon Chesterfield
Date: 2021-05-06T23:52:19+01:00
New Revision: 44ee974e2f3ef120e1890d8aafb02fedc3c135e9

URL: https://github.com/llvm/llvm-project/commit/44ee974e2f3ef120e1890d8aafb02fedc3c135e9
DIFF: https://github.com/llvm/llvm-project/commit/44ee974e2f3ef120e1890d8aafb02fedc3c135e9.diff

LOG: [libomptarget][nfc] Refactor amdgpu partial barrier to simplify adding a second one

[libomptarget][nfc] Refactor amdgpu partial barrier to simplify adding a second one

D101976 would require a second barrier instance. This NFC change to amdgpu makes
it simpler to add one (an extra global, plus one more line in init). It also
renames the current barrier from L1 to L0.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D102016

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 63a7091ec530f..4c99a096916a3 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -52,15 +52,8 @@ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
   return __builtin_amdgcn_read_exec();
 }
 
-uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]];
-#pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc)
-
-EXTERN void __kmpc_impl_target_init() {
-  // Don't have global ctors, and shared memory is not zero init
-  __atomic_store_n(&__kmpc_L1_Barrier, 0u, __ATOMIC_RELEASE);
-}
-
-EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
+static void pteam_mem_barrier(uint32_t num_threads, uint32_t * barrier_state)
+{
   __atomic_thread_fence(__ATOMIC_ACQUIRE);
 
   uint32_t num_waves = num_threads / WARPSIZE;
@@ -81,7 +74,7 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
   bool isLowest = GetLaneId() == lowestActiveThread;
 
   if (isLowest) {
-    uint32_t load = __atomic_fetch_add(&__kmpc_L1_Barrier, 1,
+    uint32_t load = __atomic_fetch_add(barrier_state, 1,
                                        __ATOMIC_RELAXED); // commutative
 
     // Record the number of times the barrier has been passed
@@ -94,18 +87,30 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
       load &= 0xffff0000u; // because bits zeroed second
 
       // Reset the wave counter and release the waiting waves
-      __atomic_store_n(&__kmpc_L1_Barrier, load, __ATOMIC_RELAXED);
+      __atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
     } else {
       // more waves still to go, spin until generation counter changes
       do {
         __builtin_amdgcn_s_sleep(0);
-        load = __atomic_load_n(&__kmpc_L1_Barrier, __ATOMIC_RELAXED);
+        load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
       } while ((load & 0xffff0000u) == generation);
     }
   }
   __atomic_thread_fence(__ATOMIC_RELEASE);
 }
 
+uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
+#pragma allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)
+
+EXTERN void __kmpc_impl_target_init() {
+  // Don't have global ctors, and shared memory is not zero init
+  __atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
+}
+
+EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
+  pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
+}
+
 namespace {
 uint32_t get_grid_dim(uint32_t n, uint16_t d) {
   uint32_t q = n / d;


        


More information about the Openmp-commits mailing list