[Openmp-commits] [openmp] b16aadf - [OpenMP] Introduce aligned synchronization into the new device RT

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Wed Oct 27 16:22:46 PDT 2021


Author: Johannes Doerfert
Date: 2021-10-27T18:22:31-05:00
New Revision: b16aadf0a79a2796c9ac68d1bfc8853ccf25c459

URL: https://github.com/llvm/llvm-project/commit/b16aadf0a79a2796c9ac68d1bfc8853ccf25c459
DIFF: https://github.com/llvm/llvm-project/commit/b16aadf0a79a2796c9ac68d1bfc8853ccf25c459.diff

LOG: [OpenMP] Introduce aligned synchronization into the new device RT

We will later use the fact that a barrier is aligned to reason about
thread divergence. For now we introduce the assumption and some more
documentation.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D112153

Added: 
    

Modified: 
    openmp/libomptarget/DeviceRTL/include/Synchronization.h
    openmp/libomptarget/DeviceRTL/src/Kernel.cpp
    openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
    openmp/libomptarget/DeviceRTL/src/State.cpp
    openmp/libomptarget/DeviceRTL/src/Synchronization.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 7097056dfe5c..e33f37a659af 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -27,6 +27,21 @@ void warp(LaneMaskTy Mask);
 /// Synchronize all threads in a block.
 void threads();
 
+/// Synchronizing threads is allowed even if they all hit different instances of
+/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block, they are reaching the same
+/// instruction (hence all threads in the block are "aligned").
+__attribute__((noinline)) void threadsAligned();
+
+#pragma omp end assumes
+///}
+
 } // namespace synchronize
 
 namespace fence {

diff  --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index f834754ccc56..d47fa03c367e 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -69,7 +69,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
   const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
   if (IsSPMD) {
     inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threads();
+    synchronize::threadsAligned();
   } else {
     inititializeRuntime(/* IsSPMD */ false);
     // No need to wait since only the main threads will execute user

diff  --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 91309b8a013f..81af773d014a 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -93,18 +93,36 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
 
   uint32_t NumThreads = determineNumberOfThreads(num_threads);
   if (mapping::isSPMDMode()) {
-    synchronize::threads();
+    // Avoid the race between the read of the `icv::Level` above and the write
+    // below by synchronizing all threads here.
+    synchronize::threadsAligned();
     {
+      // Note that the order here is important. `icv::Level` has to be updated
+      // last or the other updates will cause a thread specific state to be
+      // created.
       state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                             1u, TId == 0);
       state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
       state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);
-      synchronize::threads();
+
+      // Synchronize all threads after the main thread (TId == 0) set up the
+      // team state properly.
+      synchronize::threadsAligned();
+
+      ASSERT(state::ParallelTeamSize == NumThreads);
+      ASSERT(icv::ActiveLevel == 1u);
+      ASSERT(icv::Level == 1u);
 
       if (TId < NumThreads)
         invokeMicrotask(TId, 0, fn, args, nargs);
-      synchronize::threads();
+
+      // Synchronize all threads at the end of a parallel region.
+      synchronize::threadsAligned();
     }
+
+    ASSERT(state::ParallelTeamSize == 1u);
+    ASSERT(icv::ActiveLevel == 0u);
+    ASSERT(icv::Level == 0u);
     return;
   }
 
@@ -130,6 +148,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   }
 
   {
+    // Note that the order here is important. `icv::Level` has to be updated
+    // last or the other updates will cause a thread specific state to be
+    // created.
     state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
                                           1u, true);
     state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,

diff  --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index ae6a93e7927b..f39b61c66930 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -41,8 +41,8 @@ namespace {
 ///{
 
 extern "C" {
-void *malloc(uint64_t Size);
-void free(void *Ptr);
+__attribute__((leaf)) void *malloc(uint64_t Size);
+__attribute__((leaf)) void free(void *Ptr);
 }
 
 ///}

diff  --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 17a91de97d28..c77e766ae6ca 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -132,6 +132,8 @@ void syncWarp(__kmpc_impl_lanemask_t) {
 
 void syncThreads() { __builtin_amdgcn_s_barrier(); }
 
+void syncThreadsAligned() { syncThreads(); }
+
 void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
 
 void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
@@ -179,6 +181,8 @@ void syncThreads() {
   asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
 }
 
+void syncThreadsAligned() { __syncthreads(); }
+
 constexpr uint32_t OMP_SPIN = 1000;
 constexpr uint32_t UNSET = 0;
 constexpr uint32_t SET = 1;
@@ -227,6 +231,8 @@ void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
 
 void synchronize::threads() { impl::syncThreads(); }
 
+void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
+
 void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
 
 void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
@@ -238,7 +244,7 @@ uint32_t atomic::load(uint32_t *Addr, int Ordering) {
 }
 
 void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
-   impl::atomicStore(Addr, V, Ordering);
+  impl::atomicStore(Addr, V, Ordering);
 }
 
 uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
@@ -275,7 +281,7 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
 
 __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
                                                           int32_t TId) {
-  synchronize::threads();
+  synchronize::threadsAligned();
 }
 
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {


        


More information about the Openmp-commits mailing list