[Openmp-commits] [openmp] 67fed13 - [OpenMP] Ensure memory fences are created with barriers for AMDGPUs

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Mon Apr 17 15:28:20 PDT 2023


Author: Johannes Doerfert
Date: 2023-04-17T15:27:17-07:00
New Revision: 67fed132f39c81e8006c4463ab1f173fea5e4e4b

URL: https://github.com/llvm/llvm-project/commit/67fed132f39c81e8006c4463ab1f173fea5e4e4b
DIFF: https://github.com/llvm/llvm-project/commit/67fed132f39c81e8006c4463ab1f173fea5e4e4b.diff

LOG: [OpenMP] Ensure memory fences are created with barriers for AMDGPUs

It turns out that __builtin_amdgcn_s_barrier() alone does not emit a
memory fence. We got away with this so far and assumed it would work,
as the equivalent NVIDIA path, where we just emit a __syncthreads, is
(hopefully) correct. After talking to @arsenm we now (mostly) align
with the OpenCL barrier implementation [1] and emit explicit fences
around the barrier for AMDGPUs.

It seems this was the underlying cause of #59759, but I am not 100%
certain; there is a chance this simply hides the problem.

Fixes: https://github.com/llvm/llvm-project/issues/59759

[1] https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/07b347366eb2c6ebc3414af323c623cbbbafc854/opencl/src/workgroup/wgbarrier.cl#L21
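
For illustration only (this sketch is not part of the commit), the
pattern the AMDGPU path now follows mirrors the OpenCL work-group
barrier [1]: a release fence publishes prior stores, s_barrier
synchronizes execution, and an acquire fence makes other threads'
stores visible afterwards. The names below (Flag, producerConsumer)
are hypothetical; the builtins are clang's AMDGPU intrinsics, the
same ones the fenceTeam and syncThreads implementations use:

    // Illustrative device code, assuming clang targeting amdgcn.
    static int Flag __attribute__((address_space(3))); // LDS (shared) memory

    void producerConsumer(int TId) {
      if (TId == 0)
        Flag = 42;                                // producer store
      // Publish the store to the work-group before waiting ...
      __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
      __builtin_amdgcn_s_barrier();               // execution sync only
      // ... and pick up stores from other threads after the barrier.
      __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
      if (Flag != 42)                             // guaranteed to hold now
        __builtin_trap();
    }

This matches the acq_rel case in the new syncThreads; the stronger
orderings use seq_cst fences on both sides instead.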

Added: 
    openmp/libomptarget/test/offloading/barrier_fence.c

Modified: 
    openmp/libomptarget/DeviceRTL/include/Synchronization.h
    openmp/libomptarget/DeviceRTL/src/Kernel.cpp
    openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
    openmp/libomptarget/DeviceRTL/src/Synchronization.cpp

Removed: 
    


################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 4b8068f9e4267..130578ed43020 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -16,34 +16,6 @@
 
 namespace ompx {
 
-namespace synchronize {
-
-/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
-
-/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
-
-/// Synchronize all threads in a block.
-void threads();
-
-/// Synchronizing threads is allowed even if they all hit different instances of
-/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
-/// restrictive in that it requires all threads to hit the same instance. The
-/// noinline is removed by the openmp-opt pass and helps to preserve the
-/// information till then.
-///{
-#pragma omp begin assumes ext_aligned_barrier
-
-/// Synchronize all threads in a block, they are are reaching the same
-/// instruction (hence all threads in the block are "aligned").
-__attribute__((noinline)) void threadsAligned();
-
-#pragma omp end assumes
-///}
-
-} // namespace synchronize
-
 namespace atomic {
 
 enum OrderingTy {
@@ -111,6 +83,38 @@ ATOMIC_FP_OP(double)
 
 } // namespace atomic
 
+namespace synchronize {
+
+/// Initialize the synchronization machinery. Must be called by all threads.
+void init(bool IsSPMD);
+
+/// Synchronize all threads in a warp identified by \p Mask.
+void warp(LaneMaskTy Mask);
+
+/// Synchronize all threads in a block and perform a fence before and after the
+/// barrier according to \p Ordering. Note that the fence might be part of the
+/// barrier.
+void threads(atomic::OrderingTy Ordering);
+
+/// Synchronizing threads is allowed even if they all hit different instances of
+/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block, they are reaching the same instruction
+/// (hence all threads in the block are "aligned"). Also perform a fence before
+/// and after the barrier according to \p Ordering. Note that the
+/// fence might be part of the barrier if the target offers this.
+__attribute__((noinline)) void threadsAligned(atomic::OrderingTy Ordering);
+
+#pragma omp end assumes
+///}
+
+} // namespace synchronize
+
 namespace fence {
 
 /// Memory fence with \p Ordering semantics for the team.

diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index c88aacbf6e432..fa774afe469b7 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -40,7 +40,7 @@ static void genericStateMachine(IdentTy *Ident) {
     ParallelRegionFnTy WorkFn = nullptr;
 
     // Wait for the signal that we have a new work function.
-    synchronize::threads();
+    synchronize::threads(atomic::seq_cst);
 
     // Retrieve the work function from the runtime.
     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
@@ -56,7 +56,7 @@ static void genericStateMachine(IdentTy *Ident) {
       __kmpc_kernel_end_parallel();
     }
 
-    synchronize::threads();
+    synchronize::threads(atomic::seq_cst);
 
   } while (true);
 }
@@ -74,7 +74,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
       Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
   if (IsSPMD) {
     inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threadsAligned();
+    synchronize::threadsAligned(atomic::relaxed);
   } else {
     inititializeRuntime(/* IsSPMD */ false);
     // No need to wait since only the main threads will execute user
@@ -83,6 +83,10 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
 
   if (IsSPMD) {
     state::assumeInitialState(IsSPMD);
+
+    // Synchronize to ensure the assertions above are in an aligned region.
+    // The barrier is eliminated later.
+    synchronize::threadsAligned(atomic::relaxed);
     return -1;
   }
 
@@ -132,7 +136,7 @@ void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) {
   FunctionTracingRAII();
   const bool IsSPMD =
       Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
-  state::assumeInitialState(IsSPMD);
+
   if (IsSPMD)
     return;
 

diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index d2fee11236302..d32dd7e4f9980 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -113,7 +113,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   if (mapping::isSPMDMode()) {
     // Avoid the race between the read of the `icv::Level` above and the write
     // below by synchronizing all threads here.
-    synchronize::threadsAligned();
+    synchronize::threadsAligned(atomic::seq_cst);
     {
       // Note that the order here is important. `icv::Level` has to be updated
       // last or the other updates will cause a thread specific state to be
@@ -128,28 +128,36 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
 
       // Synchronize all threads after the main thread (TId == 0) set up the
       // team state properly.
-      synchronize::threadsAligned();
+      synchronize::threadsAligned(atomic::acq_rel);
 
       state::ParallelTeamSize.assert_eq(NumThreads, ident,
                                         /* ForceTeamState */ true);
       icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
       icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);
 
+      // Ensure we synchronize before we run user code to avoid invalidating the
+      // assumptions above.
+      synchronize::threadsAligned(atomic::relaxed);
+
       if (TId < NumThreads)
         invokeMicrotask(TId, 0, fn, args, nargs);
 
       // Synchronize all threads at the end of a parallel region.
-      synchronize::threadsAligned();
+      synchronize::threadsAligned(atomic::seq_cst);
     }
 
     // Synchronize all threads to make sure every thread exits the scope above;
     // otherwise the following assertions and the assumption in
     // __kmpc_target_deinit may not hold.
-    synchronize::threadsAligned();
+    synchronize::threadsAligned(atomic::acq_rel);
 
     state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
     icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
     icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
+
+    // Ensure we synchronize to create an aligned region around the assumptions.
+    synchronize::threadsAligned(atomic::relaxed);
+
     return;
   }
 
@@ -243,9 +251,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                                /* ForceTeamState */ true);
 
     // Master signals work to activate workers.
-    synchronize::threads();
+    synchronize::threads(atomic::seq_cst);
     // Master waits for workers to signal.
-    synchronize::threads();
+    synchronize::threads(atomic::seq_cst);
   }
 
   if (nargs)

diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index eddf37f851e73..babdf8ceeae78 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -123,8 +123,8 @@ void fenceTeam(atomic::OrderingTy Ordering);
 void fenceKernel(atomic::OrderingTy Ordering);
 void fenceSystem(atomic::OrderingTy Ordering);
 void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads();
-void syncThreadsAligned() { syncThreads(); }
+void syncThreads(atomic::OrderingTy Ordering);
+void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
 void unsetLock(omp_lock_t *);
 int testLock(omp_lock_t *);
 void initLock(omp_lock_t *);
@@ -261,8 +261,16 @@ void syncWarp(__kmpc_impl_lanemask_t) {
   // AMDGCN doesn't need to sync threads in a warp
 }
 
-void syncThreads() { __builtin_amdgcn_s_barrier(); }
-void syncThreadsAligned() { syncThreads(); }
+void syncThreads(atomic::OrderingTy Ordering) {
+  if (Ordering != atomic::relaxed)
+    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
+
+  __builtin_amdgcn_s_barrier();
+
+  if (Ordering != atomic::relaxed)
+    fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
+}
+void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
 
 // TODO: Don't have wavefront lane locks. Possibly can't have them.
 void unsetLock(omp_lock_t *) { __builtin_trap(); }
@@ -327,12 +335,12 @@ void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
 
 void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
 
-void syncThreads() {
+void syncThreads(atomic::OrderingTy Ordering) {
   constexpr int BarrierNo = 8;
   asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
 }
 
-void syncThreadsAligned() { __syncthreads(); }
+void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
 
 constexpr uint32_t OMP_SPIN = 1000;
 constexpr uint32_t UNSET = 0;
@@ -381,9 +389,13 @@ void synchronize::init(bool IsSPMD) {
 
 void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
 
-void synchronize::threads() { impl::syncThreads(); }
+void synchronize::threads(atomic::OrderingTy Ordering) {
+  impl::syncThreads(Ordering);
+}
 
-void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
+void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
+  impl::syncThreadsAligned(Ordering);
+}
 
 void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
 
@@ -504,13 +516,13 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
 __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
                                                           int32_t TId) {
   FunctionTracingRAII();
-  synchronize::threadsAligned();
+  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
 }
 
 __attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc,
                                                              int32_t TId) {
   FunctionTracingRAII();
-  synchronize::threads();
+  synchronize::threads(atomic::OrderingTy::seq_cst);
 }
 
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {

diff --git a/openmp/libomptarget/test/offloading/barrier_fence.c b/openmp/libomptarget/test/offloading/barrier_fence.c
new file mode 100644
index 0000000000000..23b8006a27722
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/barrier_fence.c
@@ -0,0 +1,79 @@
+// RUN: %libomptarget-compile-generic -fopenmp-offload-mandatory -O3
+// RUN: %libomptarget-run-generic
+
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <omp.h>
+#include <stdint.h>
+#include <stdio.h>
+
+struct IdentTy;
+void __kmpc_barrier_simple_spmd(struct IdentTy *Loc, int32_t TId);
+void __kmpc_barrier_simple_generic(struct IdentTy *Loc, int32_t TId);
+
+#pragma omp begin declare target device_type(nohost)
+static int A[512] __attribute__((address_space(3), loader_uninitialized));
+static int B[512 * 32] __attribute__((loader_uninitialized));
+#pragma omp end declare target
+
+int main() {
+  printf("Testing simple spmd barrier\n");
+  for (int r = 0; r < 50; r++) {
+#pragma omp target teams distribute thread_limit(512) num_teams(440)
+    for (int j = 0; j < 512 * 32; ++j) {
+#pragma omp parallel firstprivate(j)
+      {
+        int TId = omp_get_thread_num();
+        int TeamId = omp_get_team_num();
+        int NT = omp_get_num_threads();
+        // Sequential
+        for (int i = 0; i < NT; ++i) {
+          // Test shared memory globals
+          if (TId == i)
+            A[i] = i + j;
+          __kmpc_barrier_simple_spmd(0, TId);
+          if (A[i] != i + j)
+            __builtin_trap();
+          __kmpc_barrier_simple_spmd(0, TId);
+          // Test generic globals
+          if (TId == i)
+            B[TeamId] = i;
+          __kmpc_barrier_simple_spmd(0, TId);
+          if (B[TeamId] != i)
+            __builtin_trap();
+          __kmpc_barrier_simple_spmd(0, TId);
+        }
+      }
+    }
+  }
+
+  printf("Testing simple generic barrier\n");
+  for (int r = 0; r < 50; r++) {
+#pragma omp target teams distribute thread_limit(512) num_teams(440)
+    for (int j = 0; j < 512 * 32; ++j) {
+#pragma omp parallel firstprivate(j)
+      {
+        int TId = omp_get_thread_num();
+        int TeamId = omp_get_team_num();
+        int NT = omp_get_num_threads();
+        // Sequential
+        for (int i = 0; i < NT; ++i) {
+          if (TId == i)
+            A[i] = i + j;
+          __kmpc_barrier_simple_generic(0, TId);
+          if (A[i] != i + j)
+            __builtin_trap();
+          __kmpc_barrier_simple_generic(0, TId);
+          if (TId == i)
+            B[TeamId] = i;
+          __kmpc_barrier_simple_generic(0, TId);
+          if (B[TeamId] != i)
+            __builtin_trap();
+          __kmpc_barrier_simple_generic(0, TId);
+        }
+      }
+    }
+  }
+  return 0;
+}

