[Openmp-commits] [openmp] ead2d86 - Revert "[OpenMP] Ensure memory fences are created with barriers for AMDGPUs"

Fri Mar 24 19:10:11 PDT 2023

Author: Ye Luo
Date: 2023-03-24T21:10:03-05:00
New Revision: ead2d86ee9b14844897714c68516d242a42e5b68

URL: https://github.com/llvm/llvm-project/commit/ead2d86ee9b14844897714c68516d242a42e5b68
DIFF: https://github.com/llvm/llvm-project/commit/ead2d86ee9b14844897714c68516d242a42e5b68.diff

LOG: Revert "[OpenMP] Ensure memory fences are created with barriers for AMDGPUs"

This reverts commit 36d6217c4eb02c15168bf74c9f7ef44ea4fb7e41.

Added: 
    

Modified: 
    openmp/libomptarget/DeviceRTL/include/Synchronization.h
    openmp/libomptarget/DeviceRTL/src/Kernel.cpp
    openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
    openmp/libomptarget/DeviceRTL/src/Synchronization.cpp

Removed: 
    openmp/libomptarget/test/offloading/barrier_fence.c


################################################################################
diff  --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 130578ed43020..4b8068f9e4267 100644

--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -16,6 +16,34 @@
 
 namespace ompx {
 
+namespace synchronize {
+
+/// Initialize the synchronization machinery. Must be called by all threads.
+void init(bool IsSPMD);
+
+/// Synchronize all threads in a warp identified by \p Mask.
+void warp(LaneMaskTy Mask);
+
+/// Synchronize all threads in a block.
+void threads();
+
+/// Synchronizing threads is allowed even if they all hit 
diff erent instances of
+/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block, they are are reaching the same
+/// instruction (hence all threads in the block are "aligned").
+__attribute__((noinline)) void threadsAligned();
+
+#pragma omp end assumes
+///}
+
+} // namespace synchronize
+
 namespace atomic {
 
 enum OrderingTy {
@@ -83,38 +111,6 @@ ATOMIC_FP_OP(double)
 
 } // namespace atomic
 
-namespace synchronize {
-
-/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
-
-/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
-
-/// Synchronize all threads in a block and perform a fence before and after the
-/// barrier according to \p Ordering. Note that the fence might be part of the
-/// barrier.
-void threads(atomic::OrderingTy Ordering);
-
-/// Synchronizing threads is allowed even if they all hit 
diff erent instances of
-/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
-/// restrictive in that it requires all threads to hit the same instance. The
-/// noinline is removed by the openmp-opt pass and helps to preserve the
-/// information till then.
-///{
-#pragma omp begin assumes ext_aligned_barrier
-
-/// Synchronize all threads in a block, they are reaching the same instruction
-/// (hence all threads in the block are "aligned"). Also perform a fence before
-/// and after the barrier according to \p Ordering. Note that the
-/// fence might be part of the barrier if the target offers this.
-__attribute__((noinline)) void threadsAligned(atomic::OrderingTy Ordering);
-
-#pragma omp end assumes
-///}
-
-} // namespace synchronize
-
 namespace fence {
 
 /// Memory fence with \p Ordering semantics for the team.

diff  --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index fa615789c05cb..c88aacbf6e432 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -40,7 +40,7 @@ static void genericStateMachine(IdentTy *Ident) {
     ParallelRegionFnTy WorkFn = nullptr;
 
     // Wait for the signal that we have a new work function.
-    synchronize::threads(atomic::seq_cst);
+    synchronize::threads();
 
     // Retrieve the work function from the runtime.
     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
@@ -56,7 +56,7 @@ static void genericStateMachine(IdentTy *Ident) {
       __kmpc_kernel_end_parallel();
     }
 
-    synchronize::threads(atomic::seq_cst);
+    synchronize::threads();
 
   } while (true);
 }
@@ -74,7 +74,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
       Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
   if (IsSPMD) {
     inititializeRuntime(/* IsSPMD */ true);
-    synchronize::threadsAligned(atomic::relaxed);
+    synchronize::threadsAligned();
   } else {
     inititializeRuntime(/* IsSPMD */ false);
     // No need to wait since only the main threads will execute user
@@ -83,10 +83,6 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
 
   if (IsSPMD) {
     state::assumeInitialState(IsSPMD);
-
-    // Synchronize to ensure the assertions above are in an aligned region.
-    // The barrier is eliminated later.
-    synchronize::threadsAligned(atomic::relaxed);
     return -1;
   }
 
@@ -136,11 +132,7 @@ void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) {
   FunctionTracingRAII();
   const bool IsSPMD =
       Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
-
-  synchronize::threadsAligned(atomic::acq_rel);
   state::assumeInitialState(IsSPMD);
-  synchronize::threadsAligned(atomic::relaxed);
-
   if (IsSPMD)
     return;
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index d32dd7e4f9980..d2fee11236302 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -113,7 +113,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
   if (mapping::isSPMDMode()) {
     // Avoid the race between the read of the `icv::Level` above and the write
     // below by synchronizing all threads here.
-    synchronize::threadsAligned(atomic::seq_cst);
+    synchronize::threadsAligned();
     {
       // Note that the order here is important. `icv::Level` has to be updated
       // last or the other updates will cause a thread specific state to be
@@ -128,36 +128,28 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
 
       // Synchronize all threads after the main thread (TId == 0) set up the
       // team state properly.
-      synchronize::threadsAligned(atomic::acq_rel);
+      synchronize::threadsAligned();
 
       state::ParallelTeamSize.assert_eq(NumThreads, ident,
                                         /* ForceTeamState */ true);
       icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
       icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);
 
-      // Ensure we synchronize before we run user code to avoid invalidating the
-      // assumptions above.
-      synchronize::threadsAligned(atomic::relaxed);
-
       if (TId < NumThreads)
         invokeMicrotask(TId, 0, fn, args, nargs);
 
       // Synchronize all threads at the end of a parallel region.
-      synchronize::threadsAligned(atomic::seq_cst);
+      synchronize::threadsAligned();
     }
 
     // Synchronize all threads to make sure every thread exits the scope above;
     // otherwise the following assertions and the assumption in
     // __kmpc_target_deinit may not hold.
-    synchronize::threadsAligned(atomic::acq_rel);
+    synchronize::threadsAligned();
 
     state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
     icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
     icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
-
-    // Ensure we synchronize to create an aligned region around the assumptions.
-    synchronize::threadsAligned(atomic::relaxed);
-
     return;
   }
 
@@ -251,9 +243,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
                                /* ForceTeamState */ true);
 
     // Master signals work to activate workers.
-    synchronize::threads(atomic::seq_cst);
+    synchronize::threads();
     // Master waits for workers to signal.
-    synchronize::threads(atomic::seq_cst);
+    synchronize::threads();
   }
 
   if (nargs)

diff  --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 5d106a1a4dcf3..90d03dd490b24 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -123,8 +123,8 @@ void fenceTeam(atomic::OrderingTy Ordering);
 void fenceKernel(atomic::OrderingTy Ordering);
 void fenceSystem(atomic::OrderingTy Ordering);
 void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads(atomic::OrderingTy Ordering);
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
+void syncThreads();
+void syncThreadsAligned() { syncThreads(); }
 void unsetLock(omp_lock_t *);
 int testLock(omp_lock_t *);
 void initLock(omp_lock_t *);
@@ -261,16 +261,8 @@ void syncWarp(__kmpc_impl_lanemask_t) {
   // AMDGCN doesn't need to sync threads in a warp
 }
 
-void syncThreads(atomic::OrderingTy Ordering) {
-  if (Ordering != atomic::relaxed)
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
-
-  __builtin_amdgcn_s_barrier();
-
-  if (Ordering != atomic::relaxed)
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
-}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
+void syncThreads() { __builtin_amdgcn_s_barrier(); }
+void syncThreadsAligned() { syncThreads(); }
 
 // TODO: Don't have wavefront lane locks. Possibly can't have them.
 void unsetLock(omp_lock_t *) { __builtin_trap(); }
@@ -335,12 +327,12 @@ void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
 
 void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
 
-void syncThreads(atomic::OrderingTy Ordering) {
+void syncThreads() {
   constexpr int BarrierNo = 8;
   asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
 }
 
-void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
+void syncThreadsAligned() { __syncthreads(); }
 
 constexpr uint32_t OMP_SPIN = 1000;
 constexpr uint32_t UNSET = 0;
@@ -389,13 +381,9 @@ void synchronize::init(bool IsSPMD) {
 
 void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
 
-void synchronize::threads(atomic::OrderingTy Ordering) {
-  impl::syncThreads(Ordering);
-}
+void synchronize::threads() { impl::syncThreads(); }
 
-void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
-  impl::syncThreadsAligned(Ordering);
-}
+void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
 
 void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
 
@@ -516,13 +504,13 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
 __attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
                                                           int32_t TId) {
   FunctionTracingRAII();
-  synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
+  synchronize::threadsAligned();
 }
 
 __attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc,
                                                              int32_t TId) {
   FunctionTracingRAII();
-  synchronize::threads(atomic::OrderingTy::seq_cst);
+  synchronize::threads();
 }
 
 int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {

diff  --git a/openmp/libomptarget/test/offloading/barrier_fence.c b/openmp/libomptarget/test/offloading/barrier_fence.c
deleted file mode 100644
index cf796b4301489..0000000000000
--- a/openmp/libomptarget/test/offloading/barrier_fence.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-offload-mandatory -O3
-// RUN: %libomptarget-run-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-struct IdentTy;
-void __kmpc_barrier_simple_spmd(struct IdentTy *Loc, int32_t TId);
-void __kmpc_barrier_simple_generic(struct IdentTy *Loc, int32_t TId);
-
-#pragma omp begin declare target device_type(nohost)
-static int A[512] __attribute__((address_space(3), loader_uninitialized));
-static int B[512 * 32] __attribute__((loader_uninitialized));
-#pragma omp end declare target
-
-int main() {
-  printf("Testing simple spmd barrier\n");
-  for (int r = 0; r < 50; r++) {
-#pragma omp target teams distribute thread_limit(512) num_teams(440)
-    for (int j = 0; j < 512 * 32; ++j) {
-#pragma omp parallel firstprivate(j)
-      {
-        int TId = omp_get_thread_num();
-        int TeamId = omp_get_team_num();
-        int NT = omp_get_num_threads();
-        // Sequential
-        for (int i = 0; i < NT; ++i) {
-          // Test shared memory globals
-          if (TId == i)
-            A[i] = i + j;
-          __kmpc_barrier_simple_spmd(0, TId);
-          if (A[i] != i + j)
-            __builtin_trap();
-          __kmpc_barrier_simple_spmd(0, TId);
-          // Test generic globals
-          if (TId == i)
-            B[TeamId] = i;
-          __kmpc_barrier_simple_spmd(0, TId);
-          if (B[TeamId] != i)
-            __builtin_trap();
-          __kmpc_barrier_simple_spmd(0, TId);
-        }
-      }
-    }
-  }
-
-  printf("Testing simple generic barrier\n");
-  for (int r = 0; r < 50; r++) {
-#pragma omp target teams distribute thread_limit(512) num_teams(440)
-    for (int j = 0; j < 512 * 32; ++j) {
-#pragma omp parallel firstprivate(j)
-      {
-        int TId = omp_get_thread_num();
-        int TeamId = omp_get_team_num();
-        int NT = omp_get_num_threads();
-        // Sequential
-        for (int i = 0; i < NT; ++i) {
-          if (TId == i)
-            A[i] = i + j;
-          __kmpc_barrier_simple_generic(0, TId);
-          if (A[i] != i + j)
-            __builtin_trap();
-          __kmpc_barrier_simple_generic(0, TId);
-          if (TId == i)
-            B[TeamId] = i;
-          __kmpc_barrier_simple_generic(0, TId);
-          if (B[TeamId] != i)
-            __builtin_trap();
-          __kmpc_barrier_simple_generic(0, TId);
-        }
-      }
-    }
-  }
-  return 0;
-}