[Openmp-commits] [openmp] ead2d86 - Revert "[OpenMP] Ensure memory fences are created with barriers for AMDGPUs"
Ye Luo via Openmp-commits
openmp-commits at lists.llvm.org
Fri Mar 24 19:10:11 PDT 2023
Author: Ye Luo
Date: 2023-03-24T21:10:03-05:00
New Revision: ead2d86ee9b14844897714c68516d242a42e5b68
URL: https://github.com/llvm/llvm-project/commit/ead2d86ee9b14844897714c68516d242a42e5b68
DIFF: https://github.com/llvm/llvm-project/commit/ead2d86ee9b14844897714c68516d242a42e5b68.diff
LOG: Revert "[OpenMP] Ensure memory fences are created with barriers for AMDGPUs"
This reverts commit 36d6217c4eb02c15168bf74c9f7ef44ea4fb7e41.
Added:
Modified:
openmp/libomptarget/DeviceRTL/include/Synchronization.h
openmp/libomptarget/DeviceRTL/src/Kernel.cpp
openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
Removed:
openmp/libomptarget/test/offloading/barrier_fence.c
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 130578ed43020..4b8068f9e4267 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -16,6 +16,34 @@
namespace ompx {
+namespace synchronize {
+
+/// Initialize the synchronization machinery. Must be called by all threads.
+void init(bool IsSPMD);
+
+/// Synchronize all threads in a warp identified by \p Mask.
+void warp(LaneMaskTy Mask);
+
+/// Synchronize all threads in a block.
+void threads();
+
+/// Synchronizing threads is allowed even if they all hit
diff erent instances of
+/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block, they are are reaching the same
+/// instruction (hence all threads in the block are "aligned").
+__attribute__((noinline)) void threadsAligned();
+
+#pragma omp end assumes
+///}
+
+} // namespace synchronize
+
namespace atomic {
enum OrderingTy {
@@ -83,38 +111,6 @@ ATOMIC_FP_OP(double)
} // namespace atomic
-namespace synchronize {
-
-/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
-
-/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
-
-/// Synchronize all threads in a block and perform a fence before and after the
-/// barrier according to \p Ordering. Note that the fence might be part of the
-/// barrier.
-void threads(atomic::OrderingTy Ordering);
-
-/// Synchronizing threads is allowed even if they all hit
diff erent instances of
-/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
-/// restrictive in that it requires all threads to hit the same instance. The
-/// noinline is removed by the openmp-opt pass and helps to preserve the
-/// information till then.
-///{
-#pragma omp begin assumes ext_aligned_barrier
-
-/// Synchronize all threads in a block, they are reaching the same instruction
-/// (hence all threads in the block are "aligned"). Also perform a fence before
-/// and after the barrier according to \p Ordering. Note that the
-/// fence might be part of the barrier if the target offers this.
-__attribute__((noinline)) void threadsAligned(atomic::OrderingTy Ordering);
-
-#pragma omp end assumes
-///}
-
-} // namespace synchronize
-
namespace fence {
/// Memory fence with \p Ordering semantics for the team.
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index fa615789c05cb..c88aacbf6e432 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -40,7 +40,7 @@ static void genericStateMachine(IdentTy *Ident) {
ParallelRegionFnTy WorkFn = nullptr;
// Wait for the signal that we have a new work function.
- synchronize::threads(atomic::seq_cst);
+ synchronize::threads();
// Retrieve the work function from the runtime.
bool IsActive = __kmpc_kernel_parallel(&WorkFn);
@@ -56,7 +56,7 @@ static void genericStateMachine(IdentTy *Ident) {
__kmpc_kernel_end_parallel();
}
- synchronize::threads(atomic::seq_cst);
+ synchronize::threads();
} while (true);
}
@@ -74,7 +74,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
if (IsSPMD) {
inititializeRuntime(/* IsSPMD */ true);
- synchronize::threadsAligned(atomic::relaxed);
+ synchronize::threadsAligned();
} else {
inititializeRuntime(/* IsSPMD */ false);
// No need to wait since only the main threads will execute user
@@ -83,10 +83,6 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
if (IsSPMD) {
state::assumeInitialState(IsSPMD);
-
- // Synchronize to ensure the assertions above are in an aligned region.
- // The barrier is eliminated later.
- synchronize::threadsAligned(atomic::relaxed);
return -1;
}
@@ -136,11 +132,7 @@ void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) {
FunctionTracingRAII();
const bool IsSPMD =
Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
-
- synchronize::threadsAligned(atomic::acq_rel);
state::assumeInitialState(IsSPMD);
- synchronize::threadsAligned(atomic::relaxed);
-
if (IsSPMD)
return;
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index d32dd7e4f9980..d2fee11236302 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -113,7 +113,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
if (mapping::isSPMDMode()) {
// Avoid the race between the read of the `icv::Level` above and the write
// below by synchronizing all threads here.
- synchronize::threadsAligned(atomic::seq_cst);
+ synchronize::threadsAligned();
{
// Note that the order here is important. `icv::Level` has to be updated
// last or the other updates will cause a thread specific state to be
@@ -128,36 +128,28 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
// Synchronize all threads after the main thread (TId == 0) set up the
// team state properly.
- synchronize::threadsAligned(atomic::acq_rel);
+ synchronize::threadsAligned();
state::ParallelTeamSize.assert_eq(NumThreads, ident,
/* ForceTeamState */ true);
icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);
- // Ensure we synchronize before we run user code to avoid invalidating the
- // assumptions above.
- synchronize::threadsAligned(atomic::relaxed);
-
if (TId < NumThreads)
invokeMicrotask(TId, 0, fn, args, nargs);
// Synchronize all threads at the end of a parallel region.
- synchronize::threadsAligned(atomic::seq_cst);
+ synchronize::threadsAligned();
}
// Synchronize all threads to make sure every thread exits the scope above;
// otherwise the following assertions and the assumption in
// __kmpc_target_deinit may not hold.
- synchronize::threadsAligned(atomic::acq_rel);
+ synchronize::threadsAligned();
state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
-
- // Ensure we synchronize to create an aligned region around the assumptions.
- synchronize::threadsAligned(atomic::relaxed);
-
return;
}
@@ -251,9 +243,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
/* ForceTeamState */ true);
// Master signals work to activate workers.
- synchronize::threads(atomic::seq_cst);
+ synchronize::threads();
// Master waits for workers to signal.
- synchronize::threads(atomic::seq_cst);
+ synchronize::threads();
}
if (nargs)
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 5d106a1a4dcf3..90d03dd490b24 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -123,8 +123,8 @@ void fenceTeam(atomic::OrderingTy Ordering);
void fenceKernel(atomic::OrderingTy Ordering);
void fenceSystem(atomic::OrderingTy Ordering);
void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads(atomic::OrderingTy Ordering);
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
+void syncThreads();
+void syncThreadsAligned() { syncThreads(); }
void unsetLock(omp_lock_t *);
int testLock(omp_lock_t *);
void initLock(omp_lock_t *);
@@ -261,16 +261,8 @@ void syncWarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp
}
-void syncThreads(atomic::OrderingTy Ordering) {
- if (Ordering != atomic::relaxed)
- fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
-
- __builtin_amdgcn_s_barrier();
-
- if (Ordering != atomic::relaxed)
- fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
-}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
+void syncThreads() { __builtin_amdgcn_s_barrier(); }
+void syncThreadsAligned() { syncThreads(); }
// TODO: Don't have wavefront lane locks. Possibly can't have them.
void unsetLock(omp_lock_t *) { __builtin_trap(); }
@@ -335,12 +327,12 @@ void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
-void syncThreads(atomic::OrderingTy Ordering) {
+void syncThreads() {
constexpr int BarrierNo = 8;
asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
}
-void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
+void syncThreadsAligned() { __syncthreads(); }
constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
@@ -389,13 +381,9 @@ void synchronize::init(bool IsSPMD) {
void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
-void synchronize::threads(atomic::OrderingTy Ordering) {
- impl::syncThreads(Ordering);
-}
+void synchronize::threads() { impl::syncThreads(); }
-void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
- impl::syncThreadsAligned(Ordering);
-}
+void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
@@ -516,13 +504,13 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
int32_t TId) {
FunctionTracingRAII();
- synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
+ synchronize::threadsAligned();
}
__attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc,
int32_t TId) {
FunctionTracingRAII();
- synchronize::threads(atomic::OrderingTy::seq_cst);
+ synchronize::threads();
}
int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
diff --git a/openmp/libomptarget/test/offloading/barrier_fence.c b/openmp/libomptarget/test/offloading/barrier_fence.c
deleted file mode 100644
index cf796b4301489..0000000000000
--- a/openmp/libomptarget/test/offloading/barrier_fence.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// RUN: %libomptarget-compile-generic -fopenmp-offload-mandatory -O3
-// RUN: %libomptarget-run-generic
-
-#include <omp.h>
-#include <stdio.h>
-
-struct IdentTy;
-void __kmpc_barrier_simple_spmd(struct IdentTy *Loc, int32_t TId);
-void __kmpc_barrier_simple_generic(struct IdentTy *Loc, int32_t TId);
-
-#pragma omp begin declare target device_type(nohost)
-static int A[512] __attribute__((address_space(3), loader_uninitialized));
-static int B[512 * 32] __attribute__((loader_uninitialized));
-#pragma omp end declare target
-
-int main() {
- printf("Testing simple spmd barrier\n");
- for (int r = 0; r < 50; r++) {
-#pragma omp target teams distribute thread_limit(512) num_teams(440)
- for (int j = 0; j < 512 * 32; ++j) {
-#pragma omp parallel firstprivate(j)
- {
- int TId = omp_get_thread_num();
- int TeamId = omp_get_team_num();
- int NT = omp_get_num_threads();
- // Sequential
- for (int i = 0; i < NT; ++i) {
- // Test shared memory globals
- if (TId == i)
- A[i] = i + j;
- __kmpc_barrier_simple_spmd(0, TId);
- if (A[i] != i + j)
- __builtin_trap();
- __kmpc_barrier_simple_spmd(0, TId);
- // Test generic globals
- if (TId == i)
- B[TeamId] = i;
- __kmpc_barrier_simple_spmd(0, TId);
- if (B[TeamId] != i)
- __builtin_trap();
- __kmpc_barrier_simple_spmd(0, TId);
- }
- }
- }
- }
-
- printf("Testing simple generic barrier\n");
- for (int r = 0; r < 50; r++) {
-#pragma omp target teams distribute thread_limit(512) num_teams(440)
- for (int j = 0; j < 512 * 32; ++j) {
-#pragma omp parallel firstprivate(j)
- {
- int TId = omp_get_thread_num();
- int TeamId = omp_get_team_num();
- int NT = omp_get_num_threads();
- // Sequential
- for (int i = 0; i < NT; ++i) {
- if (TId == i)
- A[i] = i + j;
- __kmpc_barrier_simple_generic(0, TId);
- if (A[i] != i + j)
- __builtin_trap();
- __kmpc_barrier_simple_generic(0, TId);
- if (TId == i)
- B[TeamId] = i;
- __kmpc_barrier_simple_generic(0, TId);
- if (B[TeamId] != i)
- __builtin_trap();
- __kmpc_barrier_simple_generic(0, TId);
- }
- }
- }
- }
- return 0;
-}
More information about the Openmp-commits
mailing list