[Openmp-commits] [openmp] 67fed13 - [OpenMP] Ensure memory fences are created with barriers for AMDGPUs
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Mon Apr 17 15:28:20 PDT 2023
Author: Johannes Doerfert
Date: 2023-04-17T15:27:17-07:00
New Revision: 67fed132f39c81e8006c4463ab1f173fea5e4e4b
URL: https://github.com/llvm/llvm-project/commit/67fed132f39c81e8006c4463ab1f173fea5e4e4b
DIFF: https://github.com/llvm/llvm-project/commit/67fed132f39c81e8006c4463ab1f173fea5e4e4b.diff
LOG: [OpenMP] Ensure memory fences are created with barriers for AMDGPUs
It turns out that the __builtin_amdgcn_s_barrier() alone does not emit
a fence. We somehow got away with this and assumed it would work as it
(hopefully) is correct on the NVIDIA path where we just emit a
__syncthreads. After talking to @arsenm we now (mostly) align with the
OpenCL barrier implementation [1] and emit explicit fences for AMDGPUs.
It seems this was the underlying cause for #59759, but I am not 100%
certain. There is a chance this simply hides the problem.
Fixes: https://github.com/llvm/llvm-project/issues/59759
[1] https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/07b347366eb2c6ebc3414af323c623cbbbafc854/opencl/src/workgroup/wgbarrier.cl#L21
Added:
openmp/libomptarget/test/offloading/barrier_fence.c
Modified:
openmp/libomptarget/DeviceRTL/include/Synchronization.h
openmp/libomptarget/DeviceRTL/src/Kernel.cpp
openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 4b8068f9e4267..130578ed43020 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -16,34 +16,6 @@
namespace ompx {
-namespace synchronize {
-
-/// Initialize the synchronization machinery. Must be called by all threads.
-void init(bool IsSPMD);
-
-/// Synchronize all threads in a warp identified by \p Mask.
-void warp(LaneMaskTy Mask);
-
-/// Synchronize all threads in a block.
-void threads();
-
-/// Synchronizing threads is allowed even if they all hit
diff erent instances of
-/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
-/// restrictive in that it requires all threads to hit the same instance. The
-/// noinline is removed by the openmp-opt pass and helps to preserve the
-/// information till then.
-///{
-#pragma omp begin assumes ext_aligned_barrier
-
-/// Synchronize all threads in a block, they are are reaching the same
-/// instruction (hence all threads in the block are "aligned").
-__attribute__((noinline)) void threadsAligned();
-
-#pragma omp end assumes
-///}
-
-} // namespace synchronize
-
namespace atomic {
enum OrderingTy {
@@ -111,6 +83,38 @@ ATOMIC_FP_OP(double)
} // namespace atomic
+namespace synchronize {
+
+/// Initialize the synchronization machinery. Must be called by all threads.
+void init(bool IsSPMD);
+
+/// Synchronize all threads in a warp identified by \p Mask.
+void warp(LaneMaskTy Mask);
+
+/// Synchronize all threads in a block and perform a fence before and after the
+/// barrier according to \p Ordering. Note that the fence might be part of the
+/// barrier.
+void threads(atomic::OrderingTy Ordering);
+
+/// Synchronizing threads is allowed even if they all hit
diff erent instances of
+/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block, they are reaching the same instruction
+/// (hence all threads in the block are "aligned"). Also perform a fence before
+/// and after the barrier according to \p Ordering. Note that the
+/// fence might be part of the barrier if the target offers this.
+__attribute__((noinline)) void threadsAligned(atomic::OrderingTy Ordering);
+
+#pragma omp end assumes
+///}
+
+} // namespace synchronize
+
namespace fence {
/// Memory fence with \p Ordering semantics for the team.
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index c88aacbf6e432..fa774afe469b7 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -40,7 +40,7 @@ static void genericStateMachine(IdentTy *Ident) {
ParallelRegionFnTy WorkFn = nullptr;
// Wait for the signal that we have a new work function.
- synchronize::threads();
+ synchronize::threads(atomic::seq_cst);
// Retrieve the work function from the runtime.
bool IsActive = __kmpc_kernel_parallel(&WorkFn);
@@ -56,7 +56,7 @@ static void genericStateMachine(IdentTy *Ident) {
__kmpc_kernel_end_parallel();
}
- synchronize::threads();
+ synchronize::threads(atomic::seq_cst);
} while (true);
}
@@ -74,7 +74,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
if (IsSPMD) {
inititializeRuntime(/* IsSPMD */ true);
- synchronize::threadsAligned();
+ synchronize::threadsAligned(atomic::relaxed);
} else {
inititializeRuntime(/* IsSPMD */ false);
// No need to wait since only the main threads will execute user
@@ -83,6 +83,10 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
if (IsSPMD) {
state::assumeInitialState(IsSPMD);
+
+ // Synchronize to ensure the assertions above are in an aligned region.
+ // The barrier is eliminated later.
+ synchronize::threadsAligned(atomic::relaxed);
return -1;
}
@@ -132,7 +136,7 @@ void __kmpc_target_deinit(IdentTy *Ident, int8_t Mode) {
FunctionTracingRAII();
const bool IsSPMD =
Mode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD;
- state::assumeInitialState(IsSPMD);
+
if (IsSPMD)
return;
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index d2fee11236302..d32dd7e4f9980 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -113,7 +113,7 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
if (mapping::isSPMDMode()) {
// Avoid the race between the read of the `icv::Level` above and the write
// below by synchronizing all threads here.
- synchronize::threadsAligned();
+ synchronize::threadsAligned(atomic::seq_cst);
{
// Note that the order here is important. `icv::Level` has to be updated
// last or the other updates will cause a thread specific state to be
@@ -128,28 +128,36 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
// Synchronize all threads after the main thread (TId == 0) set up the
// team state properly.
- synchronize::threadsAligned();
+ synchronize::threadsAligned(atomic::acq_rel);
state::ParallelTeamSize.assert_eq(NumThreads, ident,
/* ForceTeamState */ true);
icv::ActiveLevel.assert_eq(1u, ident, /* ForceTeamState */ true);
icv::Level.assert_eq(1u, ident, /* ForceTeamState */ true);
+ // Ensure we synchronize before we run user code to avoid invalidating the
+ // assumptions above.
+ synchronize::threadsAligned(atomic::relaxed);
+
if (TId < NumThreads)
invokeMicrotask(TId, 0, fn, args, nargs);
// Synchronize all threads at the end of a parallel region.
- synchronize::threadsAligned();
+ synchronize::threadsAligned(atomic::seq_cst);
}
// Synchronize all threads to make sure every thread exits the scope above;
// otherwise the following assertions and the assumption in
// __kmpc_target_deinit may not hold.
- synchronize::threadsAligned();
+ synchronize::threadsAligned(atomic::acq_rel);
state::ParallelTeamSize.assert_eq(1u, ident, /* ForceTeamState */ true);
icv::ActiveLevel.assert_eq(0u, ident, /* ForceTeamState */ true);
icv::Level.assert_eq(0u, ident, /* ForceTeamState */ true);
+
+ // Ensure we synchronize to create an aligned region around the assumptions.
+ synchronize::threadsAligned(atomic::relaxed);
+
return;
}
@@ -243,9 +251,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
/* ForceTeamState */ true);
// Master signals work to activate workers.
- synchronize::threads();
+ synchronize::threads(atomic::seq_cst);
// Master waits for workers to signal.
- synchronize::threads();
+ synchronize::threads(atomic::seq_cst);
}
if (nargs)
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index eddf37f851e73..babdf8ceeae78 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -123,8 +123,8 @@ void fenceTeam(atomic::OrderingTy Ordering);
void fenceKernel(atomic::OrderingTy Ordering);
void fenceSystem(atomic::OrderingTy Ordering);
void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads();
-void syncThreadsAligned() { syncThreads(); }
+void syncThreads(atomic::OrderingTy Ordering);
+void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
void unsetLock(omp_lock_t *);
int testLock(omp_lock_t *);
void initLock(omp_lock_t *);
@@ -261,8 +261,16 @@ void syncWarp(__kmpc_impl_lanemask_t) {
// AMDGCN doesn't need to sync threads in a warp
}
-void syncThreads() { __builtin_amdgcn_s_barrier(); }
-void syncThreadsAligned() { syncThreads(); }
+void syncThreads(atomic::OrderingTy Ordering) {
+ if (Ordering != atomic::relaxed)
+ fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst);
+
+ __builtin_amdgcn_s_barrier();
+
+ if (Ordering != atomic::relaxed)
+ fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
+}
+void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
// TODO: Don't have wavefront lane locks. Possibly can't have them.
void unsetLock(omp_lock_t *) { __builtin_trap(); }
@@ -327,12 +335,12 @@ void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); }
void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
-void syncThreads() {
+void syncThreads(atomic::OrderingTy Ordering) {
constexpr int BarrierNo = 8;
asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
}
-void syncThreadsAligned() { __syncthreads(); }
+void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); }
constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
@@ -381,9 +389,13 @@ void synchronize::init(bool IsSPMD) {
void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
-void synchronize::threads() { impl::syncThreads(); }
+void synchronize::threads(atomic::OrderingTy Ordering) {
+ impl::syncThreads(Ordering);
+}
-void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
+void synchronize::threadsAligned(atomic::OrderingTy Ordering) {
+ impl::syncThreadsAligned(Ordering);
+}
void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }
@@ -504,13 +516,13 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
int32_t TId) {
FunctionTracingRAII();
- synchronize::threadsAligned();
+ synchronize::threadsAligned(atomic::OrderingTy::seq_cst);
}
__attribute__((noinline)) void __kmpc_barrier_simple_generic(IdentTy *Loc,
int32_t TId) {
FunctionTracingRAII();
- synchronize::threads();
+ synchronize::threads(atomic::OrderingTy::seq_cst);
}
int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
diff --git a/openmp/libomptarget/test/offloading/barrier_fence.c b/openmp/libomptarget/test/offloading/barrier_fence.c
new file mode 100644
index 0000000000000..23b8006a27722
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/barrier_fence.c
@@ -0,0 +1,78 @@
+// RUN: %libomptarget-compile-generic -fopenmp-offload-mandatory -O3
+// RUN: %libomptarget-run-generic
+
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <omp.h>
+#include <stdio.h>
+
+struct IdentTy;
+void __kmpc_barrier_simple_spmd(struct IdentTy *Loc, int32_t TId);
+void __kmpc_barrier_simple_generic(struct IdentTy *Loc, int32_t TId);
+
+#pragma omp begin declare target device_type(nohost)
+static int A[512] __attribute__((address_space(3), loader_uninitialized));
+static int B[512 * 32] __attribute__((loader_uninitialized));
+#pragma omp end declare target
+
+int main() {
+ printf("Testing simple spmd barrier\n");
+ for (int r = 0; r < 50; r++) {
+#pragma omp target teams distribute thread_limit(512) num_teams(440)
+ for (int j = 0; j < 512 * 32; ++j) {
+#pragma omp parallel firstprivate(j)
+ {
+ int TId = omp_get_thread_num();
+ int TeamId = omp_get_team_num();
+ int NT = omp_get_num_threads();
+ // Sequential
+ for (int i = 0; i < NT; ++i) {
+ // Test shared memory globals
+ if (TId == i)
+ A[i] = i + j;
+ __kmpc_barrier_simple_spmd(0, TId);
+ if (A[i] != i + j)
+ __builtin_trap();
+ __kmpc_barrier_simple_spmd(0, TId);
+ // Test generic globals
+ if (TId == i)
+ B[TeamId] = i;
+ __kmpc_barrier_simple_spmd(0, TId);
+ if (B[TeamId] != i)
+ __builtin_trap();
+ __kmpc_barrier_simple_spmd(0, TId);
+ }
+ }
+ }
+ }
+
+ printf("Testing simple generic barrier\n");
+ for (int r = 0; r < 50; r++) {
+#pragma omp target teams distribute thread_limit(512) num_teams(440)
+ for (int j = 0; j < 512 * 32; ++j) {
+#pragma omp parallel firstprivate(j)
+ {
+ int TId = omp_get_thread_num();
+ int TeamId = omp_get_team_num();
+ int NT = omp_get_num_threads();
+ // Sequential
+ for (int i = 0; i < NT; ++i) {
+ if (TId == i)
+ A[i] = i + j;
+ __kmpc_barrier_simple_generic(0, TId);
+ if (A[i] != i + j)
+ __builtin_trap();
+ __kmpc_barrier_simple_generic(0, TId);
+ if (TId == i)
+ B[TeamId] = i;
+ __kmpc_barrier_simple_generic(0, TId);
+ if (B[TeamId] != i)
+ __builtin_trap();
+ __kmpc_barrier_simple_generic(0, TId);
+ }
+ }
+ }
+ }
+ return 0;
+}
More information about the Openmp-commits
mailing list