[Openmp-commits] [openmp] b16aadf - [OpenMP] Introduce aligned synchronization into the new device RT
Johannes Doerfert via Openmp-commits
openmp-commits at lists.llvm.org
Wed Oct 27 16:22:46 PDT 2021
Author: Johannes Doerfert
Date: 2021-10-27T18:22:31-05:00
New Revision: b16aadf0a79a2796c9ac68d1bfc8853ccf25c459
URL: https://github.com/llvm/llvm-project/commit/b16aadf0a79a2796c9ac68d1bfc8853ccf25c459
DIFF: https://github.com/llvm/llvm-project/commit/b16aadf0a79a2796c9ac68d1bfc8853ccf25c459.diff
LOG: [OpenMP] Introduce aligned synchronization into the new device RT
We will later use the fact that a barrier is aligned to reason about
thread divergence. For now we introduce the assumption and some more
documentation.
Reviewed By: tianshilei1992
Differential Revision: https://reviews.llvm.org/D112153
Added:
Modified:
openmp/libomptarget/DeviceRTL/include/Synchronization.h
openmp/libomptarget/DeviceRTL/src/Kernel.cpp
openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
openmp/libomptarget/DeviceRTL/src/State.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 7097056dfe5c..e33f37a659af 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -27,6 +27,21 @@ void warp(LaneMaskTy Mask);
/// Synchronize all threads in a block.
void threads();
+/// Synchronizing threads is allowed even if they all hit different instances of
+/// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more
+/// restrictive in that it requires all threads to hit the same instance. The
+/// noinline is removed by the openmp-opt pass and helps to preserve the
+/// information till then.
+///{
+#pragma omp begin assumes ext_aligned_barrier
+
+/// Synchronize all threads in a block, they are reaching the same
+/// instruction (hence all threads in the block are "aligned").
+__attribute__((noinline)) void threadsAligned();
+
+#pragma omp end assumes
+///}
+
} // namespace synchronize
namespace fence {
diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
index f834754ccc56..d47fa03c367e 100644
--- a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp
@@ -69,7 +69,7 @@ int32_t __kmpc_target_init(IdentTy *Ident, int8_t Mode,
const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
if (IsSPMD) {
inititializeRuntime(/* IsSPMD */ true);
- synchronize::threads();
+ synchronize::threadsAligned();
} else {
inititializeRuntime(/* IsSPMD */ false);
// No need to wait since only the main threads will execute user
diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
index 91309b8a013f..81af773d014a 100644
--- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp
@@ -93,18 +93,36 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
uint32_t NumThreads = determineNumberOfThreads(num_threads);
if (mapping::isSPMDMode()) {
- synchronize::threads();
+ // Avoid the race between the read of the `icv::Level` above and the write
+ // below by synchronizing all threads here.
+ synchronize::threadsAligned();
{
+ // Note that the order here is important. `icv::Level` has to be updated
+ // last or the other updates will cause a thread specific state to be
+ // created.
state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
1u, TId == 0);
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, TId == 0);
state::ValueRAII LevelRAII(icv::Level, 1u, 0u, TId == 0);
- synchronize::threads();
+
+ // Synchronize all threads after the main thread (TId == 0) set up the
+ // team state properly.
+ synchronize::threadsAligned();
+
+ ASSERT(state::ParallelTeamSize == NumThreads);
+ ASSERT(icv::ActiveLevel == 1u);
+ ASSERT(icv::Level == 1u);
if (TId < NumThreads)
invokeMicrotask(TId, 0, fn, args, nargs);
- synchronize::threads();
+
+ // Synchronize all threads at the end of a parallel region.
+ synchronize::threadsAligned();
}
+
+ ASSERT(state::ParallelTeamSize == 1u);
+ ASSERT(icv::ActiveLevel == 0u);
+ ASSERT(icv::Level == 0u);
return;
}
@@ -130,6 +148,9 @@ void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr,
}
{
+ // Note that the order here is important. `icv::Level` has to be updated
+ // last or the other updates will cause a thread specific state to be
+ // created.
state::ValueRAII ParallelTeamSizeRAII(state::ParallelTeamSize, NumThreads,
1u, true);
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index ae6a93e7927b..f39b61c66930 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -41,8 +41,8 @@ namespace {
///{
extern "C" {
-void *malloc(uint64_t Size);
-void free(void *Ptr);
+__attribute__((leaf)) void *malloc(uint64_t Size);
+__attribute__((leaf)) void free(void *Ptr);
}
///}
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 17a91de97d28..c77e766ae6ca 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -132,6 +132,8 @@ void syncWarp(__kmpc_impl_lanemask_t) {
void syncThreads() { __builtin_amdgcn_s_barrier(); }
+void syncThreadsAligned() { syncThreads(); }
+
void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
@@ -179,6 +181,8 @@ void syncThreads() {
asm volatile("barrier.sync %0;" : : "r"(BarrierNo) : "memory");
}
+void syncThreadsAligned() { __syncthreads(); }
+
constexpr uint32_t OMP_SPIN = 1000;
constexpr uint32_t UNSET = 0;
constexpr uint32_t SET = 1;
@@ -227,6 +231,8 @@ void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
void synchronize::threads() { impl::syncThreads(); }
+void synchronize::threadsAligned() { impl::syncThreadsAligned(); }
+
void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
@@ -238,7 +244,7 @@ uint32_t atomic::load(uint32_t *Addr, int Ordering) {
}
void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
- impl::atomicStore(Addr, V, Ordering);
+ impl::atomicStore(Addr, V, Ordering);
}
uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
@@ -275,7 +281,7 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
__attribute__((noinline)) void __kmpc_barrier_simple_spmd(IdentTy *Loc,
int32_t TId) {
- synchronize::threads();
+ synchronize::threadsAligned();
}
int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
More information about the Openmp-commits
mailing list