[Openmp-commits] [openmp] 6a1d1f7 - [OpenMP] Added memory scope to atomic::inc API and used the device scope in reduction.
Dhruva Chakrabarti via Openmp-commits
openmp-commits at lists.llvm.org
Fri Jun 30 12:05:21 PDT 2023
Author: Dhruva Chakrabarti
Date: 2023-06-30T15:05:01-04:00
New Revision: 6a1d1f7eefe81faa1f7c6c47e8b9da0bfeb8c2e8
URL: https://github.com/llvm/llvm-project/commit/6a1d1f7eefe81faa1f7c6c47e8b9da0bfeb8c2e8
DIFF: https://github.com/llvm/llvm-project/commit/6a1d1f7eefe81faa1f7c6c47e8b9da0bfeb8c2e8.diff
LOG: [OpenMP] Added memory scope to atomic::inc API and used the device scope in reduction.
With https://reviews.llvm.org/D137524, memory scope and ordering
attributes are being used to generate the required instructions for
atomic inc/dec on AMDGPU. This patch adds the memory scope attribute to
the atomic::inc API and uses the device scope in reduction. Without
the device scope in atomic_inc, the default system scope leads to
unnecessary L2 write-backs/invalidates.
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D154172
Added:
Modified:
openmp/libomptarget/DeviceRTL/include/Synchronization.h
openmp/libomptarget/DeviceRTL/src/Reduction.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index 130578ed430207..b31238fbbc9c74 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -26,8 +26,15 @@ enum OrderingTy {
seq_cst = __ATOMIC_SEQ_CST,
};
+enum MemScopeTy {
+ all, // All threads on all devices
+ device, // All threads on the device
+ cgroup // All threads in the contention group, e.g. the team
+};
+
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
-uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
+uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
+ MemScopeTy MemScope = MemScopeTy::all);
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
/// result is stored in \p *Addr;
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index db401852360f59..f544928e33c409 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -228,7 +228,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
// Increment team counter.
// This counter is incremented by all teams in the current
// BUFFER_SIZE chunk.
- ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst);
+ ChunkTeamCount = atomic::inc(&Cnt, num_of_records - 1u, atomic::seq_cst,
+ atomic::MemScopeTy::device);
}
// Synchronize
if (mapping::isSPMDMode())
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index 5325b9f8a5cb9f..550d9614a963e8 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -29,8 +29,8 @@ namespace impl {
///
///{
/// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val,
- atomic::OrderingTy Ordering);
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope);
template <typename Ty>
Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
@@ -116,7 +116,8 @@ uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
///}
// Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope);
void namedBarrierInit();
void namedBarrier();
void fenceTeam(atomic::OrderingTy Ordering);
@@ -138,22 +139,35 @@ void setCriticalLock(omp_lock_t *);
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
+uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope) {
// builtin_amdgcn_atomic_inc32 should expand to this switch when
// passed a runtime value, but does not do so yet. Workaround here.
+
+#define ScopeSwitch(ORDER) \
+ switch (MemScope) { \
+ case atomic::MemScopeTy::all: \
+ return __builtin_amdgcn_atomic_inc32(A, V, ORDER, ""); \
+ case atomic::MemScopeTy::device: \
+ return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "agent"); \
+ case atomic::MemScopeTy::cgroup: \
+ return __builtin_amdgcn_atomic_inc32(A, V, ORDER, "workgroup"); \
+ }
+
+#define Case(ORDER) \
+ case ORDER: \
+ ScopeSwitch(ORDER)
+
switch (Ordering) {
default:
__builtin_unreachable();
- case atomic::relaxed:
- return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
- case atomic::aquire:
- return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "");
- case atomic::release:
- return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "");
- case atomic::acq_rel:
- return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "");
- case atomic::seq_cst:
- return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "");
+ Case(atomic::relaxed);
+ Case(atomic::aquire);
+ Case(atomic::release);
+ Case(atomic::acq_rel);
+ Case(atomic::seq_cst);
+#undef Case
+#undef ScopeSwitch
}
}
@@ -308,8 +322,8 @@ void setCriticalLock(omp_lock_t *Lock) {
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
-uint32_t atomicInc(uint32_t *Address, uint32_t Val,
- atomic::OrderingTy Ordering) {
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope) {
return __nvvm_atom_inc_gen_ui(Address, Val);
}
@@ -480,8 +494,9 @@ ATOMIC_FP_OP(double, int64_t, uint64_t)
#undef ATOMIC_INT_OP
#undef ATOMIC_FP_OP
-uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
- return impl::atomicInc(Addr, V, Ordering);
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering,
+ atomic::MemScopeTy MemScope) {
+ return impl::atomicInc(Addr, V, Ordering, MemScope);
}
void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
More information about the Openmp-commits
mailing list