[Openmp-commits] [openmp] 7272982 - [libomptarget] Refactor DeviceRTL prior to AMDGPU bringup
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Tue Oct 19 00:05:17 PDT 2021
Author: Jon Chesterfield
Date: 2021-10-19T08:05:06+01:00
New Revision: 7272982e1dfe89d492de4aac3e23d9cdcb1198fa
URL: https://github.com/llvm/llvm-project/commit/7272982e1dfe89d492de4aac3e23d9cdcb1198fa
DIFF: https://github.com/llvm/llvm-project/commit/7272982e1dfe89d492de4aac3e23d9cdcb1198fa.diff
LOG: [libomptarget] Refactor DeviceRTL prior to AMDGPU bringup
Subset of D111993. Fix typos, rename read to load.
Reviewed By: tianshilei1992
Differential Revision: https://reviews.llvm.org/D111999
Added:
Modified:
openmp/libomptarget/DeviceRTL/include/Synchronization.h
openmp/libomptarget/DeviceRTL/src/Mapping.cpp
openmp/libomptarget/DeviceRTL/src/Reduction.cpp
openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
openmp/libomptarget/DeviceRTL/src/Utils.cpp
Removed:
################################################################################
diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index ace624e3887fd..7097056dfe5c1 100644
--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -44,14 +44,11 @@ void system(int Ordering);
namespace atomic {
-/// Atomically read \p Addr with \p Ordering semantics.
-uint32_t read(uint32_t *Addr, int Ordering);
+/// Atomically load \p Addr with \p Ordering semantics.
+uint32_t load(uint32_t *Addr, int Ordering);
/// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint32_t store(uint32_t *Addr, uint32_t V, int Ordering);
-
-/// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint64_t store(uint64_t *Addr, uint64_t V, int Ordering);
+void store(uint32_t *Addr, uint32_t V, int Ordering);
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index c2041856b6cc6..740cc7be899f0 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -29,7 +29,7 @@ namespace impl {
#pragma omp begin declare variant match(device = {arch(amdgcn)})
constexpr const llvm::omp::GV &getGridValue() {
- return llvm::omp::AMDGPUGridValues;
+ return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
}
uint32_t getGridDim(uint32_t n, uint16_t d) {
diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index a06ac23c4276d..05efe956b38bf 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -209,7 +209,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
// to the number of slots in the buffer.
bool IsMaster = (ThreadId == 0);
while (IsMaster) {
- Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
+ Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
if (TeamId < Bound + num_of_records)
break;
}
diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index c9a1ac6f73697..17a91de97d286 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -31,10 +31,14 @@ namespace impl {
/// NOTE: This function needs to be implemented by every target.
uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
-uint32_t atomicRead(uint32_t *Address, int Ordering) {
+uint32_t atomicLoad(uint32_t *Address, int Ordering) {
return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
}
+void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
+ __atomic_store_n(Address, Val, Ordering);
+}
+
uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
return __atomic_fetch_add(Address, Val, Ordering);
}
@@ -68,7 +72,7 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
}
-uint32_t SHARD(namedBarrierTracker);
+uint32_t SHARED(namedBarrierTracker);
void namedBarrierInit() {
// Don't have global ctors, and shared memory is not zero init
@@ -79,7 +83,7 @@ void namedBarrier() {
uint32_t NumThreads = omp_get_num_threads();
// assert(NumThreads % 32 == 0);
- uint32_t WarpSize = maping::getWarpSize();
+ uint32_t WarpSize = mapping::getWarpSize();
uint32_t NumWaves = NumThreads / WarpSize;
fence::team(__ATOMIC_ACQUIRE);
@@ -115,7 +119,7 @@ void namedBarrier() {
// more waves still to go, spin until generation counter changes
do {
__builtin_amdgcn_s_sleep(0);
- load = atomi::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+ load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
} while ((load & 0xffff0000u) == generation);
}
}
@@ -192,7 +196,7 @@ int testLock(omp_lock_t *Lock) {
void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
-void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
void setLock(omp_lock_t *Lock) {
// TODO: not sure spinning is a good idea here..
@@ -229,8 +233,12 @@ void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
-uint32_t atomic::read(uint32_t *Addr, int Ordering) {
- return impl::atomicRead(Addr, Ordering);
+uint32_t atomic::load(uint32_t *Addr, int Ordering) {
+ return impl::atomicLoad(Addr, Ordering);
+}
+
+void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
+ impl::atomicStore(Addr, V, Ordering);
}
uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
@@ -300,7 +308,7 @@ void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); }
+void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
index f11c54ed1c497..3f65f2166481a 100644
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -35,8 +35,9 @@ namespace impl {
#pragma omp begin declare variant match(device = {arch(amdgcn)})
void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
- *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF));
- *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+ static_assert(sizeof(unsigned long) == 8, "");
+ *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
+ *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
}
uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
@@ -75,7 +76,7 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
int Width = mapping::getWarpSize();
- int Self = mapping::getgetThreadIdInWarp();
+ int Self = mapping::getThreadIdInWarp();
int Index = SrcLane + (Self & ~(Width - 1));
return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}
More information about the Openmp-commits mailing list