[Openmp-commits] [openmp] 7272982 - [libomptarget] Refactor DeviceRTL prior to AMDGPU bringup

Tue Oct 19 00:05:17 PDT 2021

Author: Jon Chesterfield
Date: 2021-10-19T08:05:06+01:00
New Revision: 7272982e1dfe89d492de4aac3e23d9cdcb1198fa

URL: https://github.com/llvm/llvm-project/commit/7272982e1dfe89d492de4aac3e23d9cdcb1198fa
DIFF: https://github.com/llvm/llvm-project/commit/7272982e1dfe89d492de4aac3e23d9cdcb1198fa.diff

LOG: [libomptarget] Refactor DeviceRTL prior to AMDGPU bringup

Subset of D111993. Fix typos, rename read to load.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D111999

Added: 
    

Modified: 
    openmp/libomptarget/DeviceRTL/include/Synchronization.h
    openmp/libomptarget/DeviceRTL/src/Mapping.cpp
    openmp/libomptarget/DeviceRTL/src/Reduction.cpp
    openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
    openmp/libomptarget/DeviceRTL/src/Utils.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
index ace624e3887fd..7097056dfe5c1 100644

--- a/openmp/libomptarget/DeviceRTL/include/Synchronization.h
+++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h
@@ -44,14 +44,11 @@ void system(int Ordering);
 
 namespace atomic {
 
-/// Atomically read \p Addr with \p Ordering semantics.
-uint32_t read(uint32_t *Addr, int Ordering);
+/// Atomically load \p Addr with \p Ordering semantics.
+uint32_t load(uint32_t *Addr, int Ordering);
 
 /// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint32_t store(uint32_t *Addr, uint32_t V, int Ordering);
-
-/// Atomically store \p V to \p Addr with \p Ordering semantics.
-uint64_t store(uint64_t *Addr, uint64_t V, int Ordering);
+void store(uint32_t *Addr, uint32_t V, int Ordering);
 
 /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
 uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering);

diff  --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index c2041856b6cc6..740cc7be899f0 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -29,7 +29,7 @@ namespace impl {
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::AMDGPUGridValues;
+  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
 }
 
 uint32_t getGridDim(uint32_t n, uint16_t d) {

diff  --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index a06ac23c4276d..05efe956b38bf 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -209,7 +209,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   // to the number of slots in the buffer.
   bool IsMaster = (ThreadId == 0);
   while (IsMaster) {
-    Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
+    Bound = atomic::load((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST);
     if (TeamId < Bound + num_of_records)
       break;
   }

diff  --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
index c9a1ac6f73697..17a91de97d286 100644
--- a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp
@@ -31,10 +31,14 @@ namespace impl {
 /// NOTE: This function needs to be implemented by every target.
 uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
 
-uint32_t atomicRead(uint32_t *Address, int Ordering) {
+uint32_t atomicLoad(uint32_t *Address, int Ordering) {
   return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
 }
 
+void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
+  __atomic_store_n(Address, Val, Ordering);
+}
+
 uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
   return __atomic_fetch_add(Address, Val, Ordering);
 }
@@ -68,7 +72,7 @@ uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
   return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
 }
 
-uint32_t SHARD(namedBarrierTracker);
+uint32_t SHARED(namedBarrierTracker);
 
 void namedBarrierInit() {
   // Don't have global ctors, and shared memory is not zero init
@@ -79,7 +83,7 @@ void namedBarrier() {
   uint32_t NumThreads = omp_get_num_threads();
   // assert(NumThreads % 32 == 0);
 
-  uint32_t WarpSize = maping::getWarpSize();
+  uint32_t WarpSize = mapping::getWarpSize();
   uint32_t NumWaves = NumThreads / WarpSize;
 
   fence::team(__ATOMIC_ACQUIRE);
@@ -115,7 +119,7 @@ void namedBarrier() {
       // more waves still to go, spin until generation counter changes
       do {
         __builtin_amdgcn_s_sleep(0);
-        load = atomi::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
       } while ((load & 0xffff0000u) == generation);
     }
   }
@@ -192,7 +196,7 @@ int testLock(omp_lock_t *Lock) {
 
 void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
 
-void destoryLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
 
 void setLock(omp_lock_t *Lock) {
   // TODO: not sure spinning is a good idea here..
@@ -229,8 +233,12 @@ void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
 
 void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
 
-uint32_t atomic::read(uint32_t *Addr, int Ordering) {
-  return impl::atomicRead(Addr, Ordering);
+uint32_t atomic::load(uint32_t *Addr, int Ordering) {
+  return impl::atomicLoad(Addr, Ordering);
+}
+
+void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
+   impl::atomicStore(Addr, V, Ordering);
 }
 
 uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
@@ -300,7 +308,7 @@ void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
 
 void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
 
-void omp_destroy_lock(omp_lock_t *Lock) { impl::destoryLock(Lock); }
+void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
 
 void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
index f11c54ed1c497..3f65f2166481a 100644
--- a/openmp/libomptarget/DeviceRTL/src/Utils.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp
@@ -35,8 +35,9 @@ namespace impl {
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
-  *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF));
-  *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
+  static_assert(sizeof(unsigned long) == 8, "");
+  *LowBits = (uint32_t)(Val & 0x00000000FFFFFFFFUL);
+  *HighBits = (uint32_t)((Val & 0xFFFFFFFF00000000UL) >> 32);
 }
 
 uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
@@ -75,7 +76,7 @@ uint64_t Pack(uint32_t LowBits, uint32_t HighBits) {
 
 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
   int Width = mapping::getWarpSize();
-  int Self = mapping::getgetThreadIdInWarp();
+  int Self = mapping::getThreadIdInWarp();
   int Index = SrcLane + (Self & ~(Width - 1));
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
 }