[Openmp-commits] [openmp] 2caeaf2 - [libomptarget][nfc] Introduce atomic wrapper function

Jon Chesterfield via Openmp-commits openmp-commits at lists.llvm.org
Wed Dec 18 12:07:09 PST 2019


Author: Jon Chesterfield
Date: 2019-12-18T20:06:17Z
New Revision: 2caeaf2f455db468cc5a5505d90b4919ae37c915

URL: https://github.com/llvm/llvm-project/commit/2caeaf2f455db468cc5a5505d90b4919ae37c915
DIFF: https://github.com/llvm/llvm-project/commit/2caeaf2f455db468cc5a5505d90b4919ae37c915.diff

LOG: [libomptarget][nfc] Introduce atomic wrapper function

Summary:
[libomptarget][nfc] Introduce atomic wrapper function

Wraps the atomic functions in function templates prefixed with __kmpc_atomic
that dispatch to the CUDA or HIP atomic functions. Intended to be easily
extended to dispatch to OpenCL or C++ atomics for a third target.

Reviewers: ABataev, jdoerfert, grokos

Reviewed By: jdoerfert

Subscribers: Anastasia, jvesely, mgrang, dexonsmith, llvm-commits, mgorny, jfb, openmp-commits

Tags: #openmp, #llvm

Differential Revision: https://reviews.llvm.org/D71404

Added: 
    openmp/libomptarget/deviceRTLs/common/target_atomic.h

Modified: 
    openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
    openmp/libomptarget/deviceRTLs/common/omptargeti.h
    openmp/libomptarget/deviceRTLs/common/src/libcall.cu
    openmp/libomptarget/deviceRTLs/common/src/loop.cu
    openmp/libomptarget/deviceRTLs/common/src/reduction.cu
    openmp/libomptarget/deviceRTLs/common/state-queuei.h
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
index ebea0a049b6e..d3df65b734df 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt
@@ -76,6 +76,7 @@ set(h_files
   ${devicertl_base_directory}/common/omptarget.h
   ${devicertl_base_directory}/common/omptargeti.h
   ${devicertl_base_directory}/common/state-queue.h
+  ${devicertl_base_directory}/common/target_atomic.h
   ${devicertl_base_directory}/common/state-queuei.h
   ${devicertl_base_directory}/common/support.h)
 

diff  --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
index b952a8dc484a..379a870bfe36 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h
@@ -11,6 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "common/target_atomic.h"
+
 ////////////////////////////////////////////////////////////////////////////////
 // Task Descriptor
 ////////////////////////////////////////////////////////////////////////////////
@@ -207,7 +209,7 @@ INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
   ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
           "MemIdx is too big or uninitialized.");
   MemDataTy &MD = MemData[usedSlotIdx];
-  atomicExch((unsigned *)&MD.keys[usedMemIdx], 0);
+  __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
 }
 
 INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
@@ -217,7 +219,7 @@ INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
   const unsigned sm = usedSlotIdx;
   MemDataTy &MD = MemData[sm];
   unsigned i = hash(GetBlockIdInKernel());
-  while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) {
+  while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
     i = hash(i + 1);
   }
   usedSlotIdx = sm;

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
index 00eec92d71ef..c125d82372f7 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "common/omptarget.h"
+#include "common/target_atomic.h"
 #include "target_impl.h"
 
 EXTERN double omp_get_wtick(void) {

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
index 59970a6db41c..017af67ba1f2 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu
@@ -14,6 +14,7 @@
 
 #include "common/omptarget.h"
 #include "target_impl.h"
+#include "common/target_atomic.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
@@ -397,9 +398,9 @@ public:
     unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
     uint64_t warp_res;
     if (rank == 0) {
-      warp_res = atomicAdd(
+      warp_res = __kmpc_atomic_add(
           (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
-          change);
+          (unsigned long long)change);
     }
     warp_res = Shuffle(active, warp_res, leader);
     return warp_res + rank;
@@ -792,8 +793,8 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
     // Atomic max of iterations.
     uint64_t *varArray = (uint64_t *)array;
     uint64_t elem = varArray[i];
-    (void)atomicMax((unsigned long long int *)Buffer,
-                    (unsigned long long int)elem);
+    (void)__kmpc_atomic_max((unsigned long long int *)Buffer,
+                            (unsigned long long int)elem);
 
     // Barrier.
     syncWorkersInGenericMode(NumThreads);

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
index 5db194866ea5..7f6ee2e39c7d 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "common/omptarget.h"
+#include "common/target_atomic.h"
 #include "target_impl.h"
 
 EXTERN
@@ -242,7 +243,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
     // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
     // It resets 'timestamp' back to 0 once the last team increments
     // this counter.
-    unsigned val = atomicInc(timestamp, NumTeams - 1);
+    unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1);
     IsLastTeam = val == NumTeams - 1;
   }
 
@@ -377,7 +378,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
   if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
     return 0;
   // The master thread of the team actually does the reduction.
-  while (atomicCAS((uint32_t *)crit, 0, 1))
+  while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u))
     ;
   return 1;
 }
@@ -386,7 +387,7 @@ EXTERN void
 __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
                                             kmp_CriticalName *crit) {
   __kmpc_impl_threadfence_system();
-  (void)atomicExch((uint32_t *)crit, 0);
+  (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u);
 }
 
 INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
@@ -431,7 +432,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   bool IsMaster = isMaster(loc, ThreadId);
   while (IsMaster) {
     // Atomic read
-    Bound = atomicAdd((uint32_t *)&IterCnt, 0);
+    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
     if (TeamId < Bound + num_of_records)
       break;
   }
@@ -447,7 +448,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     // Increment team counter.
     // This counter is incremented by all teams in the current
     // BUFFER_SIZE chunk.
-    ChunkTeamCount = atomicInc((uint32_t *)&Cnt, num_of_records - 1);
+    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
   }
   // Synchronize
   if (checkSPMDMode(loc))
@@ -522,7 +523,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   if (IsMaster && ChunkTeamCount == num_of_records - 1) {
     // Allow SIZE number of teams to proceed writing their
     // intermediate results to the global buffer.
-    atomicAdd((uint32_t *)&IterCnt, num_of_records);
+    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
   }
 
   return 0;

diff  --git a/openmp/libomptarget/deviceRTLs/common/state-queuei.h b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
index 3c3be113e733..1bd261f2826a 100644
--- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h
+++ b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
@@ -1,4 +1,4 @@
-//===------- state-queue.cu - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -17,15 +17,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "state-queue.h"
+#include "common/target_atomic.h"
 
 template <typename ElementType, uint32_t SIZE>
 INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
-  return atomicAdd((unsigned int *)&tail, 1);
+  return __kmpc_atomic_add((unsigned int *)&tail, 1u);
 }
 
 template <typename ElementType, uint32_t SIZE>
 INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
-  return atomicAdd((unsigned int *)&head, 1);
+  return __kmpc_atomic_add((unsigned int *)&head, 1u);
 }
 
 template <typename ElementType, uint32_t SIZE>
@@ -37,28 +38,28 @@ omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
 template <typename ElementType, uint32_t SIZE>
 INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
                                                                 uint32_t id) {
-  return atomicAdd((unsigned int *)&ids[slot], 0) == id;
+  return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
 }
 
 template <typename ElementType, uint32_t SIZE>
 INLINE void
 omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
                                                       ElementType *element) {
-  atomicExch((unsigned long long *)&elementQueue[slot],
-             (unsigned long long)element);
+  __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
+                         (unsigned long long)element);
 }
 
 template <typename ElementType, uint32_t SIZE>
 INLINE ElementType *
 omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
-  return (ElementType *)atomicAdd((unsigned long long *)&elementQueue[slot],
-                                  (unsigned long long)0);
+  return (ElementType *)__kmpc_atomic_add(
+      (unsigned long long *)&elementQueue[slot], (unsigned long long)0);
 }
 
 template <typename ElementType, uint32_t SIZE>
 INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
                                                                   uint32_t id) {
-  atomicExch((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
+  __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
 }
 
 template <typename ElementType, uint32_t SIZE>

diff  --git a/openmp/libomptarget/deviceRTLs/common/target_atomic.h b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
new file mode 100644
index 000000000000..3c905d3cbbf2
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
@@ -0,0 +1,38 @@
+//===---- target_atomic.h - OpenMP GPU target atomic functions ---- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations of atomic functions provided by each target
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_TARGET_ATOMIC_H
+#define OMPTARGET_TARGET_ATOMIC_H
+
+#include "target_impl.h"
+
+template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
+  return atomicAdd(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
+  return atomicInc(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
+  return atomicMax(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
+  return atomicExch(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
+  return atomicCAS(address, compare, val);
+}
+
+#endif

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 11f60e65173a..97a5ce34962c 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -12,10 +12,11 @@
 
 #include "target_impl.h"
 #include "common/debug.h"
+#include "common/target_atomic.h"
 
 #define __OMP_SPIN 1000
-#define UNSET 0
-#define SET 1
+#define UNSET 0u
+#define SET 1u
 
 EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
   omp_unset_lock(lock);
@@ -30,7 +31,7 @@ EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
   // (old == compare ? val : old)
 
   // TODO: not sure spinning is a good idea here..
-  while (atomicCAS(lock, UNSET, SET) != UNSET) {
+  while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
     clock_t start = clock();
     clock_t now;
     for (;;) {
@@ -44,7 +45,7 @@ EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
 }
 
 EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
-  (void)atomicExch(lock, UNSET);
+  (void)__kmpc_atomic_exchange(lock, UNSET);
 }
 
 EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {


        


More information about the Openmp-commits mailing list