[Openmp-commits] [openmp] 9b19ecb - [libomptarget][devicertl] Drop templated atomic functions

Jon Chesterfield via Openmp-commits openmp-commits at lists.llvm.org
Fri Jan 22 06:48:57 PST 2021


Author: Jon Chesterfield
Date: 2021-01-22T14:48:22Z
New Revision: 9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d

URL: https://github.com/llvm/llvm-project/commit/9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d
DIFF: https://github.com/llvm/llvm-project/commit/9b19ecb8f1ec7acbcfd6f0e4f3cbd6902570105d.diff

LOG: [libomptarget][devicertl] Drop templated atomic functions

[libomptarget][devicertl] Drop templated atomic functions

The five __kmpc_atomic templates are instantiated a total of seven times.
This change replaces the template with explictly typed functions, which
have the same prototype for amdgcn and nvptx, and implements them with
the same code presently in use.

Rolls in the accepted but not yet landed D95085.

The unsigned long long type can be replaced with uint64_t when replacing
the cuda function. Until then, clang warns on casting a pointer to one to
a pointer to the other.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D95093

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index 6e8a651bd886..228d3f6e556d 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -126,29 +126,17 @@ DEVICE unsigned GetWarpId();
 DEVICE unsigned GetLaneId();
 
 // Atomics
-template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
-  return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST);
-}
-
-INLINE uint32_t __kmpc_atomic_inc(uint32_t *address, uint32_t max) {
-  return __builtin_amdgcn_atomic_inc32(address, max, __ATOMIC_SEQ_CST, "");
-}
-
-template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
-  return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST);
-}
-
-template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
-  T r;
-  __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST);
-  return r;
-}
-
-template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
-  (void)__atomic_compare_exchange(address, &compare, &val, false,
-                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-  return compare;
-}
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
+
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                                 unsigned long long);
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+                                            unsigned long long);
 
 // Locks
 DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);

diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
index 7388a29215cc..35828cda0e06 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -132,11 +132,13 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
 } // namespace
 
 DEVICE int GetNumberOfBlocksInKernel() {
-  return get_grid_dim(__builtin_amdgcn_grid_size_x(), __builtin_amdgcn_workgroup_size_x());
+  return get_grid_dim(__builtin_amdgcn_grid_size_x(),
+                      __builtin_amdgcn_workgroup_size_x());
 }
 
 DEVICE int GetNumberOfThreadsInBlock() {
-  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(), __builtin_amdgcn_grid_size_x(),
+  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
+                           __builtin_amdgcn_grid_size_x(),
                            __builtin_amdgcn_workgroup_size_x());
 }
 
@@ -149,6 +151,40 @@ EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
   return GetNumberOfThreadsInBlock();
 }
 
+// Atomics
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
+}
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
+  return __builtin_amdgcn_atomic_inc32(Address, max, __ATOMIC_SEQ_CST, "");
+}
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
+  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
+}
+
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
+}
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
+                                  uint32_t Val) {
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
+                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+  return Compare;
+}
+
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
+                                                 unsigned long long Val) {
+  unsigned long long R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
+}
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
+                                            unsigned long long Val) {
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
+}
+
 // Stub implementations
 DEVICE void *__kmpc_impl_malloc(size_t) { return nullptr; }
 DEVICE void __kmpc_impl_free(void *) {}

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 75945e3cd8c4..2bf19523ef6f 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -140,6 +140,41 @@ DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
 DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
 DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
 
+// Forward declaration of atomics. Although they're template functions, we
+// already have definitions for 
diff erent types in CUDA internal headers with
+// the right mangled names.
+template <typename T> DEVICE T atomicAdd(T *address, T val);
+template <typename T> DEVICE T atomicInc(T *address, T val);
+template <typename T> DEVICE T atomicMax(T *address, T val);
+template <typename T> DEVICE T atomicExch(T *address, T val);
+template <typename T> DEVICE T atomicCAS(T *address, T compare, T val);
+
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
+  return atomicAdd(Address, Val);
+}
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
+  return atomicInc(Address, Val);
+}
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
+  return atomicMax(Address, Val);
+}
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
+  return atomicExch(Address, Val);
+}
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
+                                  uint32_t Val) {
+  return atomicCAS(Address, Compare, Val);
+}
+
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
+                                                 unsigned long long Val) {
+  return atomicExch(Address, Val);
+}
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
+                                            unsigned long long Val) {
+  return atomicAdd(Address, Val);
+}
+
 #define __OMP_SPIN 1000
 #define UNSET 0u
 #define SET 1u

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index 1d7b649fe20e..1828fcf594bc 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -130,35 +130,18 @@ DEVICE int GetNumberOfThreadsInBlock();
 DEVICE unsigned GetWarpId();
 DEVICE unsigned GetLaneId();
 
-// Forward declaration of atomics. Although they're template functions, we
-// already have definitions for 
diff erent types in CUDA internal headers with
-// the right mangled names.
-template <typename T> DEVICE T atomicAdd(T *address, T val);
-template <typename T> DEVICE T atomicInc(T *address, T val);
-template <typename T> DEVICE T atomicMax(T *address, T val);
-template <typename T> DEVICE T atomicExch(T *address, T val);
-template <typename T> DEVICE T atomicCAS(T *address, T compare, T val);
-
 // Atomics
-template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
-  return atomicAdd(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
-  return atomicInc(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
-  return atomicMax(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
-  return atomicExch(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
-  return atomicCAS(address, compare, val);
-}
+DEVICE uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
+DEVICE uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
+
+static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
+DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *,
+                                                 unsigned long long);
+DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *,
+                                            unsigned long long);
 
 // Locks
 DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock);


        


More information about the Openmp-commits mailing list