[Openmp-commits] [PATCH] D95294: [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics

Jon Chesterfield via Phabricator via Openmp-commits openmp-commits at lists.llvm.org
Sat Jan 23 12:28:40 PST 2021


JonChesterfield created this revision.
JonChesterfield added reviewers: jdoerfert, tianshilei1992, grokos, ABataev.
Herald added subscribers: jfb, yaxunl.
JonChesterfield requested review of this revision.
Herald added subscribers: openmp-commits, sstefan1.
Herald added a project: OpenMP.

[libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics

Tested by diffing the IR generated for target_impl.cu before and after the
change. NFC (no functional change). Part of removing the deviceRTL build-time
dependency on the CUDA SDK.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D95294

Files:
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu


Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -28,9 +28,6 @@
 int __shfl_down(int var, unsigned detla, int width);
 int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
 void __syncwarp(int mask);
-void __threadfence();
-void __threadfence_block();
-void __threadfence_system();
 }
 
 DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
@@ -126,9 +123,9 @@
                : "memory");
 }
 
-DEVICE void __kmpc_impl_threadfence() { __threadfence(); }
-DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); }
-DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); }
+DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
+DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
+DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
 
 // Calls to the NVPTX layer (assuming 1D layout)
 DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
@@ -140,39 +137,41 @@
 DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
 DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
 
-// Forward declaration of atomics. Although they're template functions, we
-// already have definitions for different types in CUDA internal headers with
-// the right mangled names.
-template <typename T> DEVICE T atomicAdd(T *address, T val);
-template <typename T> DEVICE T atomicInc(T *address, T val);
-template <typename T> DEVICE T atomicMax(T *address, T val);
-template <typename T> DEVICE T atomicExch(T *address, T val);
-template <typename T> DEVICE T atomicCAS(T *address, T compare, T val);
-
+// Atomics
 DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
-  return atomicAdd(Address, Val);
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
 }
 DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
-  return atomicInc(Address, Val);
+  return __nvvm_atom_inc_gen_ui(Address, Val);
 }
+
 DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
-  return atomicMax(Address, Val);
+  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
 }
+
 DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
-  return atomicExch(Address, Val);
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
 }
+
 DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
                                   uint32_t Val) {
-  return atomicCAS(Address, Compare, Val);
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
+                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+  return Compare;
 }
 
 DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
                                                  unsigned long long Val) {
-  return atomicExch(Address, Val);
+  unsigned long long R;
+  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+  return R;
 }
+
 DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
                                             unsigned long long Val) {
-  return atomicAdd(Address, Val);
+  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
 }
 
 #define __OMP_SPIN 1000


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D95294.318782.patch
Type: text/x-patch
Size: 3472 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20210123/e7a4902d/attachment.bin>


More information about the Openmp-commits mailing list