[Openmp-commits] [openmp] c3074d4 - [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics
Jon Chesterfield via Openmp-commits
openmp-commits at lists.llvm.org
Sun Jan 24 02:59:30 PST 2021
Author: Jon Chesterfield
Date: 2021-01-24T10:59:15Z
New Revision: c3074d48d38cc1207da893b6f3545b5777db4c27
URL: https://github.com/llvm/llvm-project/commit/c3074d48d38cc1207da893b6f3545b5777db4c27
DIFF: https://github.com/llvm/llvm-project/commit/c3074d48d38cc1207da893b6f3545b5777db4c27.diff
LOG: [libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics
[libomptarget][nvptx] Replace cuda atomic primitives with clang intrinsics
Tested by diff of IR generated for target_impl.cu before and after. NFC. Part
of removing deviceRTL build time dependency on cuda SDK.
Reviewed By: tianshilei1992
Differential Revision: https://reviews.llvm.org/D95294
Added:
Modified:
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
Removed:
################################################################################
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 2bf19523ef6f..1e3ba7d664af 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -28,9 +28,6 @@ int __shfl(int val, int src_line, int width = WARPSIZE);
int __shfl_down(int var, unsigned detla, int width);
int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
void __syncwarp(int mask);
-void __threadfence();
-void __threadfence_block();
-void __threadfence_system();
}
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
@@ -126,9 +123,9 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
: "memory");
}
-DEVICE void __kmpc_impl_threadfence() { __threadfence(); }
-DEVICE void __kmpc_impl_threadfence_block() { __threadfence_block(); }
-DEVICE void __kmpc_impl_threadfence_system() { __threadfence_system(); }
+DEVICE void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
+DEVICE void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
+DEVICE void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
// Calls to the NVPTX layer (assuming 1D layout)
DEVICE int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
@@ -140,39 +137,41 @@ DEVICE int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
-// Forward declaration of atomics. Although they're template functions, we
-// already have definitions for different types in CUDA internal headers with
-// the right mangled names.
-template <typename T> DEVICE T atomicAdd(T *address, T val);
-template <typename T> DEVICE T atomicInc(T *address, T val);
-template <typename T> DEVICE T atomicMax(T *address, T val);
-template <typename T> DEVICE T atomicExch(T *address, T val);
-template <typename T> DEVICE T atomicCAS(T *address, T compare, T val);
-
+// Atomics
DEVICE uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
- return atomicAdd(Address, Val);
+ return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
DEVICE uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
- return atomicInc(Address, Val);
+ return __nvvm_atom_inc_gen_ui(Address, Val);
}
+
DEVICE uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
- return atomicMax(Address, Val);
+ return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
}
+
DEVICE uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
- return atomicExch(Address, Val);
+ uint32_t R;
+ __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+ return R;
}
+
DEVICE uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare,
uint32_t Val) {
- return atomicCAS(Address, Compare, Val);
+ (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
+ __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+ return Compare;
}
DEVICE unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
unsigned long long Val) {
- return atomicExch(Address, Val);
+ unsigned long long R;
+ __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
+ return R;
}
+
DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
unsigned long long Val) {
- return atomicAdd(Address, Val);
+ return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
}
#define __OMP_SPIN 1000
More information about the Openmp-commits
mailing list