[Openmp-commits] [PATCH] D95327: [OpenMP][NVPTX] Rewrite CUDA intrinsics with NVVM intrinsics
Shilei Tian via Phabricator via Openmp-commits
openmp-commits at lists.llvm.org
Sun Jan 24 19:02:00 PST 2021
tianshilei1992 created this revision.
tianshilei1992 added reviewers: jdoerfert, JonChesterfield.
Herald added subscribers: guansong, yaxunl.
tianshilei1992 requested review of this revision.
Herald added subscribers: openmp-commits, sstefan1.
Herald added a project: OpenMP.
This patch prepares for dropping CUDA when compiling `deviceRTLs`.
CUDA intrinsics are replaced with NVVM intrinsics, following the code in
`__clang_cuda_intrinsics.h`. We don't want to include that header directly
because in the near future we're going to switch to OpenMP, and by then the
header can no longer be used.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D95327
Files:
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -16,20 +16,6 @@
#include <cuda.h>
-// Forward declaration of CUDA primitives which will be evetually transformed
-// into LLVM intrinsics.
-extern "C" {
-unsigned int __activemask();
-unsigned int __ballot(unsigned);
-// The default argument here is based on NVIDIA's website
-// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
-int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE);
-int __shfl(int val, int src_line, int width = WARPSIZE);
-int __shfl_down(int var, unsigned detla, int width);
-int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
-void __syncwarp(int mask);
-}
-
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}
@@ -71,10 +57,12 @@
// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION >= 9000
- return __activemask();
+#if CUDA_VERSION < 9020
+ return __nvvm_vote_ballot(1);
#else
- return __ballot(1);
+ unsigned int Mask;
+ asm volatile("activemask.b32 %0;" : "=r"(Mask));
+ return Mask;
#endif
}
@@ -82,9 +70,9 @@
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
int32_t SrcLane) {
#if CUDA_VERSION >= 9000
- return __shfl_sync(Mask, Var, SrcLane);
+ return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
#else
- return __shfl(Var, SrcLane);
+ return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
#endif // CUDA_VERSION
}
@@ -92,9 +80,10 @@
int32_t Var, uint32_t Delta,
int32_t Width) {
#if CUDA_VERSION >= 9000
- return __shfl_down_sync(Mask, Var, Delta, Width);
+ return __nvvm_shfl_sync_down_i32(Mask, Var, Delta,
+ ((WARPSIZE - Width) << 8) | 0x1f);
#else
- return __shfl_down(Var, Delta, Width);
+ return __nvvm_shfl_down_i32(Var, Delta, ((WARPSIZE - Width) << 8) | 0x1f);
#endif // CUDA_VERSION
}
@@ -102,7 +91,7 @@
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
#if CUDA_VERSION >= 9000
- __syncwarp(Mask);
+ __nvvm_bar_warp_sync(Mask);
#else
// In Cuda < 9.0 no need to sync threads in warps.
#endif // CUDA_VERSION
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D95327.318893.patch
Type: text/x-patch
Size: 2620 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20210125/10b89948/attachment.bin>
More information about the Openmp-commits
mailing list