[Openmp-commits] [openmp] 27cc4a8 - [OpenMP][NVPTX] Rewrite CUDA intrinsics with NVVM intrinsics

Mon Jan 25 11:14:44 PST 2021

Author: Shilei Tian
Date: 2021-01-25T14:14:30-05:00
New Revision: 27cc4a8138d819f78bc4fc028e39772bbda84dbd

URL: https://github.com/llvm/llvm-project/commit/27cc4a8138d819f78bc4fc028e39772bbda84dbd
DIFF: https://github.com/llvm/llvm-project/commit/27cc4a8138d819f78bc4fc028e39772bbda84dbd.diff

LOG: [OpenMP][NVPTX] Rewrite CUDA intrinsics with NVVM intrinsics

This patch makes prep for dropping CUDA when compiling `deviceRTLs`.
CUDA intrinsics are replaced by NVVM intrinsics which refers to code in
`__clang_cuda_intrinsics.h`. We don't want to directly include it because in the
near future we're going to switch to OpenMP and by then the header cannot be
used anymore.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D95327

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
index 1e3ba7d664af..09bf45b005c8 100644

--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -16,20 +16,6 @@
 
 #include <cuda.h>
 
-// Forward declaration of CUDA primitives which will be evetually transformed
-// into LLVM intrinsics.
-extern "C" {
-unsigned int __activemask();
-unsigned int __ballot(unsigned);
-// The default argument here is based on NVIDIA's website
-// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
-int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE);
-int __shfl(int val, int src_line, int width = WARPSIZE);
-int __shfl_down(int var, unsigned detla, int width);
-int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
-void __syncwarp(int mask);
-}
-
 DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
 }
@@ -71,10 +57,12 @@ DEVICE double __kmpc_impl_get_wtime() {
 
 // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION >= 9000
-  return __activemask();
+#if CUDA_VERSION < 9020
+  return __nvvm_vote_ballot(1);
 #else
-  return __ballot(1);
+  unsigned int Mask;
+  asm volatile("activemask.b32 %0;" : "=r"(Mask));
+  return Mask;
 #endif
 }
 
@@ -82,19 +70,20 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
 #if CUDA_VERSION >= 9000
-  return __shfl_sync(Mask, Var, SrcLane);
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
 #else
-  return __shfl(Var, SrcLane);
+  return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
 #endif // CUDA_VERSION
 }
 
 DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
+  int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
 #if CUDA_VERSION >= 9000
-  return __shfl_down_sync(Mask, Var, Delta, Width);
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
 #else
-  return __shfl_down(Var, Delta, Width);
+  return __nvvm_shfl_down_i32(Var, Delta, T);
 #endif // CUDA_VERSION
 }
 
@@ -102,7 +91,7 @@ DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #if CUDA_VERSION >= 9000
-  __syncwarp(Mask);
+  __nvvm_bar_warp_sync(Mask);
 #else
   // In Cuda < 9.0 no need to sync threads in warps.
 #endif // CUDA_VERSION