[Openmp-commits] [PATCH] D94731: [libomptarget][nvptx] Call builtins instead of cuda

Jon Chesterfield via Phabricator via Openmp-commits openmp-commits at lists.llvm.org
Thu Jan 14 16:31:22 PST 2021


JonChesterfield created this revision.
JonChesterfield added reviewers: jdoerfert, ABataev, tianshilei1992, grokos, ye-luo.
Herald added subscribers: jfb, yaxunl.
JonChesterfield requested review of this revision.
Herald added subscribers: openmp-commits, sstefan1.
Herald added a project: OpenMP.

[libomptarget][nvptx] Call builtins instead of cuda

This is the tricky part of dropping the dependency on cuda.h. It was
written with reference to clang's cuda header plumbing (of which we need
only a negligible fraction), changing the cuda source until it gave the
same IR output as before.

The remaining parts are device prototypes for malloc, free, printf and
assert, changing to builtin atomics, and using __attribute__((device))
in place of __device__.

The atomics and device attributes may instead be solved by porting to OpenMP.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D94731

Files:
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu


Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -10,9 +10,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "target_impl.h"
 #include "common/debug.h"
 #include "common/target_atomic.h"
+#include "target_impl.h"
 
 #include <cuda.h>
 
@@ -56,23 +56,23 @@
 }
 
 // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
-
 DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
 #if CUDA_VERSION >= 9000
-  return __activemask();
+  uint32_t mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
 #else
-  return __ballot(1);
+  return __nvvm_vote_ballot(1);
 #endif
 }
 
 // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-
 DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane) {
 #if CUDA_VERSION >= 9000
-  return __shfl_sync(Mask, Var, SrcLane);
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, WARPSIZE - 1);
 #else
-  return __shfl(Var, SrcLane);
+  return __nvvm_shfl_idx_i32(Var, SrcLane, WARPSIZE - 1);
 #endif // CUDA_VERSION
 }
 
@@ -80,24 +80,18 @@
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width) {
 #if CUDA_VERSION >= 9000
-  return __shfl_down_sync(Mask, Var, Delta, Width);
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta,
+                                   ((WARPSIZE - Width) << 8) | 0x1f);
 #else
-  return __shfl_down(Var, Delta, Width);
+  return __nvvm_shfl_down_i32(Var, Delta, ((WARPSIZE - Width) << 8) | 0x1f);
 #endif // CUDA_VERSION
 }
 
-DEVICE void __kmpc_impl_syncthreads() {
-  // Use original __syncthreads if compiled by nvcc or clang >= 9.0.
-#if !defined(__clang__) || __clang_major__ >= 9
-  __syncthreads();
-#else
-  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
-#endif // __clang__
-}
+DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
 DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #if CUDA_VERSION >= 9000
-  __syncwarp(Mask);
+  __nvvm_bar_warp_sync(Mask);
 #else
   // In Cuda < 9.0 no need to sync threads in warps.
 #endif // CUDA_VERSION
@@ -145,11 +139,11 @@
 DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock) {
   // TODO: not sure spinning is a good idea here..
   while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
-    clock_t start = clock();
-    clock_t now;
+    int32_t start = __nvvm_read_ptx_sreg_clock();
+    int32_t now;
     for (;;) {
-      now = clock();
-      clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
+      now = __nvvm_read_ptx_sreg_clock();
+      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
       if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
         break;
       }


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D94731.316809.patch
Type: text/x-patch
Size: 3039 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/openmp-commits/attachments/20210115/fbcddab3/attachment-0001.bin>


More information about the Openmp-commits mailing list