[Openmp-commits] [openmp] 32dfbd1 - [libomptarget][nfc] Use cuda variable wrappers from support.h

Thu Nov 14 04:45:22 PST 2019

Author: JonChesterfield
Date: 2019-11-14T12:45:09Z
New Revision: 32dfbd131da87c4ff1c6a417001bbbc003697867

URL: https://github.com/llvm/llvm-project/commit/32dfbd131da87c4ff1c6a417001bbbc003697867
DIFF: https://github.com/llvm/llvm-project/commit/32dfbd131da87c4ff1c6a417001bbbc003697867.diff

LOG: [libomptarget][nfc] Use cuda variable wrappers from support.h

Summary:
[libomptarget][nfc] Use cuda variable wrappers from support.h
Reimplementation of D69693, after the revert of D69885

Use the wrappers in support.h for cuda builtin variables at all call sites.
Localises use of cuda and removes WARPSIZE==32 assumption in debug.h.

Reviewers: ABataev, jdoerfert, grokos

Reviewed By: jdoerfert

Subscribers: openmp-commits

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D70186

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
    openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/support.cu

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 78b04ec5cffe..f2892acb790a 100644

--- a/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -13,11 +13,6 @@
 #include "target_impl.h"
 #include <stdio.h>
 
-// Warp ID in the CUDA block
-INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
-// Lane ID in the CUDA warp.
-INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
-
 // Return true if this is the first active thread in the warp.
 INLINE static bool IsWarpMasterActiveThread() {
   unsigned long long Mask = __kmpc_impl_activemask();
@@ -67,7 +62,7 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
   DSPRINT0(DSFLAG_INIT,
            "Entering __kmpc_initialize_data_sharing_environment\n");
 
-  unsigned WID = getWarpId();
+  unsigned WID = GetWarpId();
   DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
 
   omptarget_nvptx_TeamDescr *teamDescr =
@@ -111,7 +106,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
   DSPRINT(DSFLAG, "Default Data Size %016llx\n",
           (unsigned long long)SharingDefaultDataSize);
 
-  unsigned WID = getWarpId();
+  unsigned WID = GetWarpId();
   __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
 
   __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
@@ -231,7 +226,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
 
   DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
 
-  unsigned WID = getWarpId();
+  unsigned WID = GetWarpId();
 
   if (IsEntryPoint) {
     if (IsWarpMasterActiveThread()) {
@@ -359,7 +354,7 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
   // This function initializes the stack pointer with the pointer to the
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
-  if (threadIdx.x == 0)
+  if (GetThreadIdInBlock() == 0)
     data_sharing_init_stack_common();
 
   __threadfence_block();
@@ -377,7 +372,7 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
   PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
 
   // Frame pointer must be visible to all workers in the same warp.
-  const unsigned WID = getWarpId();
+  const unsigned WID = GetWarpId();
   void *FrameP = 0;
   __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
 
@@ -467,7 +462,7 @@ EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
   // Compute the start address of the frame of each thread in the warp.
   uintptr_t FrameStartAddress =
       (uintptr_t) data_sharing_push_stack_common(PushSize);
-  FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
+  FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
   return (void *)FrameStartAddress;
 }
 
@@ -482,7 +477,7 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
   __threadfence_block();
 
   if (GetThreadIdInBlock() % WARPSIZE == 0) {
-    unsigned WID = getWarpId();
+    unsigned WID = GetWarpId();
 
     // Current slot
     __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
index 1052392155a7..3388b04616f4 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
@@ -128,12 +128,12 @@
 
 #if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
 #include <stdio.h>
-#include "target_impl.h"
+#include "support.h"
 
 template <typename... Arguments>
 NOINLINE static void log(const char *fmt, Arguments... parameters) {
-  printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
-         (int)(threadIdx.x & 0x1F), parameters...);
+  printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
+         (int)GetWarpId(), (int)GetLaneId(), parameters...);
 }
 
 #endif
@@ -144,9 +144,8 @@ template <typename... Arguments>
 NOINLINE static void check(bool cond, const char *fmt,
                            Arguments... parameters) {
   if (!cond)
-    printf(fmt, (int)blockIdx.x, (int)threadIdx.x,
-           (int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F),
-           parameters...);
+    printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
+           (int)GetWarpId(), (int)GetLaneId(), parameters...);
   assert(cond);
 }
 

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index 20a22f425324..e86b1d1fdbf3 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -364,7 +364,7 @@ EXTERN void omp_set_lock(omp_lock_t *lock) {
     for (;;) {
       now = clock();
       clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
-      if (cycles >= __OMP_SPIN * blockIdx.x) {
+      if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
         break;
       }
     }

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
index a7aa8aa814ab..7a022e1680ff 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/support.cu
@@ -106,9 +106,9 @@ DEVICE int GetNumberOfBlocksInKernel() { return gridDim.x; }
 
 DEVICE int GetNumberOfThreadsInBlock() { return blockDim.x; }
 
-DEVICE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
+DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
 
-DEVICE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
+DEVICE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -124,7 +124,7 @@ DEVICE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
 //      If NumThreads is 1024, master id is 992.
 //
 // Called in Generic Execution Mode only.
-DEVICE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
+DEVICE int GetMasterThreadID() { return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); }
 
 // The last warp is reserved for the master; other warps are workers.
 // Called in Generic Execution Mode only.