[Openmp-commits] [openmp] r345976 - [OPENMP][NVPTX]Fixed/improved support for globalization in team contexts.

Alexey Bataev via Openmp-commits openmp-commits at lists.llvm.org
Fri Nov 2 07:43:23 PDT 2018


Author: abataev
Date: Fri Nov  2 07:43:23 2018
New Revision: 345976

URL: http://llvm.org/viewvc/llvm-project?rev=345976&view=rev
Log:
[OPENMP][NVPTX]Fixed/improved support for globalization in team contexts.

Summary:
Current globalization scheme works correctly only for SPMD+lightweight
runtime mode and does not work for full runtime. Patch improves support
for the globalization scheme + reduces global memory consumption in
  lightweight runtime mode.
Patch adds runtime functions to work with the statically allocated
global memory. It allows to improve performance and memory consumption.
This global memory must be allocated by the compiler.

Reviewers: grokos, kkwli0, gtbercea, caomhin

Subscribers: guansong, jfb, openmp-commits

Differential Revision: https://reviews.llvm.org/D53943

Modified:
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
    openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu?rev=345976&r1=345975&r2=345976&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu Fri Nov  2 07:43:23 2018
@@ -370,11 +370,7 @@ EXTERN void __kmpc_data_sharing_init_sta
 }
 
 INLINE void* data_sharing_push_stack_common(size_t PushSize) {
-  if (isRuntimeUninitialized()) {
-    ASSERT0(LT_FUSSY, isSPMDMode(),
-            "Expected SPMD mode with uninitialized runtime.");
-    return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize);
-  }
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
 
   // Only warp active master threads manage the stack.
   bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0;
@@ -480,11 +476,7 @@ EXTERN void* __kmpc_data_sharing_push_st
 // reclaim all outstanding global memory slots since it is
 // likely we have reached the end of the kernel.
 EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
-  if (isRuntimeUninitialized()) {
-    ASSERT0(LT_FUSSY, isSPMDMode(),
-            "Expected SPMD mode with uninitialized runtime.");
-    return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
-  }
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
 
   __threadfence_block();
 
@@ -544,3 +536,44 @@ EXTERN void __kmpc_end_sharing_variables
 EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
   *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
 }
+
+// This function is used to init static memory manager. This manager is used to
+// manage statically allocated global memory. This memory is allocated by the
+// compiler and used to correctly implement globalization of the variables in
+// target, teams and distribute regions.
+EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+                                          int16_t is_shared,
+                                          const void **frame) {
+  if (is_shared) {
+    *frame = buf;
+    return;
+  }
+  if (isSPMDMode()) {
+    if (GetThreadIdInBlock() == 0) {
+      *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+    }
+    __syncthreads();
+    return;
+  }
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+          "Must be called only in the target master thread.");
+  *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
+  __threadfence();
+}
+
+EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) {
+  if (is_shared)
+    return;
+  if (isSPMDMode()) {
+    __syncthreads();
+    if (GetThreadIdInBlock() == 0) {
+      omptarget_nvptx_simpleMemoryManager.Release();
+    }
+    return;
+  }
+  __threadfence();
+  ASSERT0(LT_FUSSY, GetThreadIdInBlock() == getMasterThreadId(),
+          "Must be called only in the target master thread.");
+  omptarget_nvptx_simpleMemoryManager.Release();
+}
+

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h?rev=345976&r1=345975&r2=345976&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/interface.h Fri Nov  2 07:43:23 2018
@@ -514,4 +514,10 @@ __kmpc_get_data_sharing_environment_fram
 
 // SPMD execution mode interrogation function.
 EXTERN int8_t __kmpc_is_spmd_exec_mode();
+
+EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+                                          int16_t is_shared, const void **res);
+
+EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared);
+
 #endif

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu?rev=345976&r1=345975&r2=345976&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu Fri Nov  2 07:43:23 2018
@@ -31,6 +31,11 @@ __device__ omptarget_nvptx_Queue<omptarg
                                  OMP_STATE_COUNT>
     omptarget_nvptx_device_simpleState[MAX_SM];
 
+__device__ omptarget_nvptx_SimpleMemoryManager
+    omptarget_nvptx_simpleMemoryManager;
+__device__ __shared__ uint32_t usedMemIdx;
+__device__ __shared__ uint32_t usedSlotIdx;
+
 // Pointer to this team's OpenMP state object
 __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
@@ -38,8 +43,6 @@ __device__ __shared__
 __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
     *omptarget_nvptx_simpleThreadPrivateContext;
 
-__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
 ////////////////////////////////////////////////////////////////////////////////
 // The team master sets the outlined parallel function in this variable to
 // communicate with the workers.  Since it is in shared memory, there is one

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=345976&r1=345975&r2=345976&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Fri Nov  2 07:43:23 2018
@@ -25,18 +25,10 @@ extern __device__ omptarget_nvptx_Queue<
     omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
     omptarget_nvptx_device_simpleState[MAX_SM];
 
-extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
 ////////////////////////////////////////////////////////////////////////////////
 // init entry points
 ////////////////////////////////////////////////////////////////////////////////
 
-INLINE unsigned nsmid() {
-  unsigned n;
-  asm("mov.u32 %0, %%nsmid;" : "=r"(n));
-  return n;
-}
-
 INLINE unsigned smid() {
   unsigned id;
   asm("mov.u32 %0, %%smid;" : "=r"(id));
@@ -64,11 +56,9 @@ EXTERN void __kmpc_kernel_init(int Threa
 
   // Get a state object from the queue.
   int slot = smid() % MAX_SM;
+  usedSlotIdx = slot;
   omptarget_nvptx_threadPrivateContext =
       omptarget_nvptx_device_State[slot].Dequeue();
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot);
-#endif
 
   // init thread private
   int threadId = GetLogicalThreadIdInBlock();
@@ -94,11 +84,7 @@ EXTERN void __kmpc_kernel_deinit(int16_t
   ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized,
           "Generic always requires initialized runtime.");
   // Enqueue omp state object for use by another team.
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
-#else
-  int slot = smid() % MAX_SM;
-#endif
+  int slot = usedSlotIdx;
   omptarget_nvptx_device_State[slot].Enqueue(
       omptarget_nvptx_threadPrivateContext);
   // Done with work.  Kill the workers.
@@ -114,12 +100,9 @@ EXTERN void __kmpc_spmd_kernel_init(int
     setExecutionParameters(Spmd, RuntimeUninitialized);
     if (GetThreadIdInBlock() == 0) {
       int slot = smid() % MAX_SM;
+      usedSlotIdx = slot;
       omptarget_nvptx_simpleThreadPrivateContext =
           omptarget_nvptx_device_simpleState[slot].Dequeue();
-      // Reuse the memory allocated for the full runtime as the preallocated
-      // global memory buffer for the lightweight runtime.
-      omptarget_nvptx_simpleGlobalData =
-          omptarget_nvptx_device_State[slot].Dequeue();
     }
     __syncthreads();
     omptarget_nvptx_simpleThreadPrivateContext->Init();
@@ -136,6 +119,7 @@ EXTERN void __kmpc_spmd_kernel_init(int
   if (threadId == 0) {
     // Get a state object from the queue.
     int slot = smid() % MAX_SM;
+    usedSlotIdx = slot;
     omptarget_nvptx_threadPrivateContext =
         omptarget_nvptx_device_State[slot].Dequeue();
 
@@ -186,19 +170,15 @@ EXTERN void __kmpc_spmd_kernel_deinit()
   if (isRuntimeUninitialized()) {
     if (threadId == 0) {
       // Enqueue omp state object for use by another team.
-      int slot = smid() % MAX_SM;
+      int slot = usedSlotIdx;
       omptarget_nvptx_device_simpleState[slot].Enqueue(
           omptarget_nvptx_simpleThreadPrivateContext);
-      // Enqueue global memory back.
-      omptarget_nvptx_device_State[slot].Enqueue(
-          reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
-              omptarget_nvptx_simpleGlobalData));
     }
     return;
   }
   if (threadId == 0) {
     // Enqueue omp state object for use by another team.
-    int slot = smid() % MAX_SM;
+    int slot = usedSlotIdx;
     omptarget_nvptx_device_State[slot].Enqueue(
         omptarget_nvptx_threadPrivateContext);
   }

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=345976&r1=345975&r2=345976&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Fri Nov  2 07:43:23 2018
@@ -344,8 +344,6 @@ public:
   INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
 
   INLINE void InitThreadPrivateContext(int tid);
-  INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
-  INLINE uint64_t GetSourceQueue() { return SourceQueue; }
 
 private:
   // team context for this team
@@ -368,8 +366,6 @@ private:
   // state for dispatch with dyn/guided OR static (never use both at a time)
   int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
   int64_t stride[MAX_THREADS_PER_TEAM];
-  // Queue to which this object must be returned.
-  uint64_t SourceQueue;
 };
 
 /// Device envrionment data
@@ -377,6 +373,22 @@ struct omptarget_device_environmentTy {
   int32_t debug_level;
 };
 
+/// Memory manager for statically allocated memory.
+class omptarget_nvptx_SimpleMemoryManager {
+private:
+  __align__(128) struct MemDataTy {
+    volatile unsigned keys[OMP_STATE_COUNT];
+  } MemData[MAX_SM];
+
+  INLINE uint32_t hash(unsigned key) const {
+    return key & (OMP_STATE_COUNT - 1);
+  }
+
+public:
+  INLINE void Release();
+  INLINE const void *Acquire(const void *buf, size_t size);
+};
+
 class omptarget_nvptx_SimpleThreadPrivateContext {
   uint16_t par_level[MAX_THREADS_PER_TEAM];
 
@@ -386,8 +398,6 @@ public:
             "Expected SPMD + uninitialized runtime modes.");
     par_level[GetThreadIdInBlock()] = 0;
   }
-  static INLINE void *Allocate(size_t DataSize);
-  static INLINE void Deallocate(void *Ptr);
   INLINE void IncParLevel() {
     ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
             "Expected SPMD + uninitialized runtime modes.");
@@ -424,6 +434,10 @@ extern __device__ omptarget_device_envir
 // global data tables
 ////////////////////////////////////////////////////////////////////////////////
 
+extern __device__ omptarget_nvptx_SimpleMemoryManager
+    omptarget_nvptx_simpleMemoryManager;
+extern __device__ __shared__ uint32_t usedMemIdx;
+extern __device__ __shared__ uint32_t usedSlotIdx;
 extern __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
 extern __device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext

Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h?rev=345976&r1=345975&r2=345976&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h Fri Nov  2 07:43:23 2018
@@ -204,34 +204,29 @@ INLINE omptarget_nvptx_TaskDescr *getMyT
 }
 
 ////////////////////////////////////////////////////////////////////////////////
-// Lightweight runtime functions.
+// Memory management runtime functions.
 ////////////////////////////////////////////////////////////////////////////////
 
-// Shared memory buffer for globalization support.
-static __align__(16) __device__ __shared__ char
-    omptarget_static_buffer[DS_Shared_Memory_Size];
-static __device__ __shared__ void *omptarget_spmd_allocated;
-
-extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
-
-INLINE void *
-omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
-  if (DataSize <= DS_Shared_Memory_Size)
-    return ::omptarget_static_buffer;
-  if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
-    return ::omptarget_nvptx_simpleGlobalData;
-  if (threadIdx.x == 0)
-    omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
-  __syncthreads();
-  return omptarget_spmd_allocated;
+INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
+          "MemIdx is too big or uninitialized.");
+  MemDataTy &MD = MemData[usedSlotIdx];
+  atomicExch((unsigned *)&MD.keys[usedMemIdx], 0);
 }
 
-INLINE void
-omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
-  if (Ptr != ::omptarget_static_buffer &&
-      Ptr != ::omptarget_nvptx_simpleGlobalData) {
-    __syncthreads();
-    if (threadIdx.x == 0)
-      SafeFree(Ptr, "SPMD teams dealloc");
+INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
+                                                                size_t size) {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  const unsigned sm = usedSlotIdx;
+  MemDataTy &MD = MemData[sm];
+  unsigned i = hash(GetBlockIdInKernel());
+  while (atomicCAS((unsigned *)&MD.keys[i], 0, 1) != 0) {
+    i = hash(i + 1);
   }
+  usedSlotIdx = sm;
+  usedMemIdx = i;
+  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
 }




More information about the Openmp-commits mailing list