[Openmp-commits] [openmp] r342737 - [OPENMP][NVPTX] Add support for lastprivates/reductions handling in SPMD constructs with lightweight runtime.
Alexey Bataev via Openmp-commits
openmp-commits at lists.llvm.org
Fri Sep 21 07:11:41 PDT 2018
Author: abataev
Date: Fri Sep 21 07:11:41 2018
New Revision: 342737
URL: http://llvm.org/viewvc/llvm-project?rev=342737&view=rev
Log:
[OPENMP][NVPTX] Add support for lastprivates/reductions handling in SPMD constructs with lightweight runtime.
Summary:
We need the support for per-team shared variables to support codegen for
lastprivates/reductions. Patch adds this support by using shared memory
if the total size of the reductions/lastprivates is <= 128 bytes,
then pre-allocated buffer in global memory if size is <= 4K bytes,or
uses malloc/free, otherwise.
Reviewers: gtbercea, kkwli0, grokos
Subscribers: guansong, openmp-commits
Differential Revision: https://reviews.llvm.org/D51875
Modified:
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu?rev=342737&r1=342736&r2=342737&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu Fri Sep 21 07:11:41 2018
@@ -378,6 +378,12 @@ EXTERN void __kmpc_data_sharing_init_sta
// as long as the size requested fits the pre-allocated size.
EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
int16_t UseSharedMemory) {
+ if (isRuntimeUninitialized()) {
+ ASSERT0(LT_FUSSY, isSPMDMode(),
+ "Expected SPMD mode with uninitialized runtime.");
+ return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize);
+ }
+
// Frame pointer must be visible to all workers in the same warp.
unsigned WID = getWarpId();
void *&FrameP = DataSharingState.FramePtr[WID];
@@ -456,6 +462,12 @@ EXTERN void* __kmpc_data_sharing_push_st
// reclaim all outstanding global memory slots since it is
// likely we have reached the end of the kernel.
EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
+ if (isRuntimeUninitialized()) {
+ ASSERT0(LT_FUSSY, isSPMDMode(),
+ "Expected SPMD mode with uninitialized runtime.");
+ return omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(FrameStart);
+ }
+
if (IsWarpMasterActiveThread()) {
unsigned WID = getWarpId();
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu?rev=342737&r1=342736&r2=342737&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omp_data.cu Fri Sep 21 07:11:41 2018
@@ -38,6 +38,8 @@ __device__ __shared__
__device__ __shared__ omptarget_nvptx_SimpleThreadPrivateContext
*omptarget_nvptx_simpleThreadPrivateContext;
+__device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
+
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu?rev=342737&r1=342736&r2=342737&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu Fri Sep 21 07:11:41 2018
@@ -25,13 +25,23 @@ extern __device__ omptarget_nvptx_Queue<
omptarget_nvptx_SimpleThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_simpleState[MAX_SM];
+extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
+
////////////////////////////////////////////////////////////////////////////////
// init entry points
////////////////////////////////////////////////////////////////////////////////
+INLINE unsigned nsmid() {
+ unsigned n;
+ asm("mov.u32 %0, %%nsmid;" : "=r"(n));
+ return n;
+}
+
INLINE unsigned smid() {
unsigned id;
asm("mov.u32 %0, %%smid;" : "=r"(id));
+ ASSERT0(LT_FUSSY, nsmid() <= MAX_SM,
+ "Expected number of SMs is less than reported.");
return id;
}
@@ -108,6 +118,10 @@ EXTERN void __kmpc_spmd_kernel_init(int
int slot = smid() % MAX_SM;
omptarget_nvptx_simpleThreadPrivateContext =
omptarget_nvptx_device_simpleState[slot].Dequeue();
+ // Reuse the memory allocated for the full runtime as the preallocated
+ // global memory buffer for the lightweight runtime.
+ omptarget_nvptx_simpleGlobalData =
+ omptarget_nvptx_device_State[slot].Dequeue();
}
__syncthreads();
omptarget_nvptx_simpleThreadPrivateContext->Init();
@@ -177,6 +191,10 @@ EXTERN void __kmpc_spmd_kernel_deinit()
int slot = smid() % MAX_SM;
omptarget_nvptx_device_simpleState[slot].Enqueue(
omptarget_nvptx_simpleThreadPrivateContext);
+ // Enqueue global memory back.
+ omptarget_nvptx_device_State[slot].Enqueue(
+ reinterpret_cast<omptarget_nvptx_ThreadPrivateContext *>(
+ omptarget_nvptx_simpleGlobalData));
}
return;
}
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h?rev=342737&r1=342736&r2=342737&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h Fri Sep 21 07:11:41 2018
@@ -113,6 +113,8 @@ enum DATA_SHARING_SIZES {
DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
// The maximum number of warps in use
DS_Max_Warp_Number = 32,
+ // The size of the preallocated shared memory buffer per team
+ DS_Shared_Memory_Size = 128,
};
// Data structure to keep in shared memory that traces the current slot, stack,
@@ -386,12 +388,15 @@ struct omptarget_device_environmentTy {
class omptarget_nvptx_SimpleThreadPrivateContext {
uint16_t par_level[MAX_THREADS_PER_TEAM];
+
public:
INLINE void Init() {
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
"Expected SPMD + uninitialized runtime modes.");
par_level[GetThreadIdInBlock()] = 0;
}
+ static INLINE void *Allocate(size_t DataSize);
+ static INLINE void Deallocate(void *Ptr);
INLINE void IncParLevel() {
ASSERT0(LT_FUSSY, isSPMDMode() && isRuntimeUninitialized(),
"Expected SPMD + uninitialized runtime modes.");
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h?rev=342737&r1=342736&r2=342737&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h Fri Sep 21 07:11:41 2018
@@ -202,3 +202,36 @@ INLINE omptarget_nvptx_TaskDescr *getMyT
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}
+
+////////////////////////////////////////////////////////////////////////////////
+// Lightweight runtime functions.
+////////////////////////////////////////////////////////////////////////////////
+
+// Shared memory buffer for globalization support.
+static __align__(16) __device__ __shared__ char
+ omptarget_static_buffer[DS_Shared_Memory_Size];
+static __device__ __shared__ void *omptarget_spmd_allocated;
+
+extern __device__ __shared__ void *omptarget_nvptx_simpleGlobalData;
+
+INLINE void *
+omptarget_nvptx_SimpleThreadPrivateContext::Allocate(size_t DataSize) {
+ if (DataSize <= DS_Shared_Memory_Size)
+ return ::omptarget_static_buffer;
+ if (DataSize <= sizeof(omptarget_nvptx_ThreadPrivateContext))
+ return ::omptarget_nvptx_simpleGlobalData;
+ if (threadIdx.x == 0)
+ omptarget_spmd_allocated = SafeMalloc(DataSize, "SPMD teams alloc");
+ __syncthreads();
+ return omptarget_spmd_allocated;
+}
+
+INLINE void
+omptarget_nvptx_SimpleThreadPrivateContext::Deallocate(void *Ptr) {
+ if (Ptr != ::omptarget_static_buffer &&
+ Ptr != ::omptarget_nvptx_simpleGlobalData) {
+ __syncthreads();
+ if (threadIdx.x == 0)
+ SafeFree(Ptr, "SPMD teams dealloc");
+ }
+}
Modified: openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h?rev=342737&r1=342736&r2=342737&view=diff
==============================================================================
--- openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h (original)
+++ openmp/trunk/libomptarget/deviceRTLs/nvptx/src/option.h Fri Sep 21 07:11:41 2018
@@ -34,7 +34,10 @@
// Maximum number of omp state objects per SM allocated statically in global
// memory.
-#if __CUDA_ARCH__ >= 600
+#if __CUDA_ARCH__ >= 700
+#define OMP_STATE_COUNT 32
+#define MAX_SM 84
+#elif __CUDA_ARCH__ >= 600
#define OMP_STATE_COUNT 32
#define MAX_SM 56
#else
More information about the Openmp-commits
mailing list