[Openmp-commits] [openmp] 1ab1f04 - [OpenMP] Simplify variable sharing and increase shared memory size

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Sun Jul 11 17:18:35 PDT 2021


Author: Johannes Doerfert
Date: 2021-07-11T19:18:03-05:00
New Revision: 1ab1f04a2be34bea2fb34df0f5ff0bd75bdc7aa0

URL: https://github.com/llvm/llvm-project/commit/1ab1f04a2be34bea2fb34df0f5ff0bd75bdc7aa0
DIFF: https://github.com/llvm/llvm-project/commit/1ab1f04a2be34bea2fb34df0f5ff0bd75bdc7aa0.diff

LOG: [OpenMP] Simplify variable sharing and increase shared memory size

In order to avoid malloc/free, up to NUM_SHARED_VARIABLES_IN_SHARED_MEM
(=64) variables are communicated in dedicated shared memory instead. This
simplification removes the need for an "init" step and requires a "deinit"
step only if we ever communicate more than
NUM_SHARED_VARIABLES_IN_SHARED_MEM variables.

Differential Revision: https://reviews.llvm.org/D105767

Added: 
    

Modified: 
    openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
    openmp/libomptarget/deviceRTLs/common/omptarget.h
    openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
    openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
    openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
    openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
index 6af40a7d507c..615335df5488 100644
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -40,10 +40,6 @@
 
 #define WARPSIZE 64
 
-// Maximum number of preallocated arguments to an outlined parallel/simd
-// function. Anything more requires dynamic memory allocation.
-#define MAX_SHARED_ARGS 20
-
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
 #define OMP_STATE_COUNT 32

diff  --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h
index 47d2848ec7b0..d8ea6a396697 100644
--- a/openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -35,46 +35,6 @@
 #define BARRIER_COUNTER 0
 #define ORDERED_COUNTER 1
 
-// arguments needed for L0 parallelism only.
-class omptarget_nvptx_SharedArgs {
-public:
-  // All these methods must be called by the master thread only.
-  INLINE void Init() {
-    args = buffer;
-    nArgs = MAX_SHARED_ARGS;
-  }
-  INLINE void DeInit() {
-    // Free any memory allocated for outlined parallel function with a large
-    // number of arguments.
-    if (nArgs > MAX_SHARED_ARGS) {
-      SafeFree(args, "new extended args");
-      Init();
-    }
-  }
-  INLINE void EnsureSize(size_t size) {
-    if (size > nArgs) {
-      if (nArgs > MAX_SHARED_ARGS) {
-        SafeFree(args, "new extended args");
-      }
-      args = (void **)SafeMalloc(size * sizeof(void *), "new extended args");
-      nArgs = size;
-    }
-  }
-  // Called by all threads.
-  INLINE void **GetArgs() const { return args; };
-
-private:
-  // buffer of pre-allocated arguments.
-  void *buffer[MAX_SHARED_ARGS];
-  // pointer to arguments buffer.
-  // starts off as a pointer to 'buffer' but can be dynamically allocated.
-  void **args;
-  // starts off as MAX_SHARED_ARGS but can increase in size.
-  uint32_t nArgs;
-};
-
-extern omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);
-
 // Worker slot type which is initialized with the default worker slot
 // size of 4*32 bytes.
 struct __kmpc_data_sharing_slot {

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
index 3b95ca88aad2..65db27b63bdf 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
@@ -135,14 +135,32 @@ EXTERN void __kmpc_data_sharing_init_stack() {
   }
 }
 
+/// Allocate storage in shared memory to communicate arguments from the main
+/// thread to the workers in generic mode. If we exceed
+/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
+#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64
+
+[[clang::loader_uninitialized]] static void
+    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
+#pragma omp allocate(SharedMemVariableSharingSpace)                            \
+    allocator(omp_pteam_mem_alloc)
+[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
+#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
+    allocator(omp_pteam_mem_alloc)
+
 // Begin a data sharing context. Maintain a list of references to shared
 // variables. This list of references to shared variables will be passed
 // to one or more threads.
 // In L0 data sharing this is called by master thread.
 // In L1 data sharing this is called by active warp master thread.
 EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
-  omptarget_nvptx_globalArgs.EnsureSize(nArgs);
-  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
+    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
+  } else {
+    SharedMemVariableSharingSpacePtr =
+        (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
+  }
+  *GlobalArgs = SharedMemVariableSharingSpacePtr;
 }
 
 // End a data sharing context. There is no need to have a list of refs
@@ -152,7 +170,8 @@ EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
 // In L0 data sharing this is called by master thread.
 // In L1 data sharing this is called by active warp master thread.
 EXTERN void __kmpc_end_sharing_variables() {
-  omptarget_nvptx_globalArgs.DeInit();
+  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
+    SafeFree(SharedMemVariableSharingSpacePtr, "new extended args");
 }
 
 // This function will return a list of references to global variables. This
@@ -161,7 +180,7 @@ EXTERN void __kmpc_end_sharing_variables() {
 // preserving the order.
 // Called by all workers.
 EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
-  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+  *GlobalArgs = SharedMemVariableSharingSpacePtr;
 }
 
 // This function is used to init static memory manager. This manager is used to

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
index d46571f650f4..876ec7bb3a43 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -62,9 +62,4 @@ uint32_t SHARED(execution_param);
 ////////////////////////////////////////////////////////////////////////////////
 void *SHARED(ReductionScratchpadPtr);
 
-////////////////////////////////////////////////////////////////////////////////
-// Data sharing related variables.
-////////////////////////////////////////////////////////////////////////////////
-omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);
-
 #pragma omp end declare target

diff  --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
index 34af243fab54..2a5cc312376a 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -68,8 +68,6 @@ static void __kmpc_generic_kernel_init() {
   nThreads = GetNumberOfWorkersInTeam();
   threadLimit = nThreads;
 
-  omptarget_nvptx_globalArgs.Init();
-
   __kmpc_data_sharing_init_stack();
   __kmpc_impl_target_init();
 }

diff  --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
index d0d7127aac7d..9e69f6016ea5 100644
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -33,10 +33,6 @@
 
 #define WARPSIZE 32
 
-// Maximum number of preallocated arguments to an outlined parallel/simd
-// function. Anything more requires dynamic memory allocation.
-#define MAX_SHARED_ARGS 20
-
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
 #if __CUDA_ARCH__ >= 600


        


More information about the Openmp-commits mailing list