[Openmp-commits] [openmp] f1c821f - [OpenMP] Add support for dynamic shared memory in new RTL

Joseph Huber via Openmp-commits openmp-commits at lists.llvm.org
Fri Sep 17 18:26:01 PDT 2021


Author: Joseph Huber
Date: 2021-09-17T21:25:36-04:00
New Revision: f1c821fa850b5168e0cc120f2497ac54d8ad325b

URL: https://github.com/llvm/llvm-project/commit/f1c821fa850b5168e0cc120f2497ac54d8ad325b
DIFF: https://github.com/llvm/llvm-project/commit/f1c821fa850b5168e0cc120f2497ac54d8ad325b.diff

LOG: [OpenMP] Add support for dynamic shared memory in new RTL

This patch adds support for using dynamic shared memory in the new
device runtime. The new function `__kmpc_get_dynamic_shared` will return a
pointer to the buffer of dynamic shared memory. Currently the amount of memory
allocated is set by an environment variable.

In the future this amount will be added to the amount used for the smart stack
which will be configured in a similar way.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D110006

Added: 
    openmp/libomptarget/test/api/omp_dynamic_shared_memory.c

Modified: 
    openmp/docs/design/Runtimes.rst
    openmp/libomptarget/DeviceRTL/include/Configuration.h
    openmp/libomptarget/DeviceRTL/include/Interface.h
    openmp/libomptarget/DeviceRTL/include/State.h
    openmp/libomptarget/DeviceRTL/src/Configuration.cpp
    openmp/libomptarget/DeviceRTL/src/State.cpp
    openmp/libomptarget/plugins/cuda/src/rtl.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index b8b8da843b4f..2d8bd024fca7 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -32,6 +32,7 @@ variables is defined below.
     * ``LIBOMPTARGET_INFO=<Num>``
     * ``LIBOMPTARGET_HEAP_SIZE=<Num>``
     * ``LIBOMPTARGET_STACK_SIZE=<Num>``
+    * ``LIBOMPTARGET_SHARED_MEMORY_SIZE=<Num>``
 
 LIBOMPTARGET_DEBUG
 """"""""""""""""""
@@ -338,6 +339,14 @@ allocated using ``malloc`` and ``free`` for the CUDA plugin. This is necessary
 for some applications that allocate too much memory either through the user or
 globalization.
 
+LIBOMPTARGET_SHARED_MEMORY_SIZE
+"""""""""""""""""""""""""""""""
+
+This environment variable sets the amount of dynamic shared memory in bytes used 
+by the kernel once it is launched. A pointer to the dynamic memory buffer can 
+currently only be accessed using the ``__kmpc_get_dynamic_shared`` device 
+runtime call.
+
 .. toctree::
    :hidden:
    :maxdepth: 1

diff  --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
index 11aa5481c0f3..97e9449c3880 100644
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -31,6 +31,9 @@ uint32_t getDeviceNum();
 /// Return the user choosen debug level.
 uint32_t getDebugLevel();
 
+/// Return the amount of dynamic shared memory that was allocated at launch.
+uint64_t getDynamicMemorySize();
+
 bool isDebugMode(DebugLevel Level);
 
 } // namespace config

diff  --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h
index 9ed396d06af4..1a8471a2352f 100644
--- a/openmp/libomptarget/DeviceRTL/include/Interface.h
+++ b/openmp/libomptarget/DeviceRTL/include/Interface.h
@@ -174,6 +174,10 @@ void *__kmpc_alloc_shared(uint64_t Bytes);
 /// allocated by __kmpc_alloc_shared by the same thread.
 void __kmpc_free_shared(void *Ptr, uint64_t Bytes);
 
+/// Get a pointer to the memory buffer containing dynamically allocated shared
+/// memory configured at launch.
+void *__kmpc_get_dynamic_shared();
+
 /// Allocate sufficient space for \p NumArgs sequential `void*` and store the
 /// allocation address in \p GlobalArgs.
 ///

diff  --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h
index 63e0923d4154..c860bd1b98b8 100644
--- a/openmp/libomptarget/DeviceRTL/include/State.h
+++ b/openmp/libomptarget/DeviceRTL/include/State.h
@@ -188,6 +188,9 @@ void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
 /// Alloca \p Size bytes in global memory, if possible, for \p Reason.
 void *allocGlobal(uint64_t Size, const char *Reason);
 
+/// Return a pointer to the dynamic shared memory buffer.
+void *getDynamicBuffer();
+
 /// Free \p Ptr, alloated via allocGlobal, for \p Reason.
 void freeGlobal(void *Ptr, const char *Reason);
 

diff  --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index 4e485992cde3..dc307071b21f 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -21,6 +21,7 @@ struct DeviceEnvironmentTy {
   uint32_t DebugLevel;
   uint32_t NumDevices;
   uint32_t DeviceNum;
+  uint64_t DynamicMemSize;
 };
 
 #pragma omp declare target
@@ -43,6 +44,10 @@ uint32_t config::getDeviceNum() {
   return omptarget_device_environment.DeviceNum;
 }
 
+uint64_t config::getDynamicMemorySize() {
+  return omptarget_device_environment.DynamicMemSize;
+}
+
 bool config::isDebugMode(config::DebugLevel Level) {
   return config::getDebugLevel() > Level;
 }

diff  --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp
index 2e00a6ecb02f..e3361551e3df 100644
--- a/openmp/libomptarget/DeviceRTL/src/State.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/State.cpp
@@ -25,6 +25,13 @@ using namespace _OMP;
 ///
 ///{
 
+/// Add worst-case padding so that future allocations are properly aligned.
+constexpr const uint32_t Alignment = 8;
+
+/// External symbol to access dynamic shared memory.
+extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
+#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+
 namespace {
 
 /// Fallback implementations are missing to trigger a link time error.
@@ -57,9 +64,6 @@ void free(void *Ptr) {}
 #pragma omp end declare variant
 ///}
 
-/// Add worst-case padding so that future allocations are properly aligned.
-constexpr const uint32_t Alignment = 8;
-
 /// A "smart" stack in shared memory.
 ///
 /// The stack exposes a malloc/free interface but works like a stack internally.
@@ -147,6 +151,8 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint32_t Bytes) {
 
 } // namespace
 
+void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
+
 void *memory::allocShared(uint64_t Bytes, const char *Reason) {
   return SharedMemorySmartStack.push(Bytes);
 }
@@ -497,6 +503,10 @@ __attribute__((noinline)) void __kmpc_free_shared(void *Ptr, uint64_t Bytes) {
   memory::freeShared(Ptr, Bytes, "Frontend free shared");
 }
 
+__attribute__((noinline)) void *__kmpc_get_dynamic_shared() {
+  return memory::getDynamicBuffer();
+}
+
 /// Allocate storage in shared memory to communicate arguments from the main
 /// thread to the workers in generic mode. If we exceed
 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.

diff  --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index f3a810cc4d87..aaff0d3cd2ce 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -103,6 +103,7 @@ struct omptarget_device_environmentTy {
   int32_t debug_level;
   uint32_t num_devices;
   uint32_t device_num;
+  uint64_t dynamic_shared_size;
 };
 
 namespace {
@@ -346,6 +347,8 @@ class DeviceRTLTy {
   int EnvTeamThreadLimit;
   // OpenMP requires flags
   int64_t RequiresFlags;
+  // Amount of dynamic shared memory to use at launch.
+  uint64_t DynamicMemorySize;
 
   static constexpr const int HardTeamLimit = 1U << 16U; // 64k
   static constexpr const int HardThreadLimit = 1024;
@@ -499,7 +502,8 @@ class DeviceRTLTy {
 
   DeviceRTLTy()
       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
-        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
+        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
+        DynamicMemorySize(0) {
 
     DP("Start initializing CUDA\n");
 
@@ -540,6 +544,11 @@ class DeviceRTLTy {
       EnvNumTeams = std::stoi(EnvStr);
       DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
     }
+    if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
+      // LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
+      DynamicMemorySize = std::stoi(EnvStr);
+      DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE", DynamicMemorySize);
+    }
 
     StreamManager =
         std::make_unique<StreamManagerTy>(NumberOfDevices, DeviceData);
@@ -904,7 +913,7 @@ class DeviceRTLTy {
       // TODO: The device ID used here is not the real device ID used by OpenMP.
       omptarget_device_environmentTy DeviceEnv{
           0, static_cast<uint32_t>(NumberOfDevices),
-          static_cast<uint32_t>(DeviceId)};
+          static_cast<uint32_t>(DeviceId), DynamicMemorySize};
 
 #ifdef OMPTARGET_DEBUG
       if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
@@ -1190,7 +1199,7 @@ class DeviceRTLTy {
     Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
                          /* gridDimZ */ 1, CudaThreadsPerBlock,
                          /* blockDimY */ 1, /* blockDimZ */ 1,
-                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+                         DynamicMemorySize, Stream, &Args[0], nullptr);
     if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
       return OFFLOAD_FAIL;
 

diff  --git a/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
new file mode 100644
index 000000000000..9a74ed36c96f
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_dynamic_shared_memory.c
@@ -0,0 +1,31 @@
+// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
+// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=4 \
+// RUN:   %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
+// REQUIRES: nvptx64-nvidia-cuda
+
+#include <omp.h>
+#include <stdio.h>
+
+void *get_dynamic_shared() { return NULL; }
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+extern void *__kmpc_get_dynamic_shared();
+void *get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
+#pragma omp end declare variant
+
+int main() {
+  int x;
+#pragma omp target parallel map(from : x)
+  {
+    int *buf = get_dynamic_shared();
+#pragma omp barrier
+    if (omp_get_thread_num() == 0)
+      *buf = 1;
+#pragma omp barrier
+    if (omp_get_thread_num() == 1)
+      x = *buf;
+  }
+
+  // CHECK: PASS
+  if (x == 1)
+    printf("PASS\n");
+}


        


More information about the Openmp-commits mailing list