[Openmp-commits] [openmp] 1660288 - [OpenMP][CUDA] Use one event pool per device

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Mon Mar 7 21:43:32 PST 2022


Author: Johannes Doerfert
Date: 2022-03-07T23:43:05-06:00
New Revision: 1660288b281789c7621cdbe9f5695f3589106ff4

URL: https://github.com/llvm/llvm-project/commit/1660288b281789c7621cdbe9f5695f3589106ff4
DIFF: https://github.com/llvm/llvm-project/commit/1660288b281789c7621cdbe9f5695f3589106ff4.diff

LOG: [OpenMP][CUDA] Use one event pool per device

An event pool, similar to the stream pool, needs to be kept per device.
For one, events are associated with CUDA contexts, which means we cannot
destroy the former after the latter. Also, the CUDA documentation states
that streams and events need to be associated with the same context,
which we did not ensure at all.

Differential Revision: https://reviews.llvm.org/D120142
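
As a hedged illustration of the constraint described in the log above, the
sketch below keeps one event pool per device using the CUDA driver API. The
type and function names (PerDeviceEventPool, grow, clear) are invented for
this example and are not the plugin's actual classes:

    // Each device owns its events: they are created with that device's
    // context current and destroyed before the context itself goes away.
    #include <cstddef>
    #include <cuda.h>
    #include <vector>

    struct PerDeviceEventPool {
      CUcontext Context = nullptr;
      std::vector<CUevent> Events;

      // Create N events under this device's context, mirroring the
      // requirement that streams and events share a context.
      bool grow(std::size_t N) {
        if (cuCtxSetCurrent(Context) != CUDA_SUCCESS)
          return false;
        for (std::size_t I = 0; I < N; ++I) {
          CUevent E;
          if (cuEventCreate(&E, CU_EVENT_DEFAULT) != CUDA_SUCCESS)
            return false;
          Events.push_back(E);
        }
        return true;
      }

      // Destroy all events *before* the owning context is destroyed.
      void clear() {
        cuCtxSetCurrent(Context);
        for (CUevent E : Events)
          cuEventDestroy(E);
        Events.clear();
      }
    };

    int main() {
      cuInit(0);
      CUdevice Dev;
      cuDeviceGet(&Dev, 0);

      PerDeviceEventPool Pool;
      cuCtxCreate(&Pool.Context, 0, Dev);
      Pool.grow(8);                // akin to NumInitialEvents below
      Pool.clear();                // drain the pool first ...
      cuCtxDestroy(Pool.Context);  // ... then tear down the context
      return 0;
    }

With one such pool per device, the teardown order (events before context) is
enforced locally instead of relying on a single shared pool outliving every
device context.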

Added: 
    

Modified: 
    openmp/libomptarget/plugins/cuda/src/rtl.cpp

Removed: 
    


################################################################################
diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index cb511240ac2c6..bbffc478f66d8 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -144,6 +144,8 @@ int syncEvent(void *EventPtr) {
   return OFFLOAD_SUCCESS;
 }
 
+namespace {
+
 // Structure contains per-device data
 struct DeviceDataTy {
   /// List that contains all the kernels.
@@ -164,23 +166,27 @@ struct DeviceDataTy {
 /// Resource allocator where \p T is the resource type.
 /// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
 /// accordingly. The implementation should not raise any exception.
-template <typename T> class AllocatorTy {
-public:
+template <typename T> struct AllocatorTy {
+  AllocatorTy(CUcontext C) noexcept : Context(C) {}
+  using ElementTy = T;
+
+  virtual ~AllocatorTy() {}
+
   /// Create a resource and assign to R.
-  int create(T &R) noexcept;
+  virtual int create(T &R) noexcept = 0;
   /// Destroy the resource.
-  int destroy(T) noexcept;
-};
+  virtual int destroy(T) noexcept = 0;
 
-/// Allocator for CUstream.
-template <> class AllocatorTy<CUstream> {
+protected:
   CUcontext Context;
+};
 
-public:
-  AllocatorTy(CUcontext C) noexcept : Context(C) {}
+/// Allocator for CUstream.
+struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
+  StreamAllocatorTy(CUcontext C) noexcept : AllocatorTy<CUstream>(C) {}
 
   /// See AllocatorTy<T>::create.
-  int create(CUstream &Stream) noexcept {
+  int create(CUstream &Stream) noexcept override {
     if (!checkResult(cuCtxSetCurrent(Context),
                      "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;
@@ -193,7 +199,7 @@ template <> class AllocatorTy<CUstream> {
   }
 
   /// See AllocatorTy<T>::destroy.
-  int destroy(CUstream Stream) noexcept {
+  int destroy(CUstream Stream) noexcept override {
     if (!checkResult(cuCtxSetCurrent(Context),
                      "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;
@@ -206,10 +212,11 @@ template <> class AllocatorTy<CUstream> {
 };
 
 /// Allocator for CUevent.
-template <> class AllocatorTy<CUevent> {
-public:
+struct EventAllocatorTy final : public AllocatorTy<CUevent> {
+  EventAllocatorTy(CUcontext C) noexcept : AllocatorTy<CUevent>(C) {}
+
   /// See AllocatorTy<T>::create.
-  int create(CUevent &Event) noexcept {
+  int create(CUevent &Event) noexcept override {
     if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
                      "Error returned from cuEventCreate\n"))
       return OFFLOAD_FAIL;
@@ -218,7 +225,7 @@ template <> class AllocatorTy<CUevent> {
   }
 
   /// See AllocatorTy<T>::destroy.
-  int destroy(CUevent Event) noexcept {
+  int destroy(CUevent Event) noexcept override {
     if (!checkResult(cuEventDestroy(Event),
                      "Error returned from cuEventDestroy\n"))
       return OFFLOAD_FAIL;
@@ -229,15 +236,16 @@ template <> class AllocatorTy<CUevent> {
 
 /// A generic pool of resources where \p T is the resource type.
 /// \p T should be copyable as the object is stored in \p std::vector .
-template <typename T> class ResourcePoolTy {
+template <typename AllocTy> class ResourcePoolTy {
+  using ElementTy = typename AllocTy::ElementTy;
   /// Index of the next available resource.
   size_t Next = 0;
   /// Mutex to guard the pool.
   std::mutex Mutex;
   /// Pool of resources.
-  std::vector<T> Resources;
+  std::vector<ElementTy> Resources;
   /// A reference to the corresponding allocator.
-  AllocatorTy<T> Allocator;
+  AllocTy Allocator;
 
   /// If `Resources` is used up, we will fill in more resources. It assumes that
   /// the new size `Size` should be always larger than the current size.
@@ -246,7 +254,7 @@ template <typename T> class ResourcePoolTy {
     assert(Size > CurSize && "Unexpected smaller size");
     Resources.reserve(Size);
     for (auto I = CurSize; I < Size; ++I) {
-      T NewItem;
+      ElementTy NewItem;
       int Ret = Allocator.create(NewItem);
       if (Ret != OFFLOAD_SUCCESS)
         return false;
@@ -256,7 +264,7 @@ template <typename T> class ResourcePoolTy {
   }
 
 public:
-  ResourcePoolTy(AllocatorTy<T> &&A, size_t Size = 0) noexcept
+  ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
       : Allocator(std::move(A)) {
     if (Size)
       (void)resize(Size);
@@ -275,7 +283,7 @@ template <typename T> class ResourcePoolTy {
   /// xxxxxs+++++++++
   ///       ^
   ///       Next
-  int acquire(T &R) noexcept {
+  int acquire(ElementTy &R) noexcept {
     std::lock_guard<std::mutex> LG(Mutex);
     if (Next == Resources.size()) {
       auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
@@ -302,7 +310,7 @@ template <typename T> class ResourcePoolTy {
   /// `Next`. The left one will in the end be overwritten by another resource.
   /// Therefore, after several execution, the order of pool might be different
   /// from its initial state.
-  void release(T R) noexcept {
+  void release(ElementTy R) noexcept {
     std::lock_guard<std::mutex> LG(Mutex);
     Resources[--Next] = R;
   }
@@ -316,6 +324,8 @@ template <typename T> class ResourcePoolTy {
   }
 };
 
+} // namespace
+
 class DeviceRTLTy {
   int NumberOfDevices;
   // OpenMP environment properties
@@ -326,17 +336,22 @@ class DeviceRTLTy {
   int64_t RequiresFlags;
   // Amount of dynamic shared memory to use at launch.
   uint64_t DynamicMemorySize;
-  // Number of initial streams for each device.
+
+  /// Number of initial streams for each device.
   int NumInitialStreams = 32;
 
+  /// Number of initial events for each device.
+  int NumInitialEvents = 8;
+
   static constexpr const int32_t HardThreadLimit = 1024;
   static constexpr const int32_t DefaultNumTeams = 128;
   static constexpr const int32_t DefaultNumThreads = 128;
 
-  using StreamPoolTy = ResourcePoolTy<CUstream>;
+  using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
   std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
 
-  ResourcePoolTy<CUevent> EventPool;
+  using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
+  std::vector<std::unique_ptr<EventPoolTy>> EventPool;
 
   std::vector<DeviceDataTy> DeviceData;
   std::vector<CUmodule> Modules;
@@ -494,7 +509,7 @@ class DeviceRTLTy {
   DeviceRTLTy()
       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
         EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
-        DynamicMemorySize(0), EventPool(AllocatorTy<CUevent>()) {
+        DynamicMemorySize(0) {
 
     DP("Start initializing CUDA\n");
 
@@ -519,6 +534,7 @@ class DeviceRTLTy {
 
     DeviceData.resize(NumberOfDevices);
     StreamPool.resize(NumberOfDevices);
+    EventPool.resize(NumberOfDevices);
 
     // Get environment variables regarding teams
     if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
@@ -622,11 +638,15 @@ class DeviceRTLTy {
     if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
       return OFFLOAD_FAIL;
 
-    // Initialize stream pool
+    // Initialize the stream pool.
     if (!StreamPool[DeviceId])
       StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(
-          AllocatorTy<CUstream>(DeviceData[DeviceId].Context),
-          NumInitialStreams);
+          StreamAllocatorTy(DeviceData[DeviceId].Context), NumInitialStreams);
+
+    // Initialize the event pool.
+    if (!EventPool[DeviceId])
+      EventPool[DeviceId] = std::make_unique<EventPoolTy>(
+          EventAllocatorTy(DeviceData[DeviceId].Context), NumInitialEvents);
 
     // Query attributes to determine number of threads/block and blocks/grid.
     int MaxGridDimX;
@@ -761,12 +781,7 @@ class DeviceRTLTy {
       checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n");
 
     StreamPool[DeviceId].reset();
-
-    // The event pool is shared, we initialize it once all devices have been
-    // deinitialized.
-    if (std::none_of(InitializedFlags.begin(), InitializedFlags.end(),
-                     [](bool IsInitialized) { return IsInitialized; }))
-      EventPool.clear();
+    EventPool[DeviceId].reset();
 
     // Destroy context
     DeviceDataTy &D = DeviceData[DeviceId];
@@ -1412,16 +1427,16 @@ class DeviceRTLTy {
     printf("    Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
   }
 
-  int createEvent(void **P) {
+  int createEvent(int DeviceId, void **P) {
     CUevent Event = nullptr;
-    if (EventPool.acquire(Event) != OFFLOAD_SUCCESS)
+    if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
       return OFFLOAD_FAIL;
     *P = Event;
     return OFFLOAD_SUCCESS;
   }
 
-  int destroyEvent(void *EventPtr) {
-    EventPool.release(reinterpret_cast<CUevent>(EventPtr));
+  int destroyEvent(int DeviceId, void *EventPtr) {
+    EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
     return OFFLOAD_SUCCESS;
   }
 
@@ -1695,7 +1710,7 @@ void __tgt_rtl_print_device_info(int32_t device_id) {
 
 int32_t __tgt_rtl_create_event(int32_t device_id, void **event) {
   assert(event && "event is nullptr");
-  return DeviceRTL.createEvent(event);
+  return DeviceRTL.createEvent(device_id, event);
 }
 
 int32_t __tgt_rtl_record_event(int32_t device_id, void *event_ptr,
@@ -1725,7 +1740,7 @@ int32_t __tgt_rtl_sync_event(int32_t device_id, void *event_ptr) {
 int32_t __tgt_rtl_destroy_event(int32_t device_id, void *event_ptr) {
   assert(event_ptr && "event is nullptr");
 
-  return DeviceRTL.destroyEvent(event_ptr);
+  return DeviceRTL.destroyEvent(device_id, event_ptr);
 }
 
 int32_t __tgt_rtl_release_async_info(int32_t device_id,
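
For reference, a minimal caller-side sketch of the now per-device event entry
points. This is assumed code, not part of this patch: the helper name is
hypothetical, DeviceId stands for an already-initialized device, and the
declarations of the entry points and of OFFLOAD_SUCCESS/OFFLOAD_FAIL are taken
to come from the libomptarget plugin interface headers:

    // DeviceId now selects the per-device event pool inside the CUDA plugin.
    static int32_t useEventOnDevice(int32_t DeviceId) {
      void *Event = nullptr;
      if (__tgt_rtl_create_event(DeviceId, &Event) != OFFLOAD_SUCCESS)
        return OFFLOAD_FAIL;
      // ... record / wait / sync on Event via the matching entry points ...
      return __tgt_rtl_destroy_event(DeviceId, Event);
    }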


        

