[Openmp-commits] [openmp] AsyncMalloc (PR #72440)

Wed Nov 15 13:10:11 PST 2023

https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/72440

- WIP Async Malloc for Nvidia
- [Libomptarget] Fix RPC-based `malloc` on NVPTX


>From 58031e8008b12d0e3aa7b06329fdd43a173540e2 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 15 Nov 2023 07:22:41 -0600
Subject: [PATCH 1/2] WIP Async Malloc for Nvidia

---
 openmp/libomptarget/include/omptarget.h       |  4 +++
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 20 ++++++++----
 .../common/MemoryManager/MemoryManager.h      |  7 ++--
 .../common/PluginInterface/RPC.cpp            | 32 +++++++++++++++++--
 .../cuda/dynamic_cuda/cuda.cpp                |  3 ++
 .../plugins-nextgen/cuda/dynamic_cuda/cuda.h  |  2 ++
 .../plugins-nextgen/cuda/src/rtl.cpp          | 29 ++++++++++++++---
 .../generic-elf-64bit/src/rtl.cpp             |  8 +++--
 8 files changed, 88 insertions(+), 17 deletions(-)

diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 818967c88904ec0..46902bb133631d9 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -195,6 +195,10 @@ struct __tgt_async_info {
   // is CUstream.
   void *Queue = nullptr;
 
+  /// Arbitrary data that is only valid once the asynchronous operation has been
+  /// completed.
+  void *Data = nullptr;
+
   /// A collection of allocations that are associated with this stream and that
   /// should be freed after finalization.
   llvm::SmallVector<void *, 2> AssociatedAllocations;
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index a529c379844e904..e490dcbb09175c5 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -341,10 +341,12 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
 private:
   /// Allocation callback that will be called once the memory manager does not
   /// have more previously allocated buffers.
-  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
+  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind,
+                 __tgt_async_info *AsyncInfo = nullptr) override;
 
   /// Deallocation callack that will be called by the memory manager.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
+  int free(void *TgtPtr, TargetAllocTy Kind,
+           __tgt_async_info *AsyncInfo = nullptr) override {
     if (auto Err = MemoryPool->deallocate(TgtPtr)) {
       consumeError(std::move(Err));
       return OFFLOAD_FAIL;
@@ -2053,10 +2055,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Allocate memory on the device or related to the device.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
+  void *allocate(size_t Size, void *, TargetAllocTy Kind,
+                 __tgt_async_info *AsyncInfo) override;
 
   /// Deallocate memory on the device or related to the device.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
+  int free(void *TgtPtr, TargetAllocTy Kind,
+           __tgt_async_info *AsyncInfo) override {
     if (TgtPtr == nullptr)
       return OFFLOAD_SUCCESS;
 
@@ -3219,7 +3223,8 @@ Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
 }
 
 void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
-                                      TargetAllocTy Kind) {
+                                      TargetAllocTy Kind,
+                                      __tgt_async_info *AsyncInfo) {
   // Allocate memory from the pool.
   void *Ptr = nullptr;
   if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
@@ -3238,7 +3243,8 @@ void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
   return Ptr;
 }
 
-void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
+void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind,
+                               __tgt_async_info *AsyncInfo) {
   if (Size == 0)
     return nullptr;
 
@@ -3281,6 +3287,8 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
     }
   }
 
+  if (AsyncInfo)
+    AsyncInfo->Data = Alloc;
   return Alloc;
 }
 
diff --git a/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
index 37ef80a1af3ae22..200218cfe4b1cd3 100644
--- a/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
+++ b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
@@ -22,6 +22,7 @@
 #include <vector>
 
 #include "Debug.h"
+#include "omptarget.h"
 
 /// Base class of per-device allocator.
 class DeviceAllocatorTy {
@@ -31,10 +32,12 @@ class DeviceAllocatorTy {
   /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
   /// allocation.
   virtual void *allocate(size_t Size, void *HstPtr,
-                         TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
+                         TargetAllocTy Kind = TARGET_ALLOC_DEFAULT,
+                         __tgt_async_info *AsyncInfo = nullptr) = 0;
 
   /// Delete the pointer \p TgtPtr on the device
-  virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
+  virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT,
+                   __tgt_async_info *AsyncInfo = nullptr) = 0;
 };
 
 /// Class of memory manager. The memory manager is per-device by using
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
index 72bba012fcf93c6..5574a4d7ddab24a 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
@@ -67,15 +67,31 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
         "Failed to initialize RPC server for device %d: %d", DeviceId, Err);
 
   // Register a custom opcode handler to perform plugin specific allocation.
-  // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
   auto MallocHandler = [](rpc_port_t Port, void *Data) {
     rpc_recv_and_send(
         Port,
         [](rpc_buffer_t *Buffer, void *Data) {
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+
+          __tgt_async_info *AsyncInfo;
+          if (auto Err = Device.initAsyncInfo(&AsyncInfo)) {
+            consumeError(std::move(Err));
+            Buffer->data[0] = reinterpret_cast<uintptr_t>(nullptr);
+            return;
+          }
+
           Buffer->data[0] = reinterpret_cast<uintptr_t>(
-              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE),
+              &AsyncInfo);
+
+          if (auto Err = Device.synchronize(AsyncInfo)) {
+            consumeError(std::move(Err));
+            Buffer->data[0] = reinterpret_cast<uintptr_t>(nullptr);
+            return;
+          }
+
+          Buffer->data[0] = reinterpret_cast<uintptr_t>(AsyncInfo->Data);
         },
         Data);
   };
@@ -92,8 +108,18 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
         [](rpc_buffer_t *Buffer, void *Data) {
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+
+          __tgt_async_info *AsyncInfo;
+          if (auto Err = Device.initAsyncInfo(&AsyncInfo)) {
+            consumeError(std::move(Err));
+            return;
+          }
+
           Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                      TARGET_ALLOC_DEVICE);
+                      TARGET_ALLOC_DEVICE, AsyncInfo);
+
+          if (auto Err = Device.synchronize(AsyncInfo))
+            consumeError(std::move(Err));
         },
         Data);
   };
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 3d0de0d5b2caff6..e968ec712921641 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -42,6 +42,7 @@ DLWRAP(cuLaunchKernel, 11)
 DLWRAP(cuMemAlloc, 2)
 DLWRAP(cuMemAllocHost, 2)
 DLWRAP(cuMemAllocManaged, 3)
+DLWRAP(cuMemAllocAsync, 3)
 
 DLWRAP(cuMemcpyDtoDAsync, 4)
 DLWRAP(cuMemcpyDtoH, 3)
@@ -51,6 +52,8 @@ DLWRAP(cuMemcpyHtoDAsync, 4)
 
 DLWRAP(cuMemFree, 1)
 DLWRAP(cuMemFreeHost, 1)
+DLWRAP(cuMemFreeAsync, 2)
+
 DLWRAP(cuModuleGetFunction, 3)
 DLWRAP(cuModuleGetGlobal, 4)
 
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 3e0307759924b21..32031c28f8797ed 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/openmp/libomptarget/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -293,6 +293,7 @@ CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
 CUresult cuMemAlloc(CUdeviceptr *, size_t);
 CUresult cuMemAllocHost(void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
+CUresult cuMemAllocAsync(CUdeviceptr *, size_t, CUstream);
 
 CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
 CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
@@ -302,6 +303,7 @@ CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
 
 CUresult cuMemFree(CUdeviceptr);
 CUresult cuMemFreeHost(void *);
+CUresult cuMemFreeAsync(CUdeviceptr, CUstream);
 
 CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
 CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index a6e28574a7f08e3..67f80e432b86999 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -61,6 +61,14 @@ cuMemGetAllocationGranularity(size_t *granularity,
                               CUmemAllocationGranularity_flags option) {}
 #endif
 
+#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
+// Forward declarations of asynchronous memory management functions. This is
+// necessary for older versions of CUDA.
+CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) { *ptr = nullptr; }
+
+CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
+#endif
+
 /// Class implementing the CUDA device images properties.
 struct CUDADeviceImageTy : public DeviceImageTy {
   /// Create the CUDA image with the id and the target image pointer.
@@ -460,7 +468,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate memory on the device or related to the device.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
+  void *allocate(size_t Size, void *, TargetAllocTy Kind,
+                 __tgt_async_info *AsyncInfo = nullptr) override {
     if (Size == 0)
       return nullptr;
 
@@ -476,7 +485,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
-      Res = cuMemAlloc(&DevicePtr, Size);
+      if (AsyncInfo) {
+        Res = cuMemAllocAsync(&DevicePtr, Size,
+                              reinterpret_cast<CUstream>(AsyncInfo->Queue));
+      } else
+        Res = cuMemAlloc(&DevicePtr, Size);
       MemAlloc = (void *)DevicePtr;
       break;
     case TARGET_ALLOC_HOST:
@@ -493,11 +506,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
       REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
       return nullptr;
     }
+
+    if (AsyncInfo)
+      AsyncInfo->Data = MemAlloc;
     return MemAlloc;
   }
 
   /// Deallocate memory on the device or related to the device.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
+  int free(void *TgtPtr, TargetAllocTy Kind,
+           __tgt_async_info *AsyncInfo = nullptr) override {
     if (TgtPtr == nullptr)
       return OFFLOAD_SUCCESS;
 
@@ -511,7 +528,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
     case TARGET_ALLOC_SHARED:
-      Res = cuMemFree((CUdeviceptr)TgtPtr);
+      if (AsyncInfo)
+        Res = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr),
+                             reinterpret_cast<CUstream>(AsyncInfo->Queue));
+      else
+        Res = cuMemFree(reinterpret_cast<CUdeviceptr>(TgtPtr));
       break;
     case TARGET_ALLOC_HOST:
       Res = cuMemFreeHost(TgtPtr);
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 85cf9bef1543b2a..9ab27c26bce70c0 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -204,7 +204,8 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate memory. Use std::malloc in all cases.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
+  void *allocate(size_t Size, void *, TargetAllocTy Kind,
+                 __tgt_async_info *AsyncInfo) override {
     if (Size == 0)
       return nullptr;
 
@@ -217,11 +218,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
       MemAlloc = std::malloc(Size);
       break;
     }
+    if (AsyncInfo)
+      AsyncInfo->Data = MemAlloc;
     return MemAlloc;
   }
 
   /// Free the memory. Use std::free in all cases.
-  int free(void *TgtPtr, TargetAllocTy Kind) override {
+  int free(void *TgtPtr, TargetAllocTy Kind,
+           __tgt_async_info *AsyncInfo) override {
     std::free(TgtPtr);
     return OFFLOAD_SUCCESS;
   }

>From 0521bbc8b69fcbf0551f0be128759d211d28b6c3 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 15 Nov 2023 15:06:22 -0600
Subject: [PATCH 2/2] [Libomptarget] Fix RPC-based `malloc` on NVPTX

Summary:
The device allocator on NVPTX architectures is enqueud to a stream that
the kernel is potentially executing on. This can lead to deadlocks as
the kernel will not proceed until the allocation is complete and the
allocation will not proceed until the kernel is complete. CUDA 11.2
introduced async allocations that we can manually place on separate
streams to combat this. This patch makes a new allocation type that's
guarunteed to be non-blocking so it will actually make progress, only
Nvidia needs to care about this as the others are not blocking in this
way by default.

I had originally tried to make the `alloc` and `free` methods take a
`__tgt_async_info`. However, I observed that with the large volume of
streams being created by a parallel test it quickly locked up the system
as presumably too many streams were being created. This implementation
not just creates a new stream and immediately destroys it. This
obviously isn't very fast, but it at least gets the cases to stop
deadlocking for now.
---
 openmp/libomptarget/include/omptarget.h       |  8 ++--
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 22 ++++-------
 .../common/MemoryManager/MemoryManager.h      |  7 +---
 .../common/PluginInterface/RPC.cpp            | 33 ++--------------
 .../plugins-nextgen/cuda/src/rtl.cpp          | 38 ++++++++++---------
 .../generic-elf-64bit/src/rtl.cpp             |  9 ++---
 openmp/libomptarget/test/libc/malloc.c        | 10 ++++-
 7 files changed, 49 insertions(+), 78 deletions(-)

diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h
index 46902bb133631d9..745f6d928476990 100644
--- a/openmp/libomptarget/include/omptarget.h
+++ b/openmp/libomptarget/include/omptarget.h
@@ -116,7 +116,9 @@ enum TargetAllocTy : int32_t {
   TARGET_ALLOC_DEVICE = 0,
   TARGET_ALLOC_HOST,
   TARGET_ALLOC_SHARED,
-  TARGET_ALLOC_DEFAULT
+  TARGET_ALLOC_DEFAULT,
+  /// The allocation will not block on other streams.
+  TARGET_ALLOC_DEVICE_NON_BLOCKING,
 };
 
 /// This struct contains all of the arguments to a target kernel region launch.
@@ -195,10 +197,6 @@ struct __tgt_async_info {
   // is CUstream.
   void *Queue = nullptr;
 
-  /// Arbitrary data that is only valid once the asynchronous operation has been
-  /// completed.
-  void *Data = nullptr;
-
   /// A collection of allocations that are associated with this stream and that
   /// should be freed after finalization.
   llvm::SmallVector<void *, 2> AssociatedAllocations;
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index e490dcbb09175c5..d7141726bffa5d6 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -341,12 +341,10 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
 private:
   /// Allocation callback that will be called once the memory manager does not
   /// have more previously allocated buffers.
-  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind,
-                 __tgt_async_info *AsyncInfo = nullptr) override;
+  void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
 
   /// Deallocation callack that will be called by the memory manager.
-  int free(void *TgtPtr, TargetAllocTy Kind,
-           __tgt_async_info *AsyncInfo = nullptr) override {
+  int free(void *TgtPtr, TargetAllocTy Kind) override {
     if (auto Err = MemoryPool->deallocate(TgtPtr)) {
       consumeError(std::move(Err));
       return OFFLOAD_FAIL;
@@ -2055,12 +2053,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Allocate memory on the device or related to the device.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind,
-                 __tgt_async_info *AsyncInfo) override;
+  void *allocate(size_t Size, void *, TargetAllocTy Kind) override;
 
   /// Deallocate memory on the device or related to the device.
-  int free(void *TgtPtr, TargetAllocTy Kind,
-           __tgt_async_info *AsyncInfo) override {
+  int free(void *TgtPtr, TargetAllocTy Kind) override {
     if (TgtPtr == nullptr)
       return OFFLOAD_SUCCESS;
 
@@ -2068,6 +2064,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemoryPool = CoarseGrainedMemoryPools[0];
       break;
     case TARGET_ALLOC_HOST:
@@ -3223,8 +3220,7 @@ Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
 }
 
 void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
-                                      TargetAllocTy Kind,
-                                      __tgt_async_info *AsyncInfo) {
+                                      TargetAllocTy Kind) {
   // Allocate memory from the pool.
   void *Ptr = nullptr;
   if (auto Err = MemoryPool->allocate(Size, &Ptr)) {
@@ -3243,8 +3239,7 @@ void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr,
   return Ptr;
 }
 
-void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind,
-                               __tgt_async_info *AsyncInfo) {
+void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   if (Size == 0)
     return nullptr;
 
@@ -3253,6 +3248,7 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind,
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
+  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
     MemoryPool = CoarseGrainedMemoryPools[0];
     break;
   case TARGET_ALLOC_HOST:
@@ -3287,8 +3283,6 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind,
     }
   }
 
-  if (AsyncInfo)
-    AsyncInfo->Data = Alloc;
   return Alloc;
 }
 
diff --git a/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
index 200218cfe4b1cd3..37ef80a1af3ae22 100644
--- a/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
+++ b/openmp/libomptarget/plugins-nextgen/common/MemoryManager/MemoryManager.h
@@ -22,7 +22,6 @@
 #include <vector>
 
 #include "Debug.h"
-#include "omptarget.h"
 
 /// Base class of per-device allocator.
 class DeviceAllocatorTy {
@@ -32,12 +31,10 @@ class DeviceAllocatorTy {
   /// Allocate a memory of size \p Size . \p HstPtr is used to assist the
   /// allocation.
   virtual void *allocate(size_t Size, void *HstPtr,
-                         TargetAllocTy Kind = TARGET_ALLOC_DEFAULT,
-                         __tgt_async_info *AsyncInfo = nullptr) = 0;
+                         TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
 
   /// Delete the pointer \p TgtPtr on the device
-  virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT,
-                   __tgt_async_info *AsyncInfo = nullptr) = 0;
+  virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0;
 };
 
 /// Class of memory manager. The memory manager is per-device by using
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
index 5574a4d7ddab24a..27a5a6324ceb429 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
@@ -73,25 +73,8 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
         [](rpc_buffer_t *Buffer, void *Data) {
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-
-          __tgt_async_info *AsyncInfo;
-          if (auto Err = Device.initAsyncInfo(&AsyncInfo)) {
-            consumeError(std::move(Err));
-            Buffer->data[0] = reinterpret_cast<uintptr_t>(nullptr);
-            return;
-          }
-
-          Buffer->data[0] = reinterpret_cast<uintptr_t>(
-              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE),
-              &AsyncInfo);
-
-          if (auto Err = Device.synchronize(AsyncInfo)) {
-            consumeError(std::move(Err));
-            Buffer->data[0] = reinterpret_cast<uintptr_t>(nullptr);
-            return;
-          }
-
-          Buffer->data[0] = reinterpret_cast<uintptr_t>(AsyncInfo->Data);
+          Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+              Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
         },
         Data);
   };
@@ -108,18 +91,8 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
         [](rpc_buffer_t *Buffer, void *Data) {
           plugin::GenericDeviceTy &Device =
               *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-
-          __tgt_async_info *AsyncInfo;
-          if (auto Err = Device.initAsyncInfo(&AsyncInfo)) {
-            consumeError(std::move(Err));
-            return;
-          }
-
           Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                      TARGET_ALLOC_DEVICE, AsyncInfo);
-
-          if (auto Err = Device.synchronize(AsyncInfo))
-            consumeError(std::move(Err));
+                      TARGET_ALLOC_DEVICE_NON_BLOCKING);
         },
         Data);
   };
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 67f80e432b86999..0ee46cd64a64ebf 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -468,8 +468,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate memory on the device or related to the device.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind,
-                 __tgt_async_info *AsyncInfo = nullptr) override {
+  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
     if (Size == 0)
       return nullptr;
 
@@ -485,11 +484,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
-      if (AsyncInfo) {
-        Res = cuMemAllocAsync(&DevicePtr, Size,
-                              reinterpret_cast<CUstream>(AsyncInfo->Queue));
-      } else
-        Res = cuMemAlloc(&DevicePtr, Size);
+      Res = cuMemAlloc(&DevicePtr, Size);
       MemAlloc = (void *)DevicePtr;
       break;
     case TARGET_ALLOC_HOST:
@@ -499,6 +494,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
       Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
       MemAlloc = (void *)DevicePtr;
       break;
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+      CUstream Stream;
+      if (Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))
+        break;
+      if (Res = cuMemAllocAsync(&DevicePtr, Size, Stream))
+        break;
+      MemAlloc = (void *)DevicePtr;
+      Res = cuStreamDestroy(Stream);
+    }
     }
 
     if (auto Err =
@@ -506,15 +510,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
       REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
       return nullptr;
     }
-
-    if (AsyncInfo)
-      AsyncInfo->Data = MemAlloc;
     return MemAlloc;
   }
 
   /// Deallocate memory on the device or related to the device.
-  int free(void *TgtPtr, TargetAllocTy Kind,
-           __tgt_async_info *AsyncInfo = nullptr) override {
+  int free(void *TgtPtr, TargetAllocTy Kind) override {
     if (TgtPtr == nullptr)
       return OFFLOAD_SUCCESS;
 
@@ -528,15 +528,19 @@ struct CUDADeviceTy : public GenericDeviceTy {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
     case TARGET_ALLOC_SHARED:
-      if (AsyncInfo)
-        Res = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr),
-                             reinterpret_cast<CUstream>(AsyncInfo->Queue));
-      else
-        Res = cuMemFree(reinterpret_cast<CUdeviceptr>(TgtPtr));
+      Res = cuMemFree((CUdeviceptr)TgtPtr);
       break;
     case TARGET_ALLOC_HOST:
       Res = cuMemFreeHost(TgtPtr);
       break;
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
+      CUstream Stream;
+      if (Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING))
+        break;
+      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
+      if (Res = cuStreamDestroy(Stream))
+        break;
+    }
     }
 
     if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
index 9ab27c26bce70c0..66937aa3e10bb4d 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
@@ -204,8 +204,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   }
 
   /// Allocate memory. Use std::malloc in all cases.
-  void *allocate(size_t Size, void *, TargetAllocTy Kind,
-                 __tgt_async_info *AsyncInfo) override {
+  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
     if (Size == 0)
       return nullptr;
 
@@ -215,17 +214,15 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     case TARGET_ALLOC_DEVICE:
     case TARGET_ALLOC_HOST:
     case TARGET_ALLOC_SHARED:
+    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemAlloc = std::malloc(Size);
       break;
     }
-    if (AsyncInfo)
-      AsyncInfo->Data = MemAlloc;
     return MemAlloc;
   }
 
   /// Free the memory. Use std::free in all cases.
-  int free(void *TgtPtr, TargetAllocTy Kind,
-           __tgt_async_info *AsyncInfo) override {
+  int free(void *TgtPtr, TargetAllocTy Kind) override {
     std::free(TgtPtr);
     return OFFLOAD_SUCCESS;
   }
diff --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c
index c18a724930f41e5..b587b618472e430 100644
--- a/openmp/libomptarget/test/libc/malloc.c
+++ b/openmp/libomptarget/test/libc/malloc.c
@@ -13,7 +13,7 @@ int main() {
   unsigned *d_x;
 #pragma omp target map(from : d_x)
   {
-    d_x = malloc(sizeof(unsigned));
+    d_x = (unsigned *)malloc(sizeof(unsigned));
     *d_x = 1;
   }
 
@@ -23,6 +23,14 @@ int main() {
 #pragma omp target is_device_ptr(d_x)
   { free(d_x); }
 
+#pragma omp target teams num_teams(64)
+#pragma omp parallel num_threads(32)
+  {
+    int *ptr = (int *)malloc(sizeof(int));
+    *ptr = 42;
+    free(ptr);
+  }
+
   // CHECK: PASS
   if (h_x == 1)
     fputs("PASS\n", stdout);