[llvm] [Offload] Make olLaunchKernel test thread safe (PR #149497)

Tue Jul 22 04:40:46 PDT 2025

https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/149497

>From 72f32b6295fb58fa2cd726af52f10710bd4584a0 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 18 Jul 2025 12:27:17 +0100
Subject: [PATCH 1/6] [Offload] Make olLaunchKernel test thread safe

This sprinkles a few mutexes around the plugin interface so that the
olLaunchKernel CTS test now passes when ran on multiple threads.

Part of this also involved changing the interface for device synchronise
so that it can optionally not free the underlying queue (which
introduced a race condition in liboffload).
---
 offload/include/Shared/APITypes.h             |  4 ++++
 offload/liboffload/src/OffloadImpl.cpp        |  8 +------
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 14 ++++++++---
 .../common/include/PluginInterface.h          |  8 +++++--
 .../common/src/PluginInterface.cpp            |  7 ++++--
 offload/plugins-nextgen/cuda/src/rtl.cpp      | 15 ++++++++----
 offload/plugins-nextgen/host/src/rtl.cpp      |  3 ++-
 .../unittests/OffloadAPI/common/Fixtures.hpp  | 18 +++++++++++++++
 .../OffloadAPI/kernel/olLaunchKernel.cpp      | 23 +++++++++++++++++++
 9 files changed, 81 insertions(+), 19 deletions(-)

diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 978b53d5d69b9..a988edce481e6 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -21,6 +21,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <mutex>
 
 extern "C" {
 
@@ -75,6 +76,9 @@ struct __tgt_async_info {
   /// should be freed after finalization.
   llvm::SmallVector<void *, 2> AssociatedAllocations;
 
+  /// Mutex to guard access to AssociatedAllocations
+  std::mutex AllocationsMutex;
+
   /// The kernel launch environment used to issue a kernel. Stored here to
   /// ensure it is a valid location while the transfer to the device is
   /// happening.
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index ffc9016bca0a3..d0dced8be7a61 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -487,16 +487,10 @@ Error olWaitQueue_impl(ol_queue_handle_t Queue) {
   // Host plugin doesn't have a queue set so it's not safe to call synchronize
   // on it, but we have nothing to synchronize in that situation anyway.
   if (Queue->AsyncInfo->Queue) {
-    if (auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo))
+    if (auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo, false))
       return Err;
   }
 
-  // Recreate the stream resource so the queue can be reused
-  // TODO: Would be easier for the synchronization to (optionally) not release
-  // it to begin with.
-  if (auto Res = Queue->Device->Device->initAsyncInfo(&Queue->AsyncInfo))
-    return Res;
-
   return Error::success();
 }
 
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index b2fd950c9d500..509f6c03e21fe 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2227,6 +2227,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                   AMDGPUStreamTy *&Stream) {
+    std::lock_guard<std::mutex> StreamLock{StreamMutex};
     // Get the stream (if any) from the async info.
     Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
     if (!Stream) {
@@ -2291,7 +2292,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Synchronize current thread with the pending operations on the async info.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool RemoveQueue) override {
     AMDGPUStreamTy *Stream =
         reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
     assert(Stream && "Invalid stream");
@@ -2302,8 +2304,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
     // own tasks.
-    AsyncInfo.Queue = nullptr;
-    return AMDGPUStreamManager.returnResource(Stream);
+    if (RemoveQueue) {
+      AsyncInfo.Queue = nullptr;
+      return AMDGPUStreamManager.returnResource(Stream);
+    }
+    return Plugin::success();
   }
 
   /// Query for the completion of the pending operations on the async info.
@@ -3013,6 +3018,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// True is the system is configured with XNACK-Enabled.
   /// False otherwise.
   bool IsXnackEnabled = false;
+
+  /// Mutex to guard getting/setting the stream
+  std::mutex StreamMutex;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 162b149ab483e..5fd34a9236f83 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -104,6 +104,7 @@ struct AsyncInfoWrapperTy {
   /// Register \p Ptr as an associated allocation that is freed after
   /// finalization.
   void freeAllocationAfterSynchronization(void *Ptr) {
+    std::lock_guard<std::mutex> AllocationGuard{AsyncInfoPtr->AllocationsMutex};
     AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
   }
 
@@ -772,8 +773,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   /// Synchronize the current thread with the pending operations on the
   /// __tgt_async_info structure.
-  Error synchronize(__tgt_async_info *AsyncInfo);
-  virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
+  Error synchronize(__tgt_async_info *AsyncInfo, bool RemoveQueue = true);
+  virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                                bool RemoveQueue) = 0;
 
   /// Invokes any global constructors on the device if present and is required
   /// by the target.
@@ -1501,6 +1503,8 @@ template <typename ResourceRef> class GenericDeviceResourceManagerTy {
   /// Deinitialize the resource pool and delete all resources. This function
   /// must be called before the destructor.
   virtual Error deinit() {
+    const std::lock_guard<std::mutex> Lock(Mutex);
+
     if (NextAvailable)
       DP("Missing %d resources to be returned\n", NextAvailable);
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 81b9d423e13d8..4844f88229fb2 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1329,12 +1329,15 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
   return eraseEntry(*Entry);
 }
 
-Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
+Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
+                                   bool RemoveQueue) {
+  std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->AllocationsMutex};
+
   if (!AsyncInfo || !AsyncInfo->Queue)
     return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                          "invalid async info queue");
 
-  if (auto Err = synchronizeImpl(*AsyncInfo))
+  if (auto Err = synchronizeImpl(*AsyncInfo, RemoveQueue))
     return Err;
 
   for (auto *Ptr : AsyncInfo->AssociatedAllocations)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index b787376eb1770..f637379c5b29d 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -522,6 +522,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
+    std::lock_guard<std::mutex> StreamLock{StreamMutex};
     // Get the stream (if any) from the async info.
     Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
     if (!Stream) {
@@ -642,7 +643,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Synchronize current thread with the pending operations on the async info.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool RemoveQueue) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
     CUresult Res;
     Res = cuStreamSynchronize(Stream);
@@ -650,9 +652,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
     // own tasks.
-    AsyncInfo.Queue = nullptr;
-    if (auto Err = CUDAStreamManager.returnResource(Stream))
-      return Err;
+    if (RemoveQueue) {
+      AsyncInfo.Queue = nullptr;
+      if (auto Err = CUDAStreamManager.returnResource(Stream))
+        return Err;
+    }
 
     return Plugin::check(Res, "error in cuStreamSynchronize: %s");
   }
@@ -1281,6 +1285,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
   /// The maximum number of warps that can be resident on all the SMs
   /// simultaneously.
   uint32_t HardwareParallelism = 0;
+
+  /// Mutex to guard getting/setting the stream
+  std::mutex StreamMutex;
 };
 
 Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index d950572265b4c..725a37c280248 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -297,7 +297,8 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
 
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool RemoveQueue) override {
     return Plugin::success();
   }
 
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index 546921164f691..4fe57bd80d704 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -9,6 +9,7 @@
 #include <OffloadAPI.h>
 #include <OffloadPrint.hpp>
 #include <gtest/gtest.h>
+#include <thread>
 
 #include "Environment.hpp"
 
@@ -57,6 +58,23 @@ inline std::string SanitizeString(const std::string &Str) {
   return NewStr;
 }
 
+template <typename Fn> inline void threadify(Fn body) {
+  std::vector<std::thread> Threads;
+  for (size_t I = 0; I < 20; I++) {
+    Threads.emplace_back(
+        [&body](size_t I) {
+          std::string ScopeMsg{"Thread #"};
+          ScopeMsg.append(std::to_string(I));
+          SCOPED_TRACE(ScopeMsg);
+          body(I);
+        },
+        I);
+  }
+  for (auto &T : Threads) {
+    T.join();
+  }
+}
+
 struct OffloadTest : ::testing::Test {
   ol_device_handle_t Host = TestEnvironment::getHostDevice();
 };
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index e7e608f2a64d4..3e128d1e84645 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -104,6 +104,29 @@ TEST_P(olLaunchKernelFooTest, Success) {
   ASSERT_SUCCESS(olMemFree(Mem));
 }
 
+TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
+  threadify([&](size_t) {
+    void *Mem;
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                              LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+    struct {
+      void *Mem;
+    } Args{Mem};
+
+    ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                  &LaunchArgs, nullptr));
+
+    ASSERT_SUCCESS(olWaitQueue(Queue));
+
+    uint32_t *Data = (uint32_t *)Mem;
+    for (uint32_t i = 0; i < 64; i++) {
+      ASSERT_EQ(Data[i], i);
+    }
+
+    ASSERT_SUCCESS(olMemFree(Mem));
+  });
+}
+
 TEST_P(olLaunchKernelNoArgsTest, Success) {
   ASSERT_SUCCESS(
       olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, nullptr));

>From 624d6ec8b276d48b95eaf3ba88419685f3516c73 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 18 Jul 2025 12:34:47 +0100
Subject: [PATCH 2/6] Remove unneeded lock

---
 offload/plugins-nextgen/common/include/PluginInterface.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 5fd34a9236f83..c2691a04a5e29 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1503,8 +1503,6 @@ template <typename ResourceRef> class GenericDeviceResourceManagerTy {
   /// Deinitialize the resource pool and delete all resources. This function
   /// must be called before the destructor.
   virtual Error deinit() {
-    const std::lock_guard<std::mutex> Lock(Mutex);
-
     if (NextAvailable)
       DP("Missing %d resources to be returned\n", NextAvailable);
 

>From d4894b38c96c4f8f661fd2f73c5c64fa178eb742 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 18 Jul 2025 14:09:37 +0100
Subject: [PATCH 3/6] Fix convertable

---
 offload/libomptarget/interface.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index ea354400f2e99..e9b148d8a2605 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -116,7 +116,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
            TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg,
            const char *RegionName) {
   assert(PM && "Runtime not initialized");
-  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
+  static_assert(std::is_convertible_v<TargetAsyncInfoTy &, AsyncInfoTy &>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
   TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
@@ -311,7 +311,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                int32_t ThreadLimit, void *HostPtr,
                                KernelArgsTy *KernelArgs) {
   assert(PM && "Runtime not initialized");
-  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
+  static_assert(std::is_convertible_v<TargetAsyncInfoTy &, AsyncInfoTy &>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",

>From d3ec9ae9d242c105747973426873537ff45cff7e Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 22 Jul 2025 11:14:31 +0100
Subject: [PATCH 4/6] Refactor stream mutex

---
 offload/include/Shared/APITypes.h             |  4 ++--
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 19 +++++--------------
 .../common/include/PluginInterface.h          | 17 ++++++++++++++++-
 .../common/src/PluginInterface.cpp            |  2 +-
 offload/plugins-nextgen/cuda/src/rtl.cpp      | 19 +++++--------------
 5 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index a988edce481e6..d6a69bfb57944 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -76,8 +76,8 @@ struct __tgt_async_info {
   /// should be freed after finalization.
   llvm::SmallVector<void *, 2> AssociatedAllocations;
 
-  /// Mutex to guard access to AssociatedAllocations
-  std::mutex AllocationsMutex;
+  /// Mutex to guard access to AssociatedAllocations and the Queue
+  std::mutex Mutex;
 
   /// The kernel launch environment used to issue a kernel. Stored here to
   /// ensure it is a valid location while the transfer to the device is
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 509f6c03e21fe..44e5a6766db48 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2227,17 +2227,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                   AMDGPUStreamTy *&Stream) {
-    std::lock_guard<std::mutex> StreamLock{StreamMutex};
-    // Get the stream (if any) from the async info.
-    Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
-    if (!Stream) {
-      // There was no stream; get an idle one.
-      if (auto Err = AMDGPUStreamManager.getResource(Stream))
-        return Err;
-
-      // Modify the async info's stream.
-      AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
-    }
+    auto WrapperStream =
+        AsyncInfoWrapper.getOrInitQueue<AMDGPUStreamTy *>(AMDGPUStreamManager);
+    if (!WrapperStream)
+      return WrapperStream.takeError();
+    Stream = *WrapperStream;
     return Plugin::success();
   }
 
@@ -3018,9 +3012,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// True is the system is configured with XNACK-Enabled.
   /// False otherwise.
   bool IsXnackEnabled = false;
-
-  /// Mutex to guard getting/setting the stream
-  std::mutex StreamMutex;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index c2691a04a5e29..db9302bcda367 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -59,6 +59,7 @@ struct GenericPluginTy;
 struct GenericKernelTy;
 struct GenericDeviceTy;
 struct RecordReplayTy;
+template <typename ResourceRef> class GenericDeviceResourceManagerTy;
 
 /// Class that wraps the __tgt_async_info to simply its usage. In case the
 /// object is constructed without a valid __tgt_async_info, the object will use
@@ -93,6 +94,20 @@ struct AsyncInfoWrapperTy {
     AsyncInfoPtr->Queue = Queue;
   }
 
+  /// Get the queue, using the provided resource manager to initialise it if it
+  /// doesn't exist.
+  template <typename Ty, typename RMTy>
+  Expected<Ty>
+  getOrInitQueue(GenericDeviceResourceManagerTy<RMTy> &ResourceManager) {
+    std::lock_guard<std::mutex> Lock(AsyncInfoPtr->Mutex);
+    if (!AsyncInfoPtr->Queue) {
+      if (auto Err = ResourceManager.getResource(
+              *reinterpret_cast<Ty *>(&AsyncInfoPtr->Queue)))
+        return Err;
+    }
+    return getQueueAs<Ty>();
+  }
+
   /// Synchronize with the __tgt_async_info's pending operations if it's the
   /// internal async info. The error associated to the asynchronous operations
   /// issued in this queue must be provided in \p Err. This function will update
@@ -104,7 +119,7 @@ struct AsyncInfoWrapperTy {
   /// Register \p Ptr as an associated allocation that is freed after
   /// finalization.
   void freeAllocationAfterSynchronization(void *Ptr) {
-    std::lock_guard<std::mutex> AllocationGuard{AsyncInfoPtr->AllocationsMutex};
+    std::lock_guard<std::mutex> AllocationGuard{AsyncInfoPtr->Mutex};
     AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
   }
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 4844f88229fb2..5f02dbe20252d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1331,7 +1331,7 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
                                    bool RemoveQueue) {
-  std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->AllocationsMutex};
+  std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
 
   if (!AsyncInfo || !AsyncInfo->Queue)
     return Plugin::error(ErrorCode::INVALID_ARGUMENT,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index f637379c5b29d..61b18f01881d2 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -522,17 +522,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
-    std::lock_guard<std::mutex> StreamLock{StreamMutex};
-    // Get the stream (if any) from the async info.
-    Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
-    if (!Stream) {
-      // There was no stream; get an idle one.
-      if (auto Err = CUDAStreamManager.getResource(Stream))
-        return Err;
-
-      // Modify the async info's stream.
-      AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
-    }
+    auto WrapperStream =
+        AsyncInfoWrapper.getOrInitQueue<CUstream>(CUDAStreamManager);
+    if (!WrapperStream)
+      return WrapperStream.takeError();
+    Stream = *WrapperStream;
     return Plugin::success();
   }
 
@@ -1285,9 +1279,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
   /// The maximum number of warps that can be resident on all the SMs
   /// simultaneously.
   uint32_t HardwareParallelism = 0;
-
-  /// Mutex to guard getting/setting the stream
-  std::mutex StreamMutex;
 };
 
 Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,

>From b1b6ec4799515cb69bbb1bf84e38bf8bbf3df11f Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 22 Jul 2025 11:24:34 +0100
Subject: [PATCH 5/6] Delete allocations outside of lock

---
 .../common/src/PluginInterface.cpp            | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 5f02dbe20252d..7e5354db736bd 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1331,19 +1331,23 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
                                    bool RemoveQueue) {
-  std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
+  SmallVector<void *, 2> AllocsToDelete{};
+  {
+    std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
 
-  if (!AsyncInfo || !AsyncInfo->Queue)
-    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                         "invalid async info queue");
+    if (!AsyncInfo || !AsyncInfo->Queue)
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                           "invalid async info queue");
 
-  if (auto Err = synchronizeImpl(*AsyncInfo, RemoveQueue))
-    return Err;
+    if (auto Err = synchronizeImpl(*AsyncInfo, RemoveQueue))
+      return Err;
+
+    std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
+  }
 
-  for (auto *Ptr : AsyncInfo->AssociatedAllocations)
+  for (auto *Ptr : AllocsToDelete)
     if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
       return Err;
-  AsyncInfo->AssociatedAllocations.clear();
 
   return Plugin::success();
 }

>From 6e0c8e36fc9f3be2d345a65e8afa797077444178 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 22 Jul 2025 12:40:22 +0100
Subject: [PATCH 6/6] PR response

---
 offload/liboffload/src/OffloadImpl.cpp                 |  8 +++++---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp             |  4 ++--
 .../plugins-nextgen/common/include/PluginInterface.h   |  8 +++++---
 offload/plugins-nextgen/common/src/PluginInterface.cpp |  4 ++--
 offload/plugins-nextgen/cuda/src/rtl.cpp               | 10 +++++-----
 offload/plugins-nextgen/host/src/rtl.cpp               |  2 +-
 6 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index d0dced8be7a61..2db2186bec7de 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -206,7 +206,7 @@ Error initPlugins(OffloadContext &Context) {
 }
 
 Error olInit_impl() {
-  std::lock_guard<std::mutex> Lock{OffloadContextValMutex};
+  std::lock_guard<std::mutex> Lock(OffloadContextValMutex);
 
   if (isOffloadInitialized()) {
     OffloadContext::get().RefCount++;
@@ -224,7 +224,7 @@ Error olInit_impl() {
 }
 
 Error olShutDown_impl() {
-  std::lock_guard<std::mutex> Lock{OffloadContextValMutex};
+  std::lock_guard<std::mutex> Lock(OffloadContextValMutex);
 
   if (--OffloadContext::get().RefCount != 0)
     return Error::success();
@@ -487,6 +487,8 @@ Error olWaitQueue_impl(ol_queue_handle_t Queue) {
   // Host plugin doesn't have a queue set so it's not safe to call synchronize
   // on it, but we have nothing to synchronize in that situation anyway.
   if (Queue->AsyncInfo->Queue) {
+    // We don't need to release the queue and we would like the ability for
+    // other offload threads to submit work concurrently, so pass "false" here.
     if (auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo, false))
       return Err;
   }
@@ -711,7 +713,7 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name,
                        ol_symbol_kind_t Kind, ol_symbol_handle_t *Symbol) {
   auto &Device = Program->Image->getDevice();
 
-  std::lock_guard<std::mutex> Lock{Program->SymbolListMutex};
+  std::lock_guard<std::mutex> Lock(Program->SymbolListMutex);
 
   switch (Kind) {
   case OL_SYMBOL_KIND_KERNEL: {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 44e5a6766db48..03bd373df08d0 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2287,7 +2287,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// Synchronize current thread with the pending operations on the async info.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
-                        bool RemoveQueue) override {
+                        bool ReleaseQueue) override {
     AMDGPUStreamTy *Stream =
         reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
     assert(Stream && "Invalid stream");
@@ -2298,7 +2298,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
     // own tasks.
-    if (RemoveQueue) {
+    if (ReleaseQueue) {
       AsyncInfo.Queue = nullptr;
       return AMDGPUStreamManager.returnResource(Stream);
     }
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index db9302bcda367..a836970fe2eba 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -787,10 +787,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
 
   /// Synchronize the current thread with the pending operations on the
-  /// __tgt_async_info structure.
-  Error synchronize(__tgt_async_info *AsyncInfo, bool RemoveQueue = true);
+  /// __tgt_async_info structure. If ReleaseQueue is false, then the
+  // underlying queue will not be released. In this case, additional
+  // work may be submitted to the queue whilst a synchronize is running.
+  Error synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true);
   virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo,
-                                bool RemoveQueue) = 0;
+                                bool ReleaseQueue) = 0;
 
   /// Invokes any global constructors on the device if present and is required
   /// by the target.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 7e5354db736bd..934eb1b483da7 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1330,7 +1330,7 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 }
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
-                                   bool RemoveQueue) {
+                                   bool ReleaseQueue) {
   SmallVector<void *, 2> AllocsToDelete{};
   {
     std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
@@ -1339,7 +1339,7 @@ Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
       return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                            "invalid async info queue");
 
-    if (auto Err = synchronizeImpl(*AsyncInfo, RemoveQueue))
+    if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
       return Err;
 
     std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 61b18f01881d2..42a9a6a33c41d 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -638,15 +638,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   /// Synchronize current thread with the pending operations on the async info.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
-                        bool RemoveQueue) override {
+                        bool ReleaseQueue) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
     CUresult Res;
     Res = cuStreamSynchronize(Stream);
 
-    // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
-    if (RemoveQueue) {
+    // Once the stream is synchronized and we want to release the queue, return
+    // it to stream pool and reset AsyncInfo. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (ReleaseQueue) {
       AsyncInfo.Queue = nullptr;
       if (auto Err = CUDAStreamManager.returnResource(Stream))
         return Err;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 725a37c280248..0dbf78d6eaed8 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -298,7 +298,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
-                        bool RemoveQueue) override {
+                        bool ReleaseQueue) override {
     return Plugin::success();
   }