[llvm] [Offload] Make olLaunchKernel test thread safe (PR #149497)

Fri Aug 8 02:10:21 PDT 2025

https://github.com/RossBrunton updated https://github.com/llvm/llvm-project/pull/149497

>From 8cd0873ee80f127af787201af067aa6aa0a8cca5 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 18 Jul 2025 12:27:17 +0100
Subject: [PATCH 01/10] [Offload] Make olLaunchKernel test thread safe

This sprinkles a few mutexes around the plugin interface so that the
olLaunchKernel CTS test now passes when ran on multiple threads.

Part of this also involved changing the interface for device synchronise
so that it can optionally not free the underlying queue (which
introduced a race condition in liboffload).
---
 offload/include/Shared/APITypes.h             |  4 ++++
 offload/liboffload/src/OffloadImpl.cpp        |  8 +------
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 14 ++++++++---
 .../common/include/PluginInterface.h          |  8 +++++--
 .../common/src/PluginInterface.cpp            |  7 ++++--
 offload/plugins-nextgen/cuda/src/rtl.cpp      | 15 ++++++++----
 offload/plugins-nextgen/host/src/rtl.cpp      |  3 ++-
 .../unittests/OffloadAPI/common/Fixtures.hpp  | 18 +++++++++++++++
 .../OffloadAPI/kernel/olLaunchKernel.cpp      | 23 +++++++++++++++++++
 9 files changed, 81 insertions(+), 19 deletions(-)

diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index f376c7dc861f9..42bdb7df50343 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -21,6 +21,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <mutex>
 
 extern "C" {
 
@@ -76,6 +77,9 @@ struct __tgt_async_info {
   /// should be freed after finalization.
   llvm::SmallVector<void *, 2> AssociatedAllocations;
 
+  /// Mutex to guard access to AssociatedAllocations
+  std::mutex AllocationsMutex;
+
   /// The kernel launch environment used to issue a kernel. Stored here to
   /// ensure it is a valid location while the transfer to the device is
   /// happening.
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 272a12ab59a06..e56ef941b1c7e 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -487,16 +487,10 @@ Error olSyncQueue_impl(ol_queue_handle_t Queue) {
   // Host plugin doesn't have a queue set so it's not safe to call synchronize
   // on it, but we have nothing to synchronize in that situation anyway.
   if (Queue->AsyncInfo->Queue) {
-    if (auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo))
+    if (auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo, false))
       return Err;
   }
 
-  // Recreate the stream resource so the queue can be reused
-  // TODO: Would be easier for the synchronization to (optionally) not release
-  // it to begin with.
-  if (auto Res = Queue->Device->Device->initAsyncInfo(&Queue->AsyncInfo))
-    return Res;
-
   return Error::success();
 }
 
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 852c0e99b2266..0a22d2b33cc98 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2232,6 +2232,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                   AMDGPUStreamTy *&Stream) {
+    std::lock_guard<std::mutex> StreamLock{StreamMutex};
     // Get the stream (if any) from the async info.
     Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
     if (!Stream) {
@@ -2296,7 +2297,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Synchronize current thread with the pending operations on the async info.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool RemoveQueue) override {
     AMDGPUStreamTy *Stream =
         reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
     assert(Stream && "Invalid stream");
@@ -2307,8 +2309,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
     // own tasks.
-    AsyncInfo.Queue = nullptr;
-    return AMDGPUStreamManager.returnResource(Stream);
+    if (RemoveQueue) {
+      AsyncInfo.Queue = nullptr;
+      return AMDGPUStreamManager.returnResource(Stream);
+    }
+    return Plugin::success();
   }
 
   /// Query for the completion of the pending operations on the async info.
@@ -3067,6 +3072,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// True is the system is configured with XNACK-Enabled.
   /// False otherwise.
   bool IsXnackEnabled = false;
+
+  /// Mutex to guard getting/setting the stream
+  std::mutex StreamMutex;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 1d64193c17f6b..9ebf955eedbbe 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -138,6 +138,7 @@ struct AsyncInfoWrapperTy {
   /// Register \p Ptr as an associated allocation that is freed after
   /// finalization.
   void freeAllocationAfterSynchronization(void *Ptr) {
+    std::lock_guard<std::mutex> AllocationGuard{AsyncInfoPtr->AllocationsMutex};
     AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
   }
 
@@ -828,8 +829,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   /// Synchronize the current thread with the pending operations on the
   /// __tgt_async_info structure.
-  Error synchronize(__tgt_async_info *AsyncInfo);
-  virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
+  Error synchronize(__tgt_async_info *AsyncInfo, bool RemoveQueue = true);
+  virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                                bool RemoveQueue) = 0;
 
   /// Invokes any global constructors on the device if present and is required
   /// by the target.
@@ -1591,6 +1593,8 @@ template <typename ResourceRef> class GenericDeviceResourceManagerTy {
   /// Deinitialize the resource pool and delete all resources. This function
   /// must be called before the destructor.
   virtual Error deinit() {
+    const std::lock_guard<std::mutex> Lock(Mutex);
+
     if (NextAvailable)
       DP("Missing %d resources to be returned\n", NextAvailable);
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index bcc91798f3f90..fa01fb197d7df 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1335,12 +1335,15 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
   return eraseEntry(*Entry);
 }
 
-Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo) {
+Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
+                                   bool RemoveQueue) {
+  std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->AllocationsMutex};
+
   if (!AsyncInfo || !AsyncInfo->Queue)
     return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                          "invalid async info queue");
 
-  if (auto Err = synchronizeImpl(*AsyncInfo))
+  if (auto Err = synchronizeImpl(*AsyncInfo, RemoveQueue))
     return Err;
 
   for (auto *Ptr : AsyncInfo->AssociatedAllocations)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 7649fd9285bb5..9536e5863c569 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -522,6 +522,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
+    std::lock_guard<std::mutex> StreamLock{StreamMutex};
     // Get the stream (if any) from the async info.
     Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
     if (!Stream) {
@@ -642,7 +643,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Synchronize current thread with the pending operations on the async info.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool RemoveQueue) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
     CUresult Res;
     Res = cuStreamSynchronize(Stream);
@@ -650,9 +652,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
     // own tasks.
-    AsyncInfo.Queue = nullptr;
-    if (auto Err = CUDAStreamManager.returnResource(Stream))
-      return Err;
+    if (RemoveQueue) {
+      AsyncInfo.Queue = nullptr;
+      if (auto Err = CUDAStreamManager.returnResource(Stream))
+        return Err;
+    }
 
     return Plugin::check(Res, "error in cuStreamSynchronize: %s");
   }
@@ -1289,6 +1293,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
   /// The maximum number of warps that can be resident on all the SMs
   /// simultaneously.
   uint32_t HardwareParallelism = 0;
+
+  /// Mutex to guard getting/setting the stream
+  std::mutex StreamMutex;
 };
 
 Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9abc3507f6e68..272d58e87fd18 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -297,7 +297,8 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
 
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
-  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
+  Error synchronizeImpl(__tgt_async_info &AsyncInfo,
+                        bool RemoveQueue) override {
     return Plugin::success();
   }
 
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index 24ce1a0cf7d7e..43240fa3c4a08 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -9,6 +9,7 @@
 #include <OffloadAPI.h>
 #include <OffloadPrint.hpp>
 #include <gtest/gtest.h>
+#include <thread>
 
 #include "Environment.hpp"
 
@@ -57,6 +58,23 @@ inline std::string SanitizeString(const std::string &Str) {
   return NewStr;
 }
 
+template <typename Fn> inline void threadify(Fn body) {
+  std::vector<std::thread> Threads;
+  for (size_t I = 0; I < 20; I++) {
+    Threads.emplace_back(
+        [&body](size_t I) {
+          std::string ScopeMsg{"Thread #"};
+          ScopeMsg.append(std::to_string(I));
+          SCOPED_TRACE(ScopeMsg);
+          body(I);
+        },
+        I);
+  }
+  for (auto &T : Threads) {
+    T.join();
+  }
+}
+
 struct OffloadTest : ::testing::Test {
   ol_device_handle_t Host = TestEnvironment::getHostDevice();
 };
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 758f6071b030f..2b4fe915450da 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -104,6 +104,29 @@ TEST_P(olLaunchKernelFooTest, Success) {
   ASSERT_SUCCESS(olMemFree(Mem));
 }
 
+TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
+  threadify([&](size_t) {
+    void *Mem;
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                              LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+    struct {
+      void *Mem;
+    } Args{Mem};
+
+    ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                  &LaunchArgs, nullptr));
+
+    ASSERT_SUCCESS(olWaitQueue(Queue));
+
+    uint32_t *Data = (uint32_t *)Mem;
+    for (uint32_t i = 0; i < 64; i++) {
+      ASSERT_EQ(Data[i], i);
+    }
+
+    ASSERT_SUCCESS(olMemFree(Mem));
+  });
+}
+
 TEST_P(olLaunchKernelNoArgsTest, Success) {
   ASSERT_SUCCESS(
       olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs));

>From acb7fab7eb8a956e93167af3c1e3552ae26282c2 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 18 Jul 2025 12:34:47 +0100
Subject: [PATCH 02/10] Remove unneeded lock

---
 offload/plugins-nextgen/common/include/PluginInterface.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 9ebf955eedbbe..27c4f786c10d0 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1593,8 +1593,6 @@ template <typename ResourceRef> class GenericDeviceResourceManagerTy {
   /// Deinitialize the resource pool and delete all resources. This function
   /// must be called before the destructor.
   virtual Error deinit() {
-    const std::lock_guard<std::mutex> Lock(Mutex);
-
     if (NextAvailable)
       DP("Missing %d resources to be returned\n", NextAvailable);
 

>From 98f9ea42753ecb61cd239236acbefa596c412a62 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Fri, 18 Jul 2025 14:09:37 +0100
Subject: [PATCH 03/10] Fix convertable

---
 offload/libomptarget/interface.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp
index ea354400f2e99..e9b148d8a2605 100644
--- a/offload/libomptarget/interface.cpp
+++ b/offload/libomptarget/interface.cpp
@@ -116,7 +116,7 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase,
            TargetDataFuncPtrTy TargetDataFunction, const char *RegionTypeMsg,
            const char *RegionName) {
   assert(PM && "Runtime not initialized");
-  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
+  static_assert(std::is_convertible_v<TargetAsyncInfoTy &, AsyncInfoTy &>,
                 "TargetAsyncInfoTy must be convertible to AsyncInfoTy.");
 
   TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy",
@@ -311,7 +311,7 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
                                int32_t ThreadLimit, void *HostPtr,
                                KernelArgsTy *KernelArgs) {
   assert(PM && "Runtime not initialized");
-  static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>,
+  static_assert(std::is_convertible_v<TargetAsyncInfoTy &, AsyncInfoTy &>,
                 "Target AsyncInfoTy must be convertible to AsyncInfoTy.");
   DP("Entering target region for device %" PRId64 " with entry point " DPxMOD
      "\n",

>From 65ff11088ae74b320f6bbfc6c0676965b7895006 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 22 Jul 2025 11:14:31 +0100
Subject: [PATCH 04/10] Refactor stream mutex

---
 offload/include/Shared/APITypes.h             |  4 ++--
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 19 +++++--------------
 .../common/include/PluginInterface.h          | 17 ++++++++++++++++-
 .../common/src/PluginInterface.cpp            |  2 +-
 offload/plugins-nextgen/cuda/src/rtl.cpp      | 19 +++++--------------
 5 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 42bdb7df50343..fdc36c30ec485 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -77,8 +77,8 @@ struct __tgt_async_info {
   /// should be freed after finalization.
   llvm::SmallVector<void *, 2> AssociatedAllocations;
 
-  /// Mutex to guard access to AssociatedAllocations
-  std::mutex AllocationsMutex;
+  /// Mutex to guard access to AssociatedAllocations and the Queue
+  std::mutex Mutex;
 
   /// The kernel launch environment used to issue a kernel. Stored here to
   /// ensure it is a valid location while the transfer to the device is
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 0a22d2b33cc98..d33dbb116143b 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2232,17 +2232,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                   AMDGPUStreamTy *&Stream) {
-    std::lock_guard<std::mutex> StreamLock{StreamMutex};
-    // Get the stream (if any) from the async info.
-    Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
-    if (!Stream) {
-      // There was no stream; get an idle one.
-      if (auto Err = AMDGPUStreamManager.getResource(Stream))
-        return Err;
-
-      // Modify the async info's stream.
-      AsyncInfoWrapper.setQueueAs<AMDGPUStreamTy *>(Stream);
-    }
+    auto WrapperStream =
+        AsyncInfoWrapper.getOrInitQueue<AMDGPUStreamTy *>(AMDGPUStreamManager);
+    if (!WrapperStream)
+      return WrapperStream.takeError();
+    Stream = *WrapperStream;
     return Plugin::success();
   }
 
@@ -3072,9 +3066,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// True is the system is configured with XNACK-Enabled.
   /// False otherwise.
   bool IsXnackEnabled = false;
-
-  /// Mutex to guard getting/setting the stream
-  std::mutex StreamMutex;
 };
 
 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 27c4f786c10d0..7d11f23b4cdfc 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -60,6 +60,7 @@ struct GenericPluginTy;
 struct GenericKernelTy;
 struct GenericDeviceTy;
 struct RecordReplayTy;
+template <typename ResourceRef> class GenericDeviceResourceManagerTy;
 
 namespace Plugin {
 /// Create a success error. This is the same as calling Error::success(), but
@@ -127,6 +128,20 @@ struct AsyncInfoWrapperTy {
     AsyncInfoPtr->Queue = Queue;
   }
 
+  /// Get the queue, using the provided resource manager to initialise it if it
+  /// doesn't exist.
+  template <typename Ty, typename RMTy>
+  Expected<Ty>
+  getOrInitQueue(GenericDeviceResourceManagerTy<RMTy> &ResourceManager) {
+    std::lock_guard<std::mutex> Lock(AsyncInfoPtr->Mutex);
+    if (!AsyncInfoPtr->Queue) {
+      if (auto Err = ResourceManager.getResource(
+              *reinterpret_cast<Ty *>(&AsyncInfoPtr->Queue)))
+        return Err;
+    }
+    return getQueueAs<Ty>();
+  }
+
   /// Synchronize with the __tgt_async_info's pending operations if it's the
   /// internal async info. The error associated to the asynchronous operations
   /// issued in this queue must be provided in \p Err. This function will update
@@ -138,7 +153,7 @@ struct AsyncInfoWrapperTy {
   /// Register \p Ptr as an associated allocation that is freed after
   /// finalization.
   void freeAllocationAfterSynchronization(void *Ptr) {
-    std::lock_guard<std::mutex> AllocationGuard{AsyncInfoPtr->AllocationsMutex};
+    std::lock_guard<std::mutex> AllocationGuard{AsyncInfoPtr->Mutex};
     AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
   }
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index fa01fb197d7df..7a1cedb4da49e 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1337,7 +1337,7 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
                                    bool RemoveQueue) {
-  std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->AllocationsMutex};
+  std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
 
   if (!AsyncInfo || !AsyncInfo->Queue)
     return Plugin::error(ErrorCode::INVALID_ARGUMENT,
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 9536e5863c569..fd3a62b0c715e 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -522,17 +522,11 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
-    std::lock_guard<std::mutex> StreamLock{StreamMutex};
-    // Get the stream (if any) from the async info.
-    Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
-    if (!Stream) {
-      // There was no stream; get an idle one.
-      if (auto Err = CUDAStreamManager.getResource(Stream))
-        return Err;
-
-      // Modify the async info's stream.
-      AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
-    }
+    auto WrapperStream =
+        AsyncInfoWrapper.getOrInitQueue<CUstream>(CUDAStreamManager);
+    if (!WrapperStream)
+      return WrapperStream.takeError();
+    Stream = *WrapperStream;
     return Plugin::success();
   }
 
@@ -1293,9 +1287,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
   /// The maximum number of warps that can be resident on all the SMs
   /// simultaneously.
   uint32_t HardwareParallelism = 0;
-
-  /// Mutex to guard getting/setting the stream
-  std::mutex StreamMutex;
 };
 
 Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,

>From 8feee57c70b5763112766a9baf7d0749d995999a Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 22 Jul 2025 11:24:34 +0100
Subject: [PATCH 05/10] Delete allocations outside of lock

---
 .../common/src/PluginInterface.cpp            | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 7a1cedb4da49e..2b847498fc657 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1337,19 +1337,23 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
                                    bool RemoveQueue) {
-  std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
+  SmallVector<void *, 2> AllocsToDelete{};
+  {
+    std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
 
-  if (!AsyncInfo || !AsyncInfo->Queue)
-    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                         "invalid async info queue");
+    if (!AsyncInfo || !AsyncInfo->Queue)
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                           "invalid async info queue");
 
-  if (auto Err = synchronizeImpl(*AsyncInfo, RemoveQueue))
-    return Err;
+    if (auto Err = synchronizeImpl(*AsyncInfo, RemoveQueue))
+      return Err;
+
+    std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
+  }
 
-  for (auto *Ptr : AsyncInfo->AssociatedAllocations)
+  for (auto *Ptr : AllocsToDelete)
     if (auto Err = dataDelete(Ptr, TargetAllocTy::TARGET_ALLOC_DEVICE))
       return Err;
-  AsyncInfo->AssociatedAllocations.clear();
 
   return Plugin::success();
 }

>From 94a61fe31011a0a6be4ad8b08d331ea81a814a8e Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 22 Jul 2025 12:40:22 +0100
Subject: [PATCH 06/10] PR response

---
 offload/liboffload/src/OffloadImpl.cpp                 |  8 +++++---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp             |  4 ++--
 .../plugins-nextgen/common/include/PluginInterface.h   |  8 +++++---
 offload/plugins-nextgen/common/src/PluginInterface.cpp |  4 ++--
 offload/plugins-nextgen/cuda/src/rtl.cpp               | 10 +++++-----
 offload/plugins-nextgen/host/src/rtl.cpp               |  2 +-
 6 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index e56ef941b1c7e..a2f087eabba60 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -208,7 +208,7 @@ Error initPlugins(OffloadContext &Context) {
 }
 
 Error olInit_impl() {
-  std::lock_guard<std::mutex> Lock{OffloadContextValMutex};
+  std::lock_guard<std::mutex> Lock(OffloadContextValMutex);
 
   if (isOffloadInitialized()) {
     OffloadContext::get().RefCount++;
@@ -226,7 +226,7 @@ Error olInit_impl() {
 }
 
 Error olShutDown_impl() {
-  std::lock_guard<std::mutex> Lock{OffloadContextValMutex};
+  std::lock_guard<std::mutex> Lock(OffloadContextValMutex);
 
   if (--OffloadContext::get().RefCount != 0)
     return Error::success();
@@ -487,6 +487,8 @@ Error olSyncQueue_impl(ol_queue_handle_t Queue) {
   // Host plugin doesn't have a queue set so it's not safe to call synchronize
   // on it, but we have nothing to synchronize in that situation anyway.
   if (Queue->AsyncInfo->Queue) {
+    // We don't need to release the queue and we would like the ability for
+    // other offload threads to submit work concurrently, so pass "false" here.
     if (auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo, false))
       return Err;
   }
@@ -735,7 +737,7 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name,
                        ol_symbol_kind_t Kind, ol_symbol_handle_t *Symbol) {
   auto &Device = Program->Image->getDevice();
 
-  std::lock_guard<std::mutex> Lock{Program->SymbolListMutex};
+  std::lock_guard<std::mutex> Lock(Program->SymbolListMutex);
 
   switch (Kind) {
   case OL_SYMBOL_KIND_KERNEL: {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index d33dbb116143b..796182075ff3d 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2292,7 +2292,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
   /// Synchronize current thread with the pending operations on the async info.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
-                        bool RemoveQueue) override {
+                        bool ReleaseQueue) override {
     AMDGPUStreamTy *Stream =
         reinterpret_cast<AMDGPUStreamTy *>(AsyncInfo.Queue);
     assert(Stream && "Invalid stream");
@@ -2303,7 +2303,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
     // own tasks.
-    if (RemoveQueue) {
+    if (ReleaseQueue) {
       AsyncInfo.Queue = nullptr;
       return AMDGPUStreamManager.returnResource(Stream);
     }
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 7d11f23b4cdfc..771ba80f9eb04 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -843,10 +843,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
 
   /// Synchronize the current thread with the pending operations on the
-  /// __tgt_async_info structure.
-  Error synchronize(__tgt_async_info *AsyncInfo, bool RemoveQueue = true);
+  /// __tgt_async_info structure. If ReleaseQueue is false, then the
+  // underlying queue will not be released. In this case, additional
+  // work may be submitted to the queue whilst a synchronize is running.
+  Error synchronize(__tgt_async_info *AsyncInfo, bool ReleaseQueue = true);
   virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo,
-                                bool RemoveQueue) = 0;
+                                bool ReleaseQueue) = 0;
 
   /// Invokes any global constructors on the device if present and is required
   /// by the target.
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 2b847498fc657..8614c75f7cea9 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1336,7 +1336,7 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 }
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
-                                   bool RemoveQueue) {
+                                   bool ReleaseQueue) {
   SmallVector<void *, 2> AllocsToDelete{};
   {
     std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};
@@ -1345,7 +1345,7 @@ Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
       return Plugin::error(ErrorCode::INVALID_ARGUMENT,
                            "invalid async info queue");
 
-    if (auto Err = synchronizeImpl(*AsyncInfo, RemoveQueue))
+    if (auto Err = synchronizeImpl(*AsyncInfo, ReleaseQueue))
       return Err;
 
     std::swap(AllocsToDelete, AsyncInfo->AssociatedAllocations);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index fd3a62b0c715e..f3f3783b3ce7c 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -638,15 +638,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   /// Synchronize current thread with the pending operations on the async info.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
-                        bool RemoveQueue) override {
+                        bool ReleaseQueue) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
     CUresult Res;
     Res = cuStreamSynchronize(Stream);
 
-    // Once the stream is synchronized, return it to stream pool and reset
-    // AsyncInfo. This is to make sure the synchronization only works for its
-    // own tasks.
-    if (RemoveQueue) {
+    // Once the stream is synchronized and we want to release the queue, return
+    // it to stream pool and reset AsyncInfo. This is to make sure the
+    // synchronization only works for its own tasks.
+    if (ReleaseQueue) {
       AsyncInfo.Queue = nullptr;
       if (auto Err = CUDAStreamManager.returnResource(Stream))
         return Err;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 272d58e87fd18..ed5213531999d 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -298,7 +298,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   /// All functions are already synchronous. No need to do anything on this
   /// synchronization function.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo,
-                        bool RemoveQueue) override {
+                        bool ReleaseQueue) override {
     return Plugin::success();
   }
 

>From a45d8b09aca50e9c16df41ca9735b316b583c8e4 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 22 Jul 2025 13:05:52 +0100
Subject: [PATCH 07/10] Improved mutex comment

---
 offload/include/Shared/APITypes.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index fdc36c30ec485..046f10cf9bb48 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -77,7 +77,9 @@ struct __tgt_async_info {
   /// should be freed after finalization.
   llvm::SmallVector<void *, 2> AssociatedAllocations;
 
-  /// Mutex to guard access to AssociatedAllocations and the Queue
+  /// Mutex to guard access to AssociatedAllocations and the Queue.
+  /// This is only used for liboffload and should be ignored in libomptarget
+  /// code.
   std::mutex Mutex;
 
   /// The kernel launch environment used to issue a kernel. Stored here to

>From 02886c1eab5ae1172d8066ec31ad957ea784f64a Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Tue, 5 Aug 2025 16:05:21 +0100
Subject: [PATCH 08/10] Respond to feedback

---
 offload/include/Shared/APITypes.h                        | 2 --
 offload/liboffload/src/OffloadImpl.cpp                   | 3 ++-
 offload/plugins-nextgen/common/include/PluginInterface.h | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 046f10cf9bb48..8c150b6bfc2d4 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -78,8 +78,6 @@ struct __tgt_async_info {
   llvm::SmallVector<void *, 2> AssociatedAllocations;
 
   /// Mutex to guard access to AssociatedAllocations and the Queue.
-  /// This is only used for liboffload and should be ignored in libomptarget
-  /// code.
   std::mutex Mutex;
 
   /// The kernel launch environment used to issue a kernel. Stored here to
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index a2f087eabba60..12f62122a00e8 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -488,7 +488,8 @@ Error olSyncQueue_impl(ol_queue_handle_t Queue) {
   // on it, but we have nothing to synchronize in that situation anyway.
   if (Queue->AsyncInfo->Queue) {
     // We don't need to release the queue and we would like the ability for
-    // other offload threads to submit work concurrently, so pass "false" here.
+    // other offload threads to submit work concurrently, so pass "false" here
+    // so we don't release the underlying queue object.
     if (auto Err = Queue->Device->Device->synchronize(Queue->AsyncInfo, false))
       return Err;
   }
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 771ba80f9eb04..c9ab34b024b77 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -153,7 +153,7 @@ struct AsyncInfoWrapperTy {
   /// Register \p Ptr as an associated allocation that is freed after
   /// finalization.
   void freeAllocationAfterSynchronization(void *Ptr) {
-    std::lock_guard<std::mutex> AllocationGuard{AsyncInfoPtr->Mutex};
+    std::lock_guard<std::mutex> AllocationGuard(AsyncInfoPtr->Mutex);
     AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
   }
 

>From a9ec7b0750580a9946e1ea5813e5c1b60c2acfab Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Wed, 6 Aug 2025 11:13:48 +0100
Subject: [PATCH 09/10] Update calls after bump

---
 offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 2b4fe915450da..1dac8c50271b5 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -114,9 +114,9 @@ TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
     } Args{Mem};
 
     ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
-                                  &LaunchArgs, nullptr));
+                                  &LaunchArgs));
 
-    ASSERT_SUCCESS(olWaitQueue(Queue));
+    ASSERT_SUCCESS(olSyncQueue(Queue));
 
     uint32_t *Data = (uint32_t *)Mem;
     for (uint32_t i = 0; i < 64; i++) {

>From f7921092f5e184978d9fe5a5273233732482af00 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Thu, 7 Aug 2025 15:36:29 +0100
Subject: [PATCH 10/10] Use default SmallVector size

---
 offload/plugins-nextgen/common/src/PluginInterface.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 8614c75f7cea9..083d41659a469 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1337,7 +1337,7 @@ Error PinnedAllocationMapTy::unlockUnmappedHostBuffer(void *HstPtr) {
 
 Error GenericDeviceTy::synchronize(__tgt_async_info *AsyncInfo,
                                    bool ReleaseQueue) {
-  SmallVector<void *, 2> AllocsToDelete{};
+  SmallVector<void *> AllocsToDelete{};
   {
     std::lock_guard<std::mutex> AllocationGuard{AsyncInfo->Mutex};