[llvm] [OFFLOAD] Improve resource management of the plugin (PR #187597)

via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 19 14:59:51 PDT 2026


https://github.com/fineg74 created https://github.com/llvm/llvm-project/pull/187597

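This PR improves event management in the Level Zero plugin by fixing potential resource leaks (events are now released when a command fails to enqueue or a synchronization fails) and by preventing a potential deadlock (the kernel mutex is now held through a `std::unique_lock`, so it is released on early-return error paths instead of staying locked).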

>From 7e9c8f3fa52b5cd1dfa524d3af41031606ebde0a Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Thu, 19 Mar 2026 14:57:50 -0700
Subject: [PATCH] Improve resource management of the plugin

---
 .../level_zero/src/L0Device.cpp               | 38 +++++++++++++-----
 .../level_zero/src/L0Kernel.cpp               | 40 ++++++++++++-------
 2 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 651d133570f8d..2b321586706c1 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -806,6 +806,10 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
                                       bool CopyTo) {
   const bool Ordered =
       (getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+  auto CmdListOrError = getImmCopyCmdList();
+  if (!CmdListOrError)
+    return CmdListOrError.takeError();
+  const auto CmdList = *CmdListOrError;
   auto EventOrErr = getEvent();
   if (!EventOrErr)
     return EventOrErr.takeError();
@@ -823,14 +827,22 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
     else
       NumWaitEvents = 0;
   }
-  auto CmdListOrError = getImmCopyCmdList();
-  if (!CmdListOrError)
-    return CmdListOrError.takeError();
-  const auto CmdList = *CmdListOrError;
-  CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+  
+  Error SyncErrors = Error::success();
+  auto addError = [&](Error Err) {
+    SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
+  };
+
+  CALL_ZE_HANDLE_ERROR(addError, zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
                     SignalEvent, NumWaitEvents, WaitEvents);
-  AsyncQueue->WaitEvents.push_back(SignalEvent);
-  return Plugin::success();
+  if (!SyncErrors)
+    AsyncQueue->WaitEvents.push_back(SignalEvent);
+  else{
+    if (auto Err = releaseEvent(SignalEvent))
+      addError(std::move(Err));
+  }
+
+  return SyncErrors;
 }
 
 /// Enqueue memory fill.
@@ -844,10 +856,18 @@ Error L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
     auto EventOrErr = getEvent();
     if (!EventOrErr)
       return EventOrErr.takeError();
+    Error SyncErrors = Error::success();
+    auto addError = [&](Error Err) {
+      SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
+    };
     ze_event_handle_t Event = *EventOrErr;
-    CALL_ZE_RET_ERROR(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+    CALL_ZE_HANDLE_ERROR(addError,zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
                       PatternSize, Size, Event, 0, nullptr);
-    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
+    if (!SyncErrors)
+      CALL_ZE_HANDLE_ERROR(addError,zeEventHostSynchronize, Event, L0DefaultTimeout);
+    if (auto Err = releaseEvent(Event))
+      addError(std::move(Err));
+    return SyncErrors;
   } else {
     auto CmdListOrErr = getCopyCmdList();
     if (!CmdListOrErr)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 1bffbbcd2fe92..6c092b4e01017 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -253,7 +253,8 @@ Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
 static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
                                         ze_kernel_handle_t zeKernel,
                                         L0LaunchEnvTy &KEnv,
-                                        CommandModeTy CommandMode) {
+                                        CommandModeTy CommandMode,
+                                        std::unique_lock<std::mutex> &Lock) {
   const auto DeviceId = l0Device.getDeviceId();
   auto *IdStr = l0Device.getZeIdCStr();
   auto CmdListOrErr = l0Device.getImmCmdList();
@@ -282,9 +283,18 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
   }
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Kernel depends on %zu data copying events.\n", NumWaitEvents);
-  CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+  Error SyncErrors = Error::success();
+  auto addError = [&](Error Err) {
+    SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
+  };
+  CALL_ZE_HANDLE_ERROR(addError, zeCommandListAppendLaunchKernel, CmdList, zeKernel,
                     &KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
-  KEnv.KernelPR.Mtx.unlock();
+  Lock.unlock();
+  if (SyncErrors) {
+    if (auto Err = l0Device.releaseEvent(Event))
+      addError(std::move(Err));
+    return SyncErrors;
+  }
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
        "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
 
@@ -292,20 +302,22 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
     AsyncQueue->WaitEvents.push_back(Event);
     AsyncQueue->KernelEvent = Event;
   } else {
-    CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
+    CALL_ZE_HANDLE_ERROR(addError, zeEventHostSynchronize, Event, L0DefaultTimeout);
     if (auto Err = l0Device.releaseEvent(Event))
-      return Err;
+      addError(std::move(Err));
   }
-  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-       "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
-       IdStr);
+  if (!SyncErrors)
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+        "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
+        IdStr);
 
-  return Plugin::success();
+  return SyncErrors;
 }
 
 static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
                                       ze_kernel_handle_t zeKernel,
-                                      L0LaunchEnvTy &KEnv) {
+                                      L0LaunchEnvTy &KEnv,
+                                      std::unique_lock<std::mutex> &Lock) {
   const auto DeviceId = l0Device.getDeviceId();
   const auto *IdStr = l0Device.getZeIdCStr();
 
@@ -324,7 +336,7 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
   ze_event_handle_t Event = nullptr;
   CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
                     &KEnv.GroupCounts, Event, 0, nullptr);
-  KEnv.KernelPR.Mtx.unlock();
+  Lock.unlock();
   CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
   CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
                         CmdQueue, 1, &CmdList, nullptr);
@@ -445,7 +457,7 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);
 
   // Protect from kernel preparation to submission as kernels are shared.
-  KernelPR.Mtx.lock();
+  std::unique_lock<std::mutex> Lock(KernelPR.Mtx);
 
   if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
     return Err;
@@ -471,9 +483,9 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   const bool UseImmCmdList = l0Device.useImmForCompute();
   if (UseImmCmdList)
     return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv,
-                                      Options.CommandMode);
+                                      Options.CommandMode, Lock);
 
-  return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv);
+  return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv, Lock);
 }
 
 } // namespace llvm::omp::target::plugin



More information about the llvm-commits mailing list