[llvm] [OFFLOAD] Improve resource management of the plugin (PR #187597)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 19 14:59:51 PDT 2026
https://github.com/fineg74 created https://github.com/llvm/llvm-project/pull/187597
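This PR improves event management in the Level Zero plugin: events are now released on error paths that previously leaked them, and the kernel mutex is held through a std::unique_lock so that an early error return can no longer leave it locked (a potential deadlock).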
From 7e9c8f3fa52b5cd1dfa524d3af41031606ebde0a Mon Sep 17 00:00:00 2001
From: "Fine, Gregory" <gregory.fine at intel.com>
Date: Thu, 19 Mar 2026 14:57:50 -0700
Subject: [PATCH] Improve resource management of the plugin
---
.../level_zero/src/L0Device.cpp | 38 +++++++++++++-----
.../level_zero/src/L0Kernel.cpp | 40 ++++++++++++-------
2 files changed, 55 insertions(+), 23 deletions(-)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
index 651d133570f8d..2b321586706c1 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp
@@ -806,6 +806,10 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
bool CopyTo) {
const bool Ordered =
(getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
+ auto CmdListOrError = getImmCopyCmdList();
+ if (!CmdListOrError)
+ return CmdListOrError.takeError();
+ const auto CmdList = *CmdListOrError;
auto EventOrErr = getEvent();
if (!EventOrErr)
return EventOrErr.takeError();
@@ -823,14 +827,22 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
else
NumWaitEvents = 0;
}
- auto CmdListOrError = getImmCopyCmdList();
- if (!CmdListOrError)
- return CmdListOrError.takeError();
- const auto CmdList = *CmdListOrError;
- CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
+
+ Error SyncErrors = Error::success();
+ auto addError = [&](Error Err) {
+ SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
+ };
+
+ CALL_ZE_HANDLE_ERROR(addError, zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
SignalEvent, NumWaitEvents, WaitEvents);
- AsyncQueue->WaitEvents.push_back(SignalEvent);
- return Plugin::success();
+ if (!SyncErrors)
+ AsyncQueue->WaitEvents.push_back(SignalEvent);
+ else{
+ if (auto Err = releaseEvent(SignalEvent))
+ addError(std::move(Err));
+ }
+
+ return SyncErrors;
}
/// Enqueue memory fill.
@@ -844,10 +856,18 @@ Error L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
auto EventOrErr = getEvent();
if (!EventOrErr)
return EventOrErr.takeError();
+ Error SyncErrors = Error::success();
+ auto addError = [&](Error Err) {
+ SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
+ };
ze_event_handle_t Event = *EventOrErr;
- CALL_ZE_RET_ERROR(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
+ CALL_ZE_HANDLE_ERROR(addError,zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
PatternSize, Size, Event, 0, nullptr);
- CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
+ if (!SyncErrors)
+ CALL_ZE_HANDLE_ERROR(addError,zeEventHostSynchronize, Event, L0DefaultTimeout);
+ if (auto Err = releaseEvent(Event))
+ addError(std::move(Err));
+ return SyncErrors;
} else {
auto CmdListOrErr = getCopyCmdList();
if (!CmdListOrErr)
diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
index 1bffbbcd2fe92..6c092b4e01017 100644
--- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
+++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp
@@ -253,7 +253,8 @@ Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams,
static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
ze_kernel_handle_t zeKernel,
L0LaunchEnvTy &KEnv,
- CommandModeTy CommandMode) {
+ CommandModeTy CommandMode,
+ std::unique_lock<std::mutex> &Lock) {
const auto DeviceId = l0Device.getDeviceId();
auto *IdStr = l0Device.getZeIdCStr();
auto CmdListOrErr = l0Device.getImmCmdList();
@@ -282,9 +283,18 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Kernel depends on %zu data copying events.\n", NumWaitEvents);
- CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
+ Error SyncErrors = Error::success();
+ auto addError = [&](Error Err) {
+ SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
+ };
+ CALL_ZE_HANDLE_ERROR(addError, zeCommandListAppendLaunchKernel, CmdList, zeKernel,
&KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
- KEnv.KernelPR.Mtx.unlock();
+ Lock.unlock();
+ if (SyncErrors) {
+ if (auto Err = l0Device.releaseEvent(Event))
+ addError(std::move(Err));
+ return SyncErrors;
+ }
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
@@ -292,20 +302,22 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
AsyncQueue->WaitEvents.push_back(Event);
AsyncQueue->KernelEvent = Event;
} else {
- CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
+ CALL_ZE_HANDLE_ERROR(addError, zeEventHostSynchronize, Event, L0DefaultTimeout);
if (auto Err = l0Device.releaseEvent(Event))
- return Err;
+ addError(std::move(Err));
}
- INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
- "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
- IdStr);
+ if (!SyncErrors)
+ INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+ "Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
+ IdStr);
- return Plugin::success();
+ return SyncErrors;
}
static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
ze_kernel_handle_t zeKernel,
- L0LaunchEnvTy &KEnv) {
+ L0LaunchEnvTy &KEnv,
+ std::unique_lock<std::mutex> &Lock) {
const auto DeviceId = l0Device.getDeviceId();
const auto *IdStr = l0Device.getZeIdCStr();
@@ -324,7 +336,7 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
ze_event_handle_t Event = nullptr;
CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
&KEnv.GroupCounts, Event, 0, nullptr);
- KEnv.KernelPR.Mtx.unlock();
+ Lock.unlock();
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
CmdQueue, 1, &CmdList, nullptr);
@@ -445,7 +457,7 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);
// Protect from kernel preparation to submission as kernels are shared.
- KernelPR.Mtx.lock();
+ std::unique_lock<std::mutex> Lock(KernelPR.Mtx);
if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
return Err;
@@ -471,9 +483,9 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
const bool UseImmCmdList = l0Device.useImmForCompute();
if (UseImmCmdList)
return launchKernelWithImmCmdList(l0Device, zeKernel, KEnv,
- Options.CommandMode);
+ Options.CommandMode, Lock);
- return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv);
+ return launchKernelWithCmdQueue(l0Device, zeKernel, KEnv, Lock);
}
} // namespace llvm::omp::target::plugin
More information about the llvm-commits mailing list