[llvm] RFC: [Offload] Design for async error handling (PR #155596)
Ross Brunton via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 27 04:22:48 PDT 2025
https://github.com/RossBrunton created https://github.com/llvm/llvm-project/pull/155596
This updates the spec to provide a way for async errors to be signaled
from, for example, kernels. The error is stored on the queue and can be
queried with `olGetQueueError`. In addition, if any other queues are
waiting on the error'd queue they will also enter the error state.
With this design, both `olSyncEvent` and `olSyncQueue` will now exit
early on error. More specifically, unless a kernel gets trapped in an
infinite loop, both sync functions will always return in a finite amount
of time.
From 4888933ab462333b0fe8cf3a7b2b8b048b1cb55f Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Wed, 27 Aug 2025 12:18:04 +0100
Subject: [PATCH] [Offload] Design for async error handling
This updates the spec to provide a way for async errors to be signaled
from, for example, kernels. The error is stored on the queue and can be
queried with `olGetQueueError`. In addition, if any other queues are
waiting on the error'd queue they will also enter the error state.
With this design, both `olSyncEvent` and `olSyncQueue` will now exit
early on error. More specifically, unless a kernel gets trapped in an
infinite loop, both sync functions will always return in a finite amount
of time.
---
offload/liboffload/API/Common.td | 1 +
offload/liboffload/API/Event.td | 8 ++++++--
offload/liboffload/API/Queue.td | 27 ++++++++++++++++++++++++--
offload/liboffload/src/OffloadImpl.cpp | 7 +++++++
4 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index ac27d85b6c964..3035347d2ceca 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -90,6 +90,7 @@ def ol_errc_t : Enum {
Etor<"COMPILE_FAILURE", "jit compile failure while processing binary image">,
Etor<"LINK_FAILURE", "linker failure while processing binary image">,
Etor<"BACKEND_FAILURE", "the plugin backend is in an invalid or unsupported state">,
+ Etor<"QUEUE_ERROR", "the queue entered an error state">,
Etor<"UNINITIALIZED", "not initialized">,
// Handle related errors - only makes sense for liboffload
diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
index 075bf5bafaa64..19eec11c4f626 100644
--- a/offload/liboffload/API/Event.td
+++ b/offload/liboffload/API/Event.td
@@ -33,11 +33,15 @@ def olDestroyEvent : Function {
def olSyncEvent : Function {
let desc = "Block the calling thread until the event is complete.";
- let details = [];
+ let details = [
+ "If the queue or any dependencies encounter an error, this returns early and no work after the error will be complete."
+ ];
let params = [
Param<"ol_event_handle_t", "Event", "handle of the event", PARAM_IN>
];
- let returns = [];
+ let returns = [
+ Return<"OL_ERRC_QUEUE_ERROR", ["The queue associated with this event or any of its dependencies encountered an error"]>,
+ ];
}
def ol_event_info_t : Enum {
diff --git a/offload/liboffload/API/Queue.td b/offload/liboffload/API/Queue.td
index ededa9cc92fef..43cd8a9a63bb2 100644
--- a/offload/liboffload/API/Queue.td
+++ b/offload/liboffload/API/Queue.td
@@ -33,11 +33,15 @@ def olDestroyQueue : Function {
def olSyncQueue : Function {
let desc = "Block the calling thread until the enqueued work on a queue is complete.";
- let details = [];
+ let details = [
+ "If the queue or any dependencies encounter an error, this returns early and no work after the error will be complete."
+ ];
let params = [
Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>
];
- let returns = [];
+ let returns = [
+ Return<"OL_ERRC_QUEUE_ERROR", ["The queue or any of its dependencies encountered an error"]>,
+ ];
}
def olWaitEvents : Function {
@@ -45,6 +49,7 @@ def olWaitEvents : Function {
let details = [
"All events in `Events` must complete before the queue is unblocked.",
"The input events can be from any queue on any device provided by the same platform as `Queue`.",
+ "If an event in `Events` belongs to a queue other than `Queue`, a dependency is created. If that event's queue enters the error state, then `Queue` will also enter the error state.",
];
let params = [
Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
@@ -125,3 +130,21 @@ def olLaunchHostFunction : Function {
];
let returns = [];
}
+
+def olGetQueueError : Function {
+ let desc = "Gets the error from a queue or any of its dependencies in the error state.";
+ let details = [
+ "If the queue is not in the error state, OL_SUCCESS is written",
+ "Dependencies are created using `olWaitEvents`; if any waited-on queue enters the error state, then this queue will also be in the error state",
+ "The error is not cleared; there is no way to recover a queue in the error state",
+ ];
+ let params = [
+ Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+ Param<"ol_result_t *", "Error", "output location to put the queue error", PARAM_OUT>,
+ Param<"ol_queue_handle_t *", "FailingQueue", "output location to put the queue that encountered an error", PARAM_OUT_OPTIONAL>,
+
+ ];
+ let returns = [
+ Return<"OL_ERRC_INVALID_QUEUE">
+ ];
+}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 9d342e06127a2..d47ee74978b9f 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -644,6 +644,13 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName,
return olGetQueueInfoImplDetail(Queue, PropName, 0, nullptr, PropSizeRet);
}
+Error olGetQueueError_impl(ol_queue_handle_t Queue, ol_result_t *Error,
+ ol_queue_handle_t *ErrQueue) {
+ // TODO
+ *Error = nullptr;
+ return Error::success();
+}
+
Error olSyncEvent_impl(ol_event_handle_t Event) {
// No event info means that this event was complete on creation
if (!Event->EventInfo)
More information about the llvm-commits
mailing list