[llvm] RFC: [Offload] Design for async error handling (PR #155596)

Ross Brunton via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 27 04:22:48 PDT 2025


https://github.com/RossBrunton created https://github.com/llvm/llvm-project/pull/155596

This updates the spec to provide a way for async errors to be signaled
from, for example, kernels. The error is stored on the queue and can be
queried with `olGetQueueError`. In addition, any other queue that is
waiting on the errored queue will also enter the error state.

With this design, both `olSyncEvent` and `olSyncQueue` will now exit
early on error. More specifically, unless a kernel gets trapped in an
infinite loop, both sync functions will always return in a finite amount
of time.


>From 4888933ab462333b0fe8cf3a7b2b8b048b1cb55f Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross at codeplay.com>
Date: Wed, 27 Aug 2025 12:18:04 +0100
Subject: [PATCH] [Offload] Design for async error handling

This updates the spec to provide a way for async errors to be signaled
from, for example, kernels. The error is stored on the queue and can be
queried with `olGetQueueError`. In addition, any other queue that is
waiting on the errored queue will also enter the error state.

With this design, both `olSyncEvent` and `olSyncQueue` will now exit
early on error. More specifically, unless a kernel gets trapped in an
infinite loop, both sync functions will always return in a finite amount
of time.
---
 offload/liboffload/API/Common.td       |  1 +
 offload/liboffload/API/Event.td        |  8 ++++++--
 offload/liboffload/API/Queue.td        | 27 ++++++++++++++++++++++++--
 offload/liboffload/src/OffloadImpl.cpp |  7 +++++++
 4 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/offload/liboffload/API/Common.td b/offload/liboffload/API/Common.td
index ac27d85b6c964..3035347d2ceca 100644
--- a/offload/liboffload/API/Common.td
+++ b/offload/liboffload/API/Common.td
@@ -90,6 +90,7 @@ def ol_errc_t : Enum {
     Etor<"COMPILE_FAILURE", "jit compile failure while processing binary image">,
     Etor<"LINK_FAILURE", "linker failure while processing binary image">,
     Etor<"BACKEND_FAILURE", "the plugin backend is in an invalid or unsupported state">,
+    Etor<"QUEUE_ERROR", "the queue entered an error state">,
     Etor<"UNINITIALIZED", "not initialized">,
 
     // Handle related errors - only makes sense for liboffload
diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
index 075bf5bafaa64..19eec11c4f626 100644
--- a/offload/liboffload/API/Event.td
+++ b/offload/liboffload/API/Event.td
@@ -33,11 +33,15 @@ def olDestroyEvent : Function {
 
 def olSyncEvent : Function {
     let desc = "Block the calling thread until the event is complete.";
-    let details = [];
+    let details = [
+      "If the queue or any dependencies encounter an error, this returns early and no work after the error will be complete."
+    ];
     let params = [
         Param<"ol_event_handle_t", "Event", "handle of the event", PARAM_IN>
     ];
-    let returns = [];
+    let returns = [
+      Return<"OL_ERRC_QUEUE_ERROR", ["The queue associated with this event or any of its dependencies encountered an error"]>,
+    ];
 }
 
 def ol_event_info_t : Enum {
diff --git a/offload/liboffload/API/Queue.td b/offload/liboffload/API/Queue.td
index ededa9cc92fef..43cd8a9a63bb2 100644
--- a/offload/liboffload/API/Queue.td
+++ b/offload/liboffload/API/Queue.td
@@ -33,11 +33,15 @@ def olDestroyQueue : Function {
 
 def olSyncQueue : Function {
     let desc = "Block the calling thread until the enqueued work on a queue is complete.";
-    let details = [];
+    let details = [
+      "If the queue or any dependencies encounter an error, this returns early and no work after the error will be complete."
+    ];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>
     ];
-    let returns = [];
+    let returns = [
+      Return<"OL_ERRC_QUEUE_ERROR", ["The queue or any of its dependencies encountered an error"]>,
+    ];
 }
 
 def olWaitEvents : Function {
@@ -45,6 +49,7 @@ def olWaitEvents : Function {
     let details = [
       "All events in `Events` must complete before the queue is unblocked.",
       "The input events can be from any queue on any device provided by the same platform as `Queue`.",
+      "If `Event`'s queue is different from `Queue`, a dependency is created. If `Event`'s queue enters the error state, then Queue will also enter the error state.",
     ];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
@@ -125,3 +130,21 @@ def olLaunchHostFunction : Function {
   ];
   let returns = [];
 }
+
+def olGetQueueError : Function {
+  let desc = "Gets the error from a queue or any of its dependencies in the error state.";
+  let details = [
+    "If the queue is not in the error state, OL_SUCCESS is written",
+    "Dependencies are created using `olWaitEvents`, if any waited on queue enters the error state then this queue will also be in the error state",
+    "The error is not cleared; there is no way to recover a queue in the error state",
+  ];
+  let params = [
+    Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN>,
+    Param<"ol_result_t *", "Error", "output location to put the queue error", PARAM_OUT>,
+    Param<"ol_queue_handle_t *", "FailingQueue", "output location to put the queue that encountered an error", PARAM_OUT_OPTIONAL>,
+
+  ];
+  let returns = [
+    Return<"OL_ERRC_INVALID_QUEUE">
+  ];
+}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 9d342e06127a2..d47ee74978b9f 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -644,6 +644,13 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName,
   return olGetQueueInfoImplDetail(Queue, PropName, 0, nullptr, PropSizeRet);
 }
 
+Error olGetQueueError_impl(ol_queue_handle_t Queue, ol_result_t *Error,
+                           ol_queue_handle_t *ErrQueue) {
+  // TODO: Implement; for now stores nullptr in *Error and ignores ErrQueue.
+  *Error = nullptr;
+  return Error::success();
+}
+
 Error olSyncEvent_impl(ol_event_handle_t Event) {
   // No event info means that this event was complete on creation
   if (!Event->EventInfo)



More information about the llvm-commits mailing list