[llvm] [Offload] Add support for measuring elapsed time between events (PR #186856)

Leandro Lacerda via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 30 18:46:01 PDT 2026


https://github.com/leandrolcampos updated https://github.com/llvm/llvm-project/pull/186856

>From 1788404f910795fcfd1183393e06e184904b1aac Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:30:59 -0300
Subject: [PATCH 01/16] Add offload event timing

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/liboffload/API/Event.td               |  17 +-
 offload/liboffload/src/OffloadImpl.cpp        |  60 ++++---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |   7 +
 .../common/include/PluginInterface.h          |  10 ++
 .../common/src/PluginInterface.cpp            |  21 +++
 .../cuda/dynamic_cuda/cuda.cpp                |   1 +
 .../plugins-nextgen/cuda/dynamic_cuda/cuda.h  |   1 +
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  10 ++
 offload/plugins-nextgen/host/src/rtl.cpp      |   6 +
 .../level_zero/include/L0Device.h             |   6 +
 offload/unittests/OffloadAPI/CMakeLists.txt   |   1 +
 .../event/olGetEventElapsedTime.cpp           | 146 ++++++++++++++++++
 12 files changed, 266 insertions(+), 20 deletions(-)
 create mode 100644 offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp

diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
index 075bf5bafaa64..be77500562a19 100644
--- a/offload/liboffload/API/Event.td
+++ b/offload/liboffload/API/Event.td
@@ -13,7 +13,8 @@
 def olCreateEvent : Function {
     let desc = "Enqueue an event to `Queue` and return it.";
     let details = [
-      "This event can be used with `olSyncEvent` and `olWaitEvents` and will be complete once all enqueued work prior to the `olCreateEvent` call is complete.",
+      "This event can be used with `olSyncEvent`, `olWaitEvents`, and `olGetEventElapsedTime`.",
+      "It will be complete once all enqueued work prior to the `olCreateEvent` call is complete.",
     ];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "queue to create the event for", PARAM_IN>,
@@ -40,6 +41,20 @@ def olSyncEvent : Function {
     let returns = [];
 }
 
+def olGetEventElapsedTime : Function {
+    let desc = "Get the elapsed time in milliseconds between two events.";
+    let details = [
+        "The elapsed time is returned in milliseconds.",
+        "The queues associated with `StartEvent` and `EndEvent` must belong to the same device."
+    ];
+    let params = [
+        Param<"ol_event_handle_t", "StartEvent", "handle of the start event", PARAM_IN>,
+        Param<"ol_event_handle_t", "EndEvent", "handle of the end event", PARAM_IN>,
+        Param<"float*", "ElapsedTime", "output pointer for the elapsed time in milliseconds", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
 def ol_event_info_t : Enum {
   let desc = "Supported event info.";
   let is_typed = 1;
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index dd3ec0f61b4da..48748074c79cf 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -175,8 +175,8 @@ struct ol_event_impl_t {
                   ol_queue_handle_t Queue)
       : EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) {
   }
-  // EventInfo may be null, in which case the event should be considered always
-  // complete
+  // Opaque backend-specific event state. This is expected to be non-null for
+  // backends that materialize real events.
   void *EventInfo;
   ol_device_handle_t Device;
   size_t QueueId;
@@ -794,7 +794,8 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events,
       return Plugin::error(ErrorCode::INVALID_NULL_HANDLE,
                            "olWaitEvents asked to wait on a NULL event");
 
-    // Do nothing if the event is for this queue or the event is always complete
+    // Do nothing if the event is for this queue or the backend does not
+    // materialize event state for it.
     if (Event->QueueId == Queue->Id || !Event->EventInfo)
       continue;
 
@@ -839,7 +840,8 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName,
 }
 
 Error olSyncEvent_impl(ol_event_handle_t Event) {
-  // No event info means that this event was complete on creation
+  // Some backends do not materialize any event state (EventInfo is null).
+  // Treat such events as trivially complete.
   if (!Event->EventInfo)
     return Plugin::success();
 
@@ -849,6 +851,26 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
   return Error::success();
 }
 
+Error olGetEventElapsedTime_impl(ol_event_handle_t StartEvent,
+                                 ol_event_handle_t EndEvent,
+                                 float *ElapsedTime) {
+  if (!StartEvent || !EndEvent)
+    return createOffloadError(ErrorCode::INVALID_NULL_HANDLE,
+                              "olGetEventElapsedTime was given a NULL event");
+
+  if (!ElapsedTime)
+    return createOffloadError(ErrorCode::INVALID_NULL_POINTER,
+                              "ElapsedTime is null");
+
+  if (StartEvent->Device != EndEvent->Device)
+    return createOffloadError(
+        ErrorCode::INVALID_DEVICE,
+        "StartEvent and EndEvent must belong to the same device");
+
+  return StartEvent->Device->Device->getEventElapsedTime(
+      StartEvent->EventInfo, EndEvent->EventInfo, ElapsedTime);
+}
+
 Error olDestroyEvent_impl(ol_event_handle_t Event) {
   if (Event->EventInfo)
     if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo))
@@ -867,7 +889,8 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event,
   case OL_EVENT_INFO_QUEUE:
     return Info.write<ol_queue_handle_t>(Queue);
   case OL_EVENT_INFO_IS_COMPLETE: {
-    // No event info means that this event was complete on creation
+    // Some backends do not materialize any event state (EventInfo is null).
+    // Treat such events as trivially complete.
     if (!Event->EventInfo)
       return Info.write<bool>(true);
 
@@ -898,24 +921,23 @@ Error olGetEventInfoSize_impl(ol_event_handle_t Event, ol_event_info_t PropName,
 }
 
 Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) {
-  auto Pending = Queue->Device->Device->hasPendingWork(Queue->AsyncInfo);
-  if (auto Err = Pending.takeError())
-    return Err;
+  auto Event = std::make_unique<ol_event_impl_t>(nullptr, Queue->Device, Queue);
 
-  *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue);
-  if (!*Pending)
-    // Queue is empty, don't record an event and consider the event always
-    // complete
-    return Plugin::success();
+  if (auto Err = Queue->Device->Device->createEvent(&Event->EventInfo))
+    return Err;
 
-  if (auto Res = Queue->Device->Device->createEvent(&(*EventOut)->EventInfo))
-    return Res;
+  if (auto Err = Queue->Device->Device->recordEvent(Event->EventInfo,
+                                                    Queue->AsyncInfo)) {
+    if (Event->EventInfo)
+      if (auto DestroyErr =
+              Queue->Device->Device->destroyEvent(Event->EventInfo))
+        return joinErrors(std::move(Err), std::move(DestroyErr));
 
-  if (auto Res = Queue->Device->Device->recordEvent((*EventOut)->EventInfo,
-                                                    Queue->AsyncInfo))
-    return Res;
+    return Err;
+  }
 
-  return Plugin::success();
+  *EventOut = Event.release();
+  return Error::success();
 }
 
 Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 3f9064eaf00b3..70aff9f43c5b0 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2871,6 +2871,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Event->sync();
   }
 
+  /// Get the elapsed time in milliseconds between two events.
+  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                float *ElapsedTime) override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet",
+                         __func__);
+  }
+
   /// Print information about the device.
   Expected<InfoTreeNode> obtainInfoImpl() override {
     char TmpChar[1000];
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index b6a54f05b1dcc..04a4a78a24ba9 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -994,6 +994,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error syncEvent(void *EventPtr);
   virtual Error syncEventImpl(void *EventPtr) = 0;
 
+  /// Get the elapsed time in milliseconds between two events.
+  Error getEventElapsedTime(void *StartEventPtr, void *EndEventPtr,
+                            float *ElapsedTime);
+  virtual Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                        float *ElapsedTime) = 0;
+
   /// Obtain information about the device.
   Expected<InfoTreeNode> obtainInfo();
   virtual Expected<InfoTreeNode> obtainInfoImpl() = 0;
@@ -1543,6 +1549,10 @@ struct GenericPluginTy {
   /// Synchronize execution until an event is done.
   int32_t sync_event(int32_t DeviceId, void *EventPtr);
 
+  /// Get the elapsed time in milliseconds between two events.
+  int32_t get_event_elapsed_time(int32_t DeviceId, void *StartEventPtr,
+                                 void *EndEventPtr, float *ElapsedTime);
+
   /// Remove the event from the plugin.
   int32_t destroy_event(int32_t DeviceId, void *EventPtr);
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 4093d08044bc3..6cc462e6162dd 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1568,6 +1568,12 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
   return syncEventImpl(EventPtr);
 }
 
+Error GenericDeviceTy::getEventElapsedTime(void *StartEventPtr,
+                                           void *EndEventPtr,
+                                           float *ElapsedTime) {
+  return getEventElapsedTimeImpl(StartEventPtr, EndEventPtr, ElapsedTime);
+}
+
 bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
 
 Expected<bool> GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
@@ -2087,6 +2093,21 @@ int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
   return OFFLOAD_SUCCESS;
 }
 
+int32_t GenericPluginTy::get_event_elapsed_time(int32_t DeviceId,
+                                                void *StartEventPtr,
+                                                void *EndEventPtr,
+                                                float *ElapsedTime) {
+  auto Err = getDevice(DeviceId).getEventElapsedTime(StartEventPtr, EndEventPtr,
+                                                     ElapsedTime);
+  if (Err) {
+    REPORT() << "Failure to get elapsed time between events " << StartEventPtr
+             << " and " << EndEventPtr << ": " << toString(std::move(Err));
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
 int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
   auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
   if (Err) {
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 80e3e418ae3fa..8fc8d0e43fab2 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -97,6 +97,7 @@ DLWRAP(cuEventRecord, 2)
 DLWRAP(cuEventQuery, 1)
 DLWRAP(cuStreamWaitEvent, 3)
 DLWRAP(cuEventSynchronize, 1)
+DLWRAP(cuEventElapsedTime, 3)
 DLWRAP(cuEventDestroy, 1)
 
 DLWRAP_FINALIZE()
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index fa4f4634ecec3..dd47fb98dc03a 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -371,6 +371,7 @@ CUresult cuEventRecord(CUevent, CUstream);
 CUresult cuEventQuery(CUevent);
 CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
 CUresult cuEventSynchronize(CUevent);
+CUresult cuEventElapsedTime(float *, CUevent, CUevent);
 CUresult cuEventDestroy(CUevent);
 
 CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 4de754265ea77..6d61da4fdc17b 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1086,6 +1086,16 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "error in cuEventSynchronize: %s");
   }
 
+  /// Get the elapsed time in milliseconds between two events.
+  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                float *ElapsedTime) override {
+    CUevent StartEvent = reinterpret_cast<CUevent>(StartEventPtr);
+    CUevent EndEvent = reinterpret_cast<CUevent>(EndEventPtr);
+
+    CUresult Res = cuEventElapsedTime(ElapsedTime, StartEvent, EndEvent);
+    return Plugin::check(Res, "error in cuEventElapsedTime: %s");
+  }
+
   /// Print information about the device.
   Expected<InfoTreeNode> obtainInfoImpl() override {
     char TmpChar[1000];
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 077dd14b959e0..b90478a520b39 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -360,6 +360,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return true;
   }
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
+  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                float *ElapsedTime) override {
+    if (ElapsedTime)
+      *ElapsedTime = 0.0f;
+    return Plugin::success();
+  }
 
   /// Print information about the device.
   Expected<InfoTreeNode> obtainInfoImpl() override {
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 001a41ba77d7b..19a88ca0b70ad 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -626,6 +626,12 @@ class L0DeviceTy final : public GenericDeviceTy {
                          __func__);
   }
 
+  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                float *ElapsedTime) override {
+    return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n",
+                         __func__);
+  }
+
   Expected<InfoTreeNode> obtainInfoImpl() override;
   uint64_t getClockFrequency() const override { return getClockRate(); }
   uint64_t getHardwareParallelism() const override { return getTotalThreads(); }
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 031dbea660fbc..39863391f27d6 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -13,6 +13,7 @@ add_offload_unittest("event"
     event/olCreateEvent.cpp
     event/olDestroyEvent.cpp
     event/olSyncEvent.cpp
+    event/olGetEventElapsedTime.cpp
     event/olGetEventInfo.cpp
     event/olGetEventInfoSize.cpp)
 
diff --git a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
new file mode 100644
index 0000000000000..13da46a798140
--- /dev/null
+++ b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
@@ -0,0 +1,146 @@
+//===------- Offload API tests - olGetEventElapsedTime --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include "llvm/Support/MemoryBuffer.h"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+struct olGetEventElapsedTimeTest : OffloadQueueTest {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
+
+    ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin));
+    ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
+                                   DeviceBin->getBufferSize(), &Program));
+    ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &Kernel));
+
+    LaunchArgs.Dimensions = 1;
+    LaunchArgs.GroupSize = {64, 1, 1};
+    LaunchArgs.NumGroups = {1, 1, 1};
+    LaunchArgs.DynSharedMemory = 0;
+
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                              LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+  }
+
+  void TearDown() override {
+    if (Mem)
+      ASSERT_SUCCESS(olMemFree(Mem));
+    if (Program)
+      ASSERT_SUCCESS(olDestroyProgram(Program));
+    RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
+  }
+
+  void launchFoo() {
+    struct {
+      void *Mem;
+    } Args{Mem};
+
+    ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                  &LaunchArgs));
+  }
+
+  std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+  ol_program_handle_t Program = nullptr;
+  ol_symbol_handle_t Kernel = nullptr;
+  ol_kernel_launch_size_args_t LaunchArgs{};
+  void *Mem = nullptr;
+};
+
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetEventElapsedTimeTest);
+
+TEST_P(olGetEventElapsedTimeTest, Success) {
+  ol_event_handle_t StartEvent = nullptr;
+  ol_event_handle_t EndEvent = nullptr;
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+  ASSERT_NE(StartEvent, nullptr);
+
+  launchFoo();
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+  ASSERT_NE(EndEvent, nullptr);
+
+  ASSERT_SUCCESS(olSyncEvent(EndEvent));
+
+  float ElapsedTime = -1.0f;
+
+  ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTime));
+  ASSERT_GE(ElapsedTime, 0.0f);
+
+  ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+  ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, SuccessMultipleCalls) {
+  ol_event_handle_t StartEvent = nullptr;
+  ol_event_handle_t EndEvent = nullptr;
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+  ASSERT_NE(StartEvent, nullptr);
+
+  launchFoo();
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+  ASSERT_NE(EndEvent, nullptr);
+
+  ASSERT_SUCCESS(olSyncEvent(EndEvent));
+
+  float ElapsedTimeA = -1.0f;
+  float ElapsedTimeB = -1.0f;
+
+  ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTimeA));
+  ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTimeB));
+
+  ASSERT_GE(ElapsedTimeA, 0.0f);
+  ASSERT_GE(ElapsedTimeB, 0.0f);
+
+  ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+  ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullStartEvent) {
+  ol_event_handle_t EndEvent = nullptr;
+  ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+
+  float ElapsedTime = 0.0f;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olGetEventElapsedTime(nullptr, EndEvent, &ElapsedTime));
+
+  ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullEndEvent) {
+  ol_event_handle_t StartEvent = nullptr;
+  ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+
+  float ElapsedTime = 0.0f;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olGetEventElapsedTime(StartEvent, nullptr, &ElapsedTime));
+
+  ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullElapsedTime) {
+  ol_event_handle_t StartEvent = nullptr;
+  ol_event_handle_t EndEvent = nullptr;
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+  ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olGetEventElapsedTime(StartEvent, EndEvent, nullptr));
+
+  ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+  ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+} // namespace
\ No newline at end of file

>From e1743262c9667c0306526baae0c1ae524b27c47c Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:36:12 -0300
Subject: [PATCH 02/16] Add event elapsed-time support for AMDGPU plugin

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 .../amdgpu/dynamic_hsa/hsa.cpp                |   2 +
 .../plugins-nextgen/amdgpu/dynamic_hsa/hsa.h  |   2 +
 .../amdgpu/dynamic_hsa/hsa_ext_amd.h          |  12 +
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 258 ++++++++++++++++--
 .../event/olGetEventElapsedTime.cpp           |   2 +-
 5 files changed, 249 insertions(+), 27 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index 37d12861eb387..279a296dd1618 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -70,6 +70,8 @@ DLWRAP(hsa_amd_register_system_event_handler, 2)
 DLWRAP(hsa_amd_signal_create, 5)
 DLWRAP(hsa_amd_signal_async_handler, 5)
 DLWRAP(hsa_amd_pointer_info, 5)
+DLWRAP(hsa_amd_profiling_get_dispatch_time, 3)
+DLWRAP(hsa_amd_profiling_set_profiler_enabled, 2)
 DLWRAP(hsa_code_object_reader_create_from_memory, 3)
 DLWRAP(hsa_code_object_reader_destroy, 1)
 DLWRAP(hsa_executable_load_agent_code_object, 5)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index ad135f72fff12..f6e3337ddb3f4 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -99,6 +99,8 @@ typedef enum {
 typedef enum {
   HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
   HSA_SYSTEM_INFO_VERSION_MINOR = 1,
+  HSA_SYSTEM_INFO_TIMESTAMP = 2,
+  HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3,
 } hsa_system_info_t;
 
 typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index ddfa65c76cf2d..7ff77f8e2a2fa 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -169,6 +169,18 @@ hsa_status_t hsa_amd_pointer_info(const void* ptr,
                                           uint32_t* num_agents_accessible,
                                           hsa_agent_t** accessible);
 
+typedef struct hsa_amd_profiling_dispatch_time_s {
+  uint64_t start;
+  uint64_t end;
+} hsa_amd_profiling_dispatch_time_t;
+
+hsa_status_t
+hsa_amd_profiling_get_dispatch_time(hsa_agent_t agent, hsa_signal_t signal,
+                                    hsa_amd_profiling_dispatch_time_t *time);
+
+hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t *queue,
+                                                    int enable);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 70aff9f43c5b0..58dabbc374486 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -714,16 +714,32 @@ using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
 /// Class holding an HSA queue to submit kernel and barrier packets.
 struct AMDGPUQueueTy {
   /// Create an empty queue.
-  AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
+  AMDGPUQueueTy()
+      : Queue(nullptr), ProfilingEnabled(false), Mutex(), NumUsers(0) {}
 
   /// Lazily initialize a new queue belonging to a specific agent.
   Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
     if (Queue)
       return Plugin::success();
+
     hsa_status_t Status =
         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
                          &Device, UINT32_MAX, UINT32_MAX, &Queue);
-    return Plugin::check(Status, "error in hsa_queue_create: %s");
+    if (auto Err = Plugin::check(Status, "error in hsa_queue_create: %s"))
+      return Err;
+
+    // Keep queue profiling enabled from creation time onward, as HIP/ROCclr
+    // does. Elapsed-time queries rely on queue-level hardware profiling support
+    // to retrieve packet timing.
+    //
+    // If enabling profiling fails, leave the queue usable and preserve existing
+    // event behavior; only elapsed-time queries will later report that timing
+    // is unavailable.
+    Status = hsa_amd_profiling_set_profiler_enabled(Queue, 1);
+    if (Status == HSA_STATUS_SUCCESS)
+      ProfilingEnabled = true;
+
+    return Plugin::success();
   }
 
   /// Deinitialize the queue and destroy its resources.
@@ -731,10 +747,16 @@ struct AMDGPUQueueTy {
     std::lock_guard<std::mutex> Lock(Mutex);
     if (!Queue)
       return Plugin::success();
+
+    ProfilingEnabled = false;
+
     hsa_status_t Status = hsa_queue_destroy(Queue);
     return Plugin::check(Status, "error in hsa_queue_destroy: %s");
   }
 
+  /// Returns whether profiling is enabled on the underlying HSA queue.
+  bool isProfilingEnabled() const { return ProfilingEnabled; }
+
   /// Returns the number of streams, this queue is currently assigned to.
   bool getUserCount() const { return NumUsers; }
 
@@ -917,6 +939,9 @@ struct AMDGPUQueueTy {
   /// The HSA queue.
   hsa_queue_t *Queue;
 
+  /// Indicates whether profiling is enabled on the underlying HSA queue.
+  bool ProfilingEnabled;
+
   /// Mutex to protect the acquiring and publishing of packets. For the moment,
   /// we need this mutex to prevent publishing packets that are not ready to be
   /// published in a multi-thread scenario. Without a queue lock, a thread T1
@@ -1144,6 +1169,18 @@ struct AMDGPUStreamTy {
     return {Curr, InputSignal};
   }
 
+  /// Roll back the last consumed slot after a submission failure so the stream
+  /// does not retain a slot for an operation that was never enqueued.
+  void rollbackConsumedSlot(uint32_t Slot) {
+    assert(NextSlot > 0 && "cannot roll back an empty stream");
+    assert(Slot + 1 == NextSlot && "can only roll back the last consumed slot");
+
+    Slots[Slot].Signal = nullptr;
+    Slots[Slot].Callbacks.clear();
+    Slots[Slot].ActionArgs.clear();
+    --NextSlot;
+  }
+
   /// Complete all pending post actions and reset the stream after synchronizing
   /// or positively querying the stream.
   Error complete() {
@@ -1643,8 +1680,8 @@ struct AMDGPUStreamTy {
 
   const AMDGPUQueueTy *getQueue() const { return Queue; }
 
-  /// Record the state of the stream on an event.
-  Error recordEvent(AMDGPUEventTy &Event) const;
+  /// Record an event by enqueuing a barrier marker packet on the stream.
+  Error recordEvent(AMDGPUEventTy &Event);
 
   /// Make the stream wait on an event.
   Error waitEvent(const AMDGPUEventTy &Event);
@@ -1652,25 +1689,47 @@ struct AMDGPUStreamTy {
   friend struct AMDGPUStreamManagerTy;
 };
 
-/// Class representing an event on AMDGPU. The event basically stores some
-/// information regarding the state of the recorded stream.
+/// Class representing an event on AMDGPU. The event stores the recorded stream
+/// point and retained timing state.
 struct AMDGPUEventTy {
   /// Create an empty event.
   AMDGPUEventTy(AMDGPUDeviceTy &Device)
-      : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
+      : Device(Device), RecordedStream(nullptr), RecordedSlot(-1),
+        RecordedSyncCycle(-1), TimingSignal(nullptr), TimingAgent({0}) {}
 
   /// Initialize and deinitialize.
-  Error init() { return Plugin::success(); }
-  Error deinit() { return Plugin::success(); }
+  Error init() { return resetState(); }
+  Error deinit() { return resetState(); }
+
+  /// Clear the current recording and retained timing state.
+  Error resetState() {
+    if (auto Err = releaseTimingSignal())
+      return Err;
+
+    RecordedStream = nullptr;
+    RecordedSlot = -1;
+    RecordedSyncCycle = -1;
+    TimingAgent = {0};
+
+    return Plugin::success();
+  }
 
   /// Record the state of a stream on the event.
   Error record(AMDGPUStreamTy &Stream) {
     std::lock_guard<std::mutex> Lock(Mutex);
 
     // Ignore the last recorded stream.
+    if (auto Err = resetState())
+      return Err;
+
     RecordedStream = &Stream;
 
-    return Stream.recordEvent(*this);
+    if (auto Err = Stream.recordEvent(*this)) {
+      llvm::consumeError(resetState());
+      return Err;
+    }
+
+    return Plugin::success();
   }
 
   /// Make a stream wait on the current event.
@@ -1708,38 +1767,80 @@ struct AMDGPUEventTy {
     return RecordedStream->synchronizeOn(*this);
   }
 
+  /// Return the elapsed time in milliseconds between this event and EndEvent.
+  Error elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime);
+
 protected:
+  /// Release the retained timing signal, if any, back to the signal manager.
+  Error releaseTimingSignal();
+
+  /// The device that owns this event.
+  AMDGPUDeviceTy &Device;
+
   /// The stream registered in this event.
   AMDGPUStreamTy *RecordedStream;
 
-  /// The recordered operation on the recorded stream.
+  /// The recorded operation on the recorded stream.
   int64_t RecordedSlot;
 
   /// The sync cycle when the stream was recorded. Used to detect stale events.
   int64_t RecordedSyncCycle;
 
+  /// The signal of the recorded barrier marker used for timing. Null means
+  /// timing is unavailable for the current recording.
+  AMDGPUSignalTy *TimingSignal;
+
+  /// The agent that owns the queue where the timing marker was recorded. A zero
+  /// handle means timing is unavailable for the current recording.
+  hsa_agent_t TimingAgent;
+
   /// Mutex to safely access event fields.
   mutable std::mutex Mutex;
 
   friend struct AMDGPUStreamTy;
 };
 
-Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
-  std::lock_guard<std::mutex> Lock(Mutex);
+Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
+  if (Queue == nullptr)
+    return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
+                         "target queue was nullptr");
+
+  // Retrieve an available signal for the operation's output.
+  AMDGPUSignalTy *OutputSignal = nullptr;
+  if (auto Err = SignalManager.getResource(OutputSignal))
+    return Err;
+  OutputSignal->reset();
+  OutputSignal->increaseUseCount();
 
-  if (size() > 0) {
-    // Record the synchronize identifier (to detect stale recordings) and
-    // the last valid stream's operation.
-    Event.RecordedSyncCycle = SyncCycle;
-    Event.RecordedSlot = last();
+  std::lock_guard<std::mutex> StreamLock(Mutex);
 
-    assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
-    assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
+  // Consume stream slot and compute dependencies.
+  auto [Curr, InputSignal] = consume(OutputSignal);
+
+  // Materialize the event as a real marker on the queue. Elapsed-time queries
+  // need a packet-backed completion signal to retrieve dispatch timing.
+  if (auto Err = Queue->pushBarrier(OutputSignal, InputSignal, nullptr)) {
+    rollbackConsumedSlot(Curr);
+    if (OutputSignal->decreaseUseCount())
+      llvm::consumeError(SignalManager.returnResource(OutputSignal));
+    return Err;
+  }
+
+  Event.RecordedSyncCycle = SyncCycle;
+  Event.RecordedSlot = Curr;
+
+  if (Queue->isProfilingEnabled()) {
+    OutputSignal->increaseUseCount();
+    Event.TimingSignal = OutputSignal;
+    Event.TimingAgent = Agent;
   } else {
-    // The stream is empty, everything already completed, record nothing.
-    Event.RecordedSyncCycle = -1;
-    Event.RecordedSlot = -1;
+    Event.TimingSignal = nullptr;
+    Event.TimingAgent = {0};
   }
+
+  assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
+  assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
+
   return Plugin::success();
 }
 
@@ -2124,6 +2225,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                          ClockFrequency) != HSA_STATUS_SUCCESS)
       ClockFrequency = 0;
 
+    // Retrieve the HSA system timestamp frequency for this runtime. A zero
+    // value means the frequency is unavailable.
+    if (hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
+                            &SystemTimestampFrequency) != HSA_STATUS_SUCCESS)
+      SystemTimestampFrequency = 0;
+
     // Load the grid values depending on the wavefront.
     if (WavefrontSize == 32)
       GridValues = getAMDGPUGridValues<32>();
@@ -2333,6 +2440,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Returns the clock frequency for the given AMDGPU device.
   uint64_t getClockFrequency() const override { return ClockFrequency; }
 
+  /// Returns the HSA system timestamp frequency in Hz. Zero means unavailable.
+  uint64_t getSystemTimestampFrequency() const {
+    return SystemTimestampFrequency;
+  }
+
   /// Allocate and construct an AMDGPU kernel.
   Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the AMDGPU kernel.
@@ -2813,12 +2925,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Create an event.
   Error createEventImpl(void **EventPtrStorage) override {
     AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
-    return AMDGPUEventManager.getResource(*Event);
+    if (auto Err = AMDGPUEventManager.getResource(*Event))
+      return Err;
+    return (*Event)->resetState();
   }
 
   /// Destroy a previously created event.
   Error destroyEventImpl(void *EventPtr) override {
     AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
+    assert(Event && "Invalid event");
+
+    if (auto Err = Event->resetState())
+      return Err;
+
     return AMDGPUEventManager.returnResource(Event);
   }
 
@@ -2874,8 +2993,18 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Get the elapsed time in milliseconds between two events.
   Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
                                 float *ElapsedTime) override {
-    return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet",
-                         __func__);
+    if (!ElapsedTime)
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                           "elapsed time output pointer is null");
+
+    AMDGPUEventTy *StartEvent =
+        reinterpret_cast<AMDGPUEventTy *>(StartEventPtr);
+    AMDGPUEventTy *EndEvent = reinterpret_cast<AMDGPUEventTy *>(EndEventPtr);
+
+    if (!StartEvent || !EndEvent)
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid event handle");
+
+    return StartEvent->elapsedTime(*EndEvent, *ElapsedTime);
   }
 
   /// Print information about the device.
@@ -3354,6 +3483,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// The frequency of the steady clock inside the device.
   uint64_t ClockFrequency;
 
+  /// The HSA system timestamp frequency in Hz, as reported by the runtime.
+  /// Zero means unavailable.
+  uint64_t SystemTimestampFrequency = 0;
+
   /// The total number of concurrent work items that can be running on the GPU.
   uint64_t HardwareParallelism;
 
@@ -3460,6 +3593,79 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
       UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
 
+Error AMDGPUEventTy::releaseTimingSignal() {
+  if (!TimingSignal)
+    return Plugin::success();
+
+  AMDGPUSignalTy *Signal = TimingSignal;
+  TimingSignal = nullptr;
+
+  if (Signal->decreaseUseCount())
+    return Device.getSignalManager().returnResource(Signal);
+
+  return Plugin::success();
+}
+
+Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
+  if (this == &EndEvent) {
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    if (!TimingSignal)
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                           "event timing is not available");
+
+    if (TimingSignal->load())
+      return Plugin::error(ErrorCode::UNKNOWN, "event timing is not ready");
+
+    ElapsedTime = 0.0f;
+    return Plugin::success();
+  }
+
+  std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, EndEvent.Mutex);
+
+  if (&Device != &EndEvent.Device)
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "events belong to different devices");
+
+  if (!TimingSignal || !EndEvent.TimingSignal)
+    return Plugin::error(
+        ErrorCode::INVALID_ARGUMENT,
+        "timing information is not available for one or both events");
+
+  if (TimingSignal->load() || EndEvent.TimingSignal->load())
+    return Plugin::error(
+        ErrorCode::UNKNOWN,
+        "timing information is not ready for one or both events");
+
+  const uint64_t Frequency = Device.getSystemTimestampFrequency();
+  if (Frequency == 0)
+    return Plugin::error(ErrorCode::UNSUPPORTED,
+                         "HSA system timestamp frequency is unavailable");
+
+  hsa_amd_profiling_dispatch_time_t StartTime = {};
+  hsa_amd_profiling_dispatch_time_t StopTime = {};
+
+  hsa_status_t Status = hsa_amd_profiling_get_dispatch_time(
+      TimingAgent, TimingSignal->get(), &StartTime);
+  if (auto Err = Plugin::check(
+          Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
+    return Err;
+
+  Status = hsa_amd_profiling_get_dispatch_time(
+      EndEvent.TimingAgent, EndEvent.TimingSignal->get(), &StopTime);
+  if (auto Err = Plugin::check(
+          Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
+    return Err;
+
+  const int64_t DeltaTicks =
+      static_cast<int64_t>(StopTime.end) - static_cast<int64_t>(StartTime.end);
+
+  ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) * 1000.0 /
+                                   static_cast<double>(Frequency));
+
+  return Plugin::success();
+}
+
 /// Class implementing the AMDGPU-specific functionalities of the global
 /// handler.
 struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
diff --git a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
index 13da46a798140..aca2dccff72fe 100644
--- a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
+++ b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
@@ -143,4 +143,4 @@ TEST_P(olGetEventElapsedTimeTest, InvalidNullElapsedTime) {
   ASSERT_SUCCESS(olDestroyEvent(EndEvent));
 }
 
-} // namespace
\ No newline at end of file
+} // namespace

>From beb718615a06a87b515e4bcd90951b2fe9f5443b Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:53:56 -0300
Subject: [PATCH 03/16] Apply clang-format fixes

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/liboffload/src/OffloadImpl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 48748074c79cf..3b794092a1d23 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -889,8 +889,8 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event,
   case OL_EVENT_INFO_QUEUE:
     return Info.write<ol_queue_handle_t>(Queue);
   case OL_EVENT_INFO_IS_COMPLETE: {
-    // Some backends do not materialize backend event state. Treat such events as
-    // trivially complete.
+    // Some backends do not materialize backend event state. Treat such events
+    // as trivially complete.
     if (!Event->EventInfo)
       return Info.write<bool>(true);
 

>From 2702292e1ed1b4e0135448f98c9f18ae4b4e9dfc Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 18:57:20 -0300
Subject: [PATCH 04/16] Remove redundant validation from
 olGetEventElapsedTime_impl

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/liboffload/src/OffloadImpl.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 3b794092a1d23..0e280586f746b 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -854,14 +854,6 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
 Error olGetEventElapsedTime_impl(ol_event_handle_t StartEvent,
                                  ol_event_handle_t EndEvent,
                                  float *ElapsedTime) {
-  if (!StartEvent || !EndEvent)
-    return createOffloadError(ErrorCode::INVALID_NULL_HANDLE,
-                              "olGetEventElapsedTime was given a NULL event");
-
-  if (!ElapsedTime)
-    return createOffloadError(ErrorCode::INVALID_NULL_POINTER,
-                              "ElapsedTime is null");
-
   if (StartEvent->Device != EndEvent->Device)
     return createOffloadError(
         ErrorCode::INVALID_DEVICE,

>From 6ac00d38d75cae0097ad749dfc97ef937d40f3f4 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Tue, 17 Mar 2026 16:36:46 -0300
Subject: [PATCH 05/16] Use clearer names in elapsed-time conversion

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 58dabbc374486..6bddfe65f38ee 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3637,8 +3637,8 @@ Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
         ErrorCode::UNKNOWN,
         "timing information is not ready for one or both events");
 
-  const uint64_t Frequency = Device.getSystemTimestampFrequency();
-  if (Frequency == 0)
+  const uint64_t TicksPerSecond = Device.getSystemTimestampFrequency();
+  if (TicksPerSecond == 0)
     return Plugin::error(ErrorCode::UNSUPPORTED,
                          "HSA system timestamp frequency is unavailable");
 
@@ -3659,9 +3659,11 @@ Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
 
   const int64_t DeltaTicks =
       static_cast<int64_t>(StopTime.end) - static_cast<int64_t>(StartTime.end);
+  constexpr double MillisecondsPerSecond = 1000.0;
 
-  ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) * 1000.0 /
-                                   static_cast<double>(Frequency));
+  ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) *
+                                   MillisecondsPerSecond /
+                                   static_cast<double>(TicksPerSecond));
 
   return Plugin::success();
 }

>From 218a76626371c22883ad28d860925cfcbcb5b083 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 19 Mar 2026 13:34:30 -0300
Subject: [PATCH 06/16] Clarify event timing comments

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 6bddfe65f38ee..5072185fa072f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1718,7 +1718,7 @@ struct AMDGPUEventTy {
   Error record(AMDGPUStreamTy &Stream) {
     std::lock_guard<std::mutex> Lock(Mutex);
 
-    // Ignore the last recorded stream.
+    // Discard the previous recording and retained timing state.
     if (auto Err = resetState())
       return Err;
 
@@ -1786,12 +1786,12 @@ struct AMDGPUEventTy {
   /// The sync cycle when the stream was recorded. Used to detect stale events.
   int64_t RecordedSyncCycle;
 
-  /// The signal of the recorded barrier marker used for timing. Null means
-  /// timing is unavailable for the current recording.
+  /// The signal of the recorded timing barrier. Null means timing is
+  /// unavailable for the current recording.
   AMDGPUSignalTy *TimingSignal;
 
-  /// The agent that owns the queue where the timing marker was recorded. A zero
-  /// handle means timing is unavailable for the current recording.
+  /// The agent that owns the queue where the timing barrier was recorded. A
+  /// zero handle means timing is unavailable for the current recording.
   hsa_agent_t TimingAgent;
 
   /// Mutex to safely access event fields.

>From 10ff6db6174e57ec000ae3cc9265e9a4a0cc111a Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 19 Mar 2026 13:47:55 -0300
Subject: [PATCH 07/16] Clarify event recording comments

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 5072185fa072f..c9aa3c66a45f8 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1703,18 +1703,18 @@ struct AMDGPUEventTy {
 
   /// Clear the current recording and retained timing state.
   Error resetState() {
-    if (auto Err = releaseTimingSignal())
-      return Err;
-
     RecordedStream = nullptr;
     RecordedSlot = -1;
     RecordedSyncCycle = -1;
     TimingAgent = {0};
 
+    if (auto Err = releaseTimingSignal())
+      return Err;
+
     return Plugin::success();
   }
 
-  /// Record the state of a stream on the event.
+  /// Record the current stream point on the event.
   Error record(AMDGPUStreamTy &Stream) {
     std::lock_guard<std::mutex> Lock(Mutex);
 
@@ -1826,8 +1826,8 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
     return Err;
   }
 
-  Event.RecordedSyncCycle = SyncCycle;
   Event.RecordedSlot = Curr;
+  Event.RecordedSyncCycle = SyncCycle;
 
   if (Queue->isProfilingEnabled()) {
     OutputSignal->increaseUseCount();

>From 67431402a00fbe6ab88c79453d93ab746f07fcf9 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 26 Mar 2026 17:56:41 -0300
Subject: [PATCH 08/16] Move elapsed-time output check to common plugin layer

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp             | 4 ----
 offload/plugins-nextgen/common/src/PluginInterface.cpp | 3 +++
 offload/plugins-nextgen/host/src/rtl.cpp               | 3 +--
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index c9aa3c66a45f8..4d14372db3be6 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2993,10 +2993,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Get the elapsed time in milliseconds between two events.
   Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
                                 float *ElapsedTime) override {
-    if (!ElapsedTime)
-      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                           "elapsed time output pointer is null");
-
     AMDGPUEventTy *StartEvent =
         reinterpret_cast<AMDGPUEventTy *>(StartEventPtr);
     AMDGPUEventTy *EndEvent = reinterpret_cast<AMDGPUEventTy *>(EndEventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 6cc462e6162dd..0a261d0810485 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1571,6 +1571,9 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
 Error GenericDeviceTy::getEventElapsedTime(void *StartEventPtr,
                                            void *EndEventPtr,
                                            float *ElapsedTime) {
+  if (!ElapsedTime)
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "elapsed time output pointer is null");
   return getEventElapsedTimeImpl(StartEventPtr, EndEventPtr, ElapsedTime);
 }
 
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index b90478a520b39..0f89f38d0ad73 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -362,8 +362,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
   Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
                                 float *ElapsedTime) override {
-    if (ElapsedTime)
-      *ElapsedTime = 0.0f;
+    *ElapsedTime = 0.0f;
     return Plugin::success();
   }
 

>From f4acab727f79dc21f67cde363524d9c000981482 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 26 Mar 2026 22:02:12 -0300
Subject: [PATCH 09/16] Preserve cleanup errors in event recording paths

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 4d14372db3be6..058d065bc8180 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1725,7 +1725,8 @@ struct AMDGPUEventTy {
     RecordedStream = &Stream;
 
     if (auto Err = Stream.recordEvent(*this)) {
-      llvm::consumeError(resetState());
+      if (auto ResetErr = resetState())
+        return joinErrors(std::move(Err), std::move(ResetErr));
       return Err;
     }
 
@@ -1821,8 +1822,11 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
   // need a packet-backed completion signal to retrieve dispatch timing.
   if (auto Err = Queue->pushBarrier(OutputSignal, InputSignal, nullptr)) {
     rollbackConsumedSlot(Curr);
+
     if (OutputSignal->decreaseUseCount())
-      llvm::consumeError(SignalManager.returnResource(OutputSignal));
+      if (auto ReturnErr = SignalManager.returnResource(OutputSignal))
+        return joinErrors(std::move(Err), std::move(ReturnErr));
+
     return Err;
   }
 

>From 17c2dae954d3692dc8831f9f5af21bff00e09920 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 26 Mar 2026 23:06:11 -0300
Subject: [PATCH 10/16] Use Expected<float> for event elapsed time

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/liboffload/src/OffloadImpl.cpp        |  9 +++++--
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 25 ++++++++-----------
 .../common/include/PluginInterface.h          |  7 +++---
 .../common/src/PluginInterface.cpp            | 20 +++++++--------
 offload/plugins-nextgen/cuda/src/rtl.cpp      | 12 ++++++---
 offload/plugins-nextgen/host/src/rtl.cpp      |  7 +++---
 .../level_zero/include/L0Device.h             |  4 +--
 7 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 0e280586f746b..3bc64032e2604 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -859,8 +859,13 @@ Error olGetEventElapsedTime_impl(ol_event_handle_t StartEvent,
         ErrorCode::INVALID_DEVICE,
         "StartEvent and EndEvent must belong to the same device");
 
-  return StartEvent->Device->Device->getEventElapsedTime(
-      StartEvent->EventInfo, EndEvent->EventInfo, ElapsedTime);
+  auto ElapsedTimeOrErr = StartEvent->Device->Device->getEventElapsedTime(
+      StartEvent->EventInfo, EndEvent->EventInfo);
+  if (!ElapsedTimeOrErr)
+    return ElapsedTimeOrErr.takeError();
+
+  *ElapsedTime = *ElapsedTimeOrErr;
+  return Error::success();
 }
 
 Error olDestroyEvent_impl(ol_event_handle_t Event) {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 058d065bc8180..37063ab819b9d 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1769,7 +1769,7 @@ struct AMDGPUEventTy {
   }
 
   /// Return the elapsed time in milliseconds between this event and EndEvent.
-  Error elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime);
+  Expected<float> getElapsedTime(AMDGPUEventTy &EndEvent);
 
 protected:
   /// Release the retained timing signal, if any, back to the signal manager.
@@ -2995,8 +2995,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// Get the elapsed time in milliseconds between two events.
-  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
-                                float *ElapsedTime) override {
+  Expected<float> getEventElapsedTimeImpl(void *StartEventPtr,
+                                          void *EndEventPtr) override {
     AMDGPUEventTy *StartEvent =
         reinterpret_cast<AMDGPUEventTy *>(StartEventPtr);
     AMDGPUEventTy *EndEvent = reinterpret_cast<AMDGPUEventTy *>(EndEventPtr);
@@ -3004,7 +3004,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (!StartEvent || !EndEvent)
       return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid event handle");
 
-    return StartEvent->elapsedTime(*EndEvent, *ElapsedTime);
+    return StartEvent->getElapsedTime(*EndEvent);
   }
 
   /// Print information about the device.
@@ -3606,7 +3606,7 @@ Error AMDGPUEventTy::releaseTimingSignal() {
   return Plugin::success();
 }
 
-Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
+Expected<float> AMDGPUEventTy::getElapsedTime(AMDGPUEventTy &EndEvent) {
   if (this == &EndEvent) {
     std::lock_guard<std::mutex> Lock(Mutex);
 
@@ -3617,8 +3617,7 @@ Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
     if (TimingSignal->load())
       return Plugin::error(ErrorCode::UNKNOWN, "event timing is not ready");
 
-    ElapsedTime = 0.0f;
-    return Plugin::success();
+    return 0.0f;
   }
 
   std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, EndEvent.Mutex);
@@ -3649,23 +3648,21 @@ Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
       TimingAgent, TimingSignal->get(), &StartTime);
   if (auto Err = Plugin::check(
           Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
-    return Err;
+    return std::move(Err);
 
   Status = hsa_amd_profiling_get_dispatch_time(
       EndEvent.TimingAgent, EndEvent.TimingSignal->get(), &StopTime);
   if (auto Err = Plugin::check(
           Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
-    return Err;
+    return std::move(Err);
 
   const int64_t DeltaTicks =
       static_cast<int64_t>(StopTime.end) - static_cast<int64_t>(StartTime.end);
   constexpr double MillisecondsPerSecond = 1000.0;
 
-  ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) *
-                                   MillisecondsPerSecond /
-                                   static_cast<double>(TicksPerSecond));
-
-  return Plugin::success();
+  return static_cast<float>(static_cast<double>(DeltaTicks) *
+                            MillisecondsPerSecond /
+                            static_cast<double>(TicksPerSecond));
 }
 
 /// Class implementing the AMDGPU-specific functionalities of the global
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 04a4a78a24ba9..9fe4af7b41b26 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -995,10 +995,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error syncEventImpl(void *EventPtr) = 0;
 
   /// Get the elapsed time in milliseconds between two events.
-  Error getEventElapsedTime(void *StartEventPtr, void *EndEventPtr,
-                            float *ElapsedTime);
-  virtual Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
-                                        float *ElapsedTime) = 0;
+  Expected<float> getEventElapsedTime(void *StartEventPtr, void *EndEventPtr);
+  virtual Expected<float> getEventElapsedTimeImpl(void *StartEventPtr,
+                                                  void *EndEventPtr) = 0;
 
   /// Obtain information about the device.
   Expected<InfoTreeNode> obtainInfo();
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 0a261d0810485..a286047da4032 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1568,13 +1568,9 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
   return syncEventImpl(EventPtr);
 }
 
-Error GenericDeviceTy::getEventElapsedTime(void *StartEventPtr,
-                                           void *EndEventPtr,
-                                           float *ElapsedTime) {
-  if (!ElapsedTime)
-    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
-                         "elapsed time output pointer is null");
-  return getEventElapsedTimeImpl(StartEventPtr, EndEventPtr, ElapsedTime);
+Expected<float> GenericDeviceTy::getEventElapsedTime(void *StartEventPtr,
+                                                     void *EndEventPtr) {
+  return getEventElapsedTimeImpl(StartEventPtr, EndEventPtr);
 }
 
 bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
@@ -2100,14 +2096,16 @@ int32_t GenericPluginTy::get_event_elapsed_time(int32_t DeviceId,
                                                 void *StartEventPtr,
                                                 void *EndEventPtr,
                                                 float *ElapsedTime) {
-  auto Err = getDevice(DeviceId).getEventElapsedTime(StartEventPtr, EndEventPtr,
-                                                     ElapsedTime);
-  if (Err) {
+  auto ElapsedTimeOrErr =
+      getDevice(DeviceId).getEventElapsedTime(StartEventPtr, EndEventPtr);
+  if (!ElapsedTimeOrErr) {
     REPORT() << "Failure to get elapsed time between events " << StartEventPtr
-             << " and " << EndEventPtr << ": " << toString(std::move(Err));
+             << " and " << EndEventPtr << ": "
+             << toString(ElapsedTimeOrErr.takeError());
     return OFFLOAD_FAIL;
   }
 
+  *ElapsedTime = *ElapsedTimeOrErr;
   return OFFLOAD_SUCCESS;
 }
 
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 6d61da4fdc17b..7a47f2ce7e5aa 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1087,13 +1087,17 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// Get the elapsed time in milliseconds between two events.
-  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
-                                float *ElapsedTime) override {
+  Expected<float> getEventElapsedTimeImpl(void *StartEventPtr,
+                                          void *EndEventPtr) override {
     CUevent StartEvent = reinterpret_cast<CUevent>(StartEventPtr);
     CUevent EndEvent = reinterpret_cast<CUevent>(EndEventPtr);
 
-    CUresult Res = cuEventElapsedTime(ElapsedTime, StartEvent, EndEvent);
-    return Plugin::check(Res, "error in cuEventElapsedTime: %s");
+    float ElapsedTime = 0.0f;
+    CUresult Res = cuEventElapsedTime(&ElapsedTime, StartEvent, EndEvent);
+    if (auto Err = Plugin::check(Res, "error in cuEventElapsedTime: %s"))
+      return std::move(Err);
+
+    return ElapsedTime;
   }
 
   /// Print information about the device.
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 0f89f38d0ad73..bef49faf45383 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -360,10 +360,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return true;
   }
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
-  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
-                                float *ElapsedTime) override {
-    *ElapsedTime = 0.0f;
-    return Plugin::success();
+  Expected<float> getEventElapsedTimeImpl(void *StartEventPtr,
+                                          void *EndEventPtr) override {
+    return 0.0f;
   }
 
   /// Print information about the device.
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 19a88ca0b70ad..141c3f2cc75b5 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -626,8 +626,8 @@ class L0DeviceTy final : public GenericDeviceTy {
                          __func__);
   }
 
-  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
-                                float *ElapsedTime) override {
+  Expected<float> getEventElapsedTimeImpl(void *StartEventPtr,
+                                          void *EndEventPtr) override {
     return Plugin::error(error::ErrorCode::UNKNOWN, "%s not implemented yet\n",
                          __func__);
   }

>From 79bacf222ba8c6a8221f6225654b47500859933c Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 26 Mar 2026 23:40:55 -0300
Subject: [PATCH 11/16] Move timestamp frequency check out of lock

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 37063ab819b9d..d0c2c15c6816a 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3620,6 +3620,11 @@ Expected<float> AMDGPUEventTy::getElapsedTime(AMDGPUEventTy &EndEvent) {
     return 0.0f;
   }
 
+  const uint64_t TicksPerSecond = Device.getSystemTimestampFrequency();
+  if (TicksPerSecond == 0)
+    return Plugin::error(ErrorCode::UNSUPPORTED,
+                         "HSA system timestamp frequency is unavailable");
+
   std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, EndEvent.Mutex);
 
   if (&Device != &EndEvent.Device)
@@ -3636,11 +3641,6 @@ Expected<float> AMDGPUEventTy::getElapsedTime(AMDGPUEventTy &EndEvent) {
         ErrorCode::UNKNOWN,
         "timing information is not ready for one or both events");
 
-  const uint64_t TicksPerSecond = Device.getSystemTimestampFrequency();
-  if (TicksPerSecond == 0)
-    return Plugin::error(ErrorCode::UNSUPPORTED,
-                         "HSA system timestamp frequency is unavailable");
-
   hsa_amd_profiling_dispatch_time_t StartTime = {};
   hsa_amd_profiling_dispatch_time_t StopTime = {};
 

>From dea4f0d00f1d641bcb8bd760c00c76992ca0e00d Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Fri, 27 Mar 2026 00:59:35 -0300
Subject: [PATCH 12/16] Use device agent for event timing

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index d0c2c15c6816a..8fb8f4327d12b 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1695,7 +1695,7 @@ struct AMDGPUEventTy {
   /// Create an empty event.
   AMDGPUEventTy(AMDGPUDeviceTy &Device)
       : Device(Device), RecordedStream(nullptr), RecordedSlot(-1),
-        RecordedSyncCycle(-1), TimingSignal(nullptr), TimingAgent({0}) {}
+        RecordedSyncCycle(-1), TimingSignal(nullptr) {}
 
   /// Initialize and deinitialize.
   Error init() { return resetState(); }
@@ -1706,7 +1706,6 @@ struct AMDGPUEventTy {
     RecordedStream = nullptr;
     RecordedSlot = -1;
     RecordedSyncCycle = -1;
-    TimingAgent = {0};
 
     if (auto Err = releaseTimingSignal())
       return Err;
@@ -1791,10 +1790,6 @@ struct AMDGPUEventTy {
   /// unavailable for the current recording.
   AMDGPUSignalTy *TimingSignal;
 
-  /// The agent that owns the queue where the timing barrier was recorded. A
-  /// zero handle means timing is unavailable for the current recording.
-  hsa_agent_t TimingAgent;
-
   /// Mutex to safely access event fields.
   mutable std::mutex Mutex;
 
@@ -1836,10 +1831,8 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
   if (Queue->isProfilingEnabled()) {
     OutputSignal->increaseUseCount();
     Event.TimingSignal = OutputSignal;
-    Event.TimingAgent = Agent;
   } else {
     Event.TimingSignal = nullptr;
-    Event.TimingAgent = {0};
   }
 
   assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
@@ -3645,13 +3638,13 @@ Expected<float> AMDGPUEventTy::getElapsedTime(AMDGPUEventTy &EndEvent) {
   hsa_amd_profiling_dispatch_time_t StopTime = {};
 
   hsa_status_t Status = hsa_amd_profiling_get_dispatch_time(
-      TimingAgent, TimingSignal->get(), &StartTime);
+      Device.getAgent(), TimingSignal->get(), &StartTime);
   if (auto Err = Plugin::check(
           Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
     return std::move(Err);
 
   Status = hsa_amd_profiling_get_dispatch_time(
-      EndEvent.TimingAgent, EndEvent.TimingSignal->get(), &StopTime);
+      EndEvent.Device.getAgent(), EndEvent.TimingSignal->get(), &StopTime);
   if (auto Err = Plugin::check(
           Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
     return std::move(Err);

>From 0096deeef06cda644b8c899edfa7265d26025abe Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Sat, 28 Mar 2026 16:04:14 -0300
Subject: [PATCH 13/16] Capitalize rollback assert messages

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 8fb8f4327d12b..cf7e3fddb5aa5 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1172,8 +1172,8 @@ struct AMDGPUStreamTy {
   /// Roll back the last consumed slot after a submission failure so the stream
   /// does not retain a slot for an operation that was never enqueued.
   void rollbackConsumedSlot(uint32_t Slot) {
-    assert(NextSlot > 0 && "cannot roll back an empty stream");
-    assert(Slot + 1 == NextSlot && "can only roll back the last consumed slot");
+    assert(NextSlot > 0 && "Cannot roll back an empty stream");
+    assert(Slot + 1 == NextSlot && "Can only roll back the last consumed slot");
 
     Slots[Slot].Signal = nullptr;
     Slots[Slot].Callbacks.clear();

>From 0a3330a9a71c3799130a9816c3a2c605846aec9b Mon Sep 17 00:00:00 2001
From: "Leandro A. Lacerda Campos" <leandrolcampos at yahoo.com.br>
Date: Mon, 30 Mar 2026 21:21:22 -0300
Subject: [PATCH 14/16] Generalize refcount increments and simplify AMDGPU
 event signal retention

Signed-off-by: Leandro A. Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/include/Shared/RefCnt.h            | 20 ++++++++--------
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 28 +++++++++++-----------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/offload/include/Shared/RefCnt.h b/offload/include/Shared/RefCnt.h
index 7c615ba167a3d..5031a6ff90246 100644
--- a/offload/include/Shared/RefCnt.h
+++ b/offload/include/Shared/RefCnt.h
@@ -31,16 +31,16 @@ struct RefCountTy {
 
   ~RefCountTy() { assert(Refs == 0 && "Destroying with non-zero refcount"); }
 
-  /// Increase the reference count atomically.
-  void increase() { Refs.fetch_add(1, MemoryOrder); }
-
-  /// Decrease the reference count and return whether it became zero. Decreasing
-  /// the counter in more units than it was previously increased results in
-  /// undefined behavior.
-  bool decrease() {
-    Ty Prev = Refs.fetch_sub(1, MemoryOrder);
-    assert(Prev > 0 && "Invalid refcount");
-    return (Prev == 1);
+  /// Increase the reference count atomically by \p Amount.
+  void increase(Ty Amount = 1) { Refs.fetch_add(Amount, MemoryOrder); }
+
+  /// Decrease the reference count by \p Amount and return whether it became
+  /// zero. Decreasing the counter by more than it was previously increased
+  /// results in undefined behavior.
+  bool decrease(Ty Amount = 1) {
+    Ty Prev = Refs.fetch_sub(Amount, MemoryOrder);
+    assert(Prev >= Amount && "Invalid refcount");
+    return (Prev == Amount);
   }
 
   Ty get() const { return Refs.load(MemoryOrder); }
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index cf7e3fddb5aa5..92f08620af0c9 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -690,11 +690,14 @@ struct AMDGPUSignalTy {
   /// plugin thread or the HSA runtime.
   void reset() { hsa_signal_store_screlease(HSASignal, 1); }
 
-  /// Increase the number of concurrent uses.
-  void increaseUseCount() { UseCount.increase(); }
+  /// Increase the number of concurrent uses by \p Amount.
+  void increaseUseCount(uint32_t Amount = 1) { UseCount.increase(Amount); }
 
-  /// Decrease the number of concurrent uses and return whether was the last.
-  bool decreaseUseCount() { return UseCount.decrease(); }
+  /// Decrease the number of concurrent uses by \p Amount and return whether it
+  /// became zero.
+  bool decreaseUseCount(uint32_t Amount = 1) {
+    return UseCount.decrease(Amount);
+  }
 
   hsa_signal_t get() const { return HSASignal; }
 
@@ -704,7 +707,7 @@ struct AMDGPUSignalTy {
 
   /// Reference counter for tracking the concurrent use count. This is mainly
   /// used for knowing how many streams are using the signal.
-  RefCountTy<> UseCount;
+  RefCountTy<uint32_t> UseCount;
 };
 
 /// Classes for holding AMDGPU signals and managing signals.
@@ -1801,12 +1804,15 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
     return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
                          "target queue was nullptr");
 
+  const bool RetainTimingSignal = Queue->isProfilingEnabled();
+  const uint32_t SignalUses = 1 + RetainTimingSignal;
+
   // Retrieve an available signal for the operation's output.
   AMDGPUSignalTy *OutputSignal = nullptr;
   if (auto Err = SignalManager.getResource(OutputSignal))
     return Err;
   OutputSignal->reset();
-  OutputSignal->increaseUseCount();
+  OutputSignal->increaseUseCount(SignalUses);
 
   std::lock_guard<std::mutex> StreamLock(Mutex);
 
@@ -1818,7 +1824,7 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
   if (auto Err = Queue->pushBarrier(OutputSignal, InputSignal, nullptr)) {
     rollbackConsumedSlot(Curr);
 
-    if (OutputSignal->decreaseUseCount())
+    if (OutputSignal->decreaseUseCount(SignalUses))
       if (auto ReturnErr = SignalManager.returnResource(OutputSignal))
         return joinErrors(std::move(Err), std::move(ReturnErr));
 
@@ -1827,13 +1833,7 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
 
   Event.RecordedSlot = Curr;
   Event.RecordedSyncCycle = SyncCycle;
-
-  if (Queue->isProfilingEnabled()) {
-    OutputSignal->increaseUseCount();
-    Event.TimingSignal = OutputSignal;
-  } else {
-    Event.TimingSignal = nullptr;
-  }
+  Event.TimingSignal = RetainTimingSignal ? OutputSignal : nullptr;
 
   assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
   assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");

>From dddc09ab6713dcc5c55284f6e6816603a0d5b68d Mon Sep 17 00:00:00 2001
From: "Leandro A. Lacerda Campos" <leandrolcampos at yahoo.com.br>
Date: Mon, 30 Mar 2026 22:36:52 -0300
Subject: [PATCH 15/16] Reuse retained timing signals when re-recording AMDGPU
 events

Signed-off-by: Leandro A. Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 63 +++++++++++++---------
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 92f08620af0c9..9c2c5f9477638 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1684,7 +1684,8 @@ struct AMDGPUStreamTy {
   const AMDGPUQueueTy *getQueue() const { return Queue; }
 
   /// Record an event by enqueuing a barrier marker packet on the stream.
-  Error recordEvent(AMDGPUEventTy &Event);
+  Error recordEvent(AMDGPUEventTy &Event,
+                    AMDGPUSignalTy *ReusedSignal = nullptr);
 
   /// Make the stream wait on an event.
   Error waitEvent(const AMDGPUEventTy &Event);
@@ -1704,29 +1705,28 @@ struct AMDGPUEventTy {
   Error init() { return resetState(); }
   Error deinit() { return resetState(); }
 
-  /// Clear the current recording and retained timing state.
-  Error resetState() {
+  /// Clear the current recording and retained timing state, optionally
+  /// returning a reusable timing signal.
+  Error resetState(AMDGPUSignalTy **ReusableSignalPtr = nullptr) {
     RecordedStream = nullptr;
     RecordedSlot = -1;
     RecordedSyncCycle = -1;
-
-    if (auto Err = releaseTimingSignal())
-      return Err;
-
-    return Plugin::success();
+    return releaseTimingSignal(ReusableSignalPtr);
   }
 
   /// Record the current stream point on the event.
   Error record(AMDGPUStreamTy &Stream) {
     std::lock_guard<std::mutex> Lock(Mutex);
 
-    // Discard the previous recording and retained timing state.
-    if (auto Err = resetState())
+    // Discard the previous recording and retained timing state, reusing the
+    // retained timing signal if it becomes available.
+    AMDGPUSignalTy *Signal = nullptr;
+    if (auto Err = resetState(&Signal))
       return Err;
 
     RecordedStream = &Stream;
 
-    if (auto Err = Stream.recordEvent(*this)) {
+    if (auto Err = Stream.recordEvent(*this, Signal)) {
       if (auto ResetErr = resetState())
         return joinErrors(std::move(Err), std::move(ResetErr));
       return Err;
@@ -1774,8 +1774,9 @@ struct AMDGPUEventTy {
   Expected<float> getElapsedTime(AMDGPUEventTy &EndEvent);
 
 protected:
-  /// Release the retained timing signal, if any, back to the signal manager.
-  Error releaseTimingSignal();
+  /// Release the retained timing signal, if any, either back to the signal
+  /// manager or through \p ReusableSignalPtr when provided.
+  Error releaseTimingSignal(AMDGPUSignalTy **ReusableSignalPtr = nullptr);
 
   /// The device that owns this event.
   AMDGPUDeviceTy &Device;
@@ -1799,7 +1800,8 @@ struct AMDGPUEventTy {
   friend struct AMDGPUStreamTy;
 };
 
-Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
+Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event,
+                                  AMDGPUSignalTy *ReusedSignal) {
   if (Queue == nullptr)
     return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
                          "target queue was nullptr");
@@ -1807,10 +1809,13 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
   const bool RetainTimingSignal = Queue->isProfilingEnabled();
   const uint32_t SignalUses = 1 + RetainTimingSignal;
 
-  // Retrieve an available signal for the operation's output.
-  AMDGPUSignalTy *OutputSignal = nullptr;
-  if (auto Err = SignalManager.getResource(OutputSignal))
-    return Err;
+  // Reuse the provided signal or retrieve one for the operation's output.
+  AMDGPUSignalTy *OutputSignal = ReusedSignal;
+  if (!OutputSignal) {
+    if (auto Err = SignalManager.getResource(OutputSignal))
+      return Err;
+  }
+
   OutputSignal->reset();
   OutputSignal->increaseUseCount(SignalUses);
 
@@ -1824,9 +1829,10 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
   if (auto Err = Queue->pushBarrier(OutputSignal, InputSignal, nullptr)) {
     rollbackConsumedSlot(Curr);
 
-    if (OutputSignal->decreaseUseCount(SignalUses))
+    if (OutputSignal->decreaseUseCount(SignalUses)) {
       if (auto ReturnErr = SignalManager.returnResource(OutputSignal))
         return joinErrors(std::move(Err), std::move(ReturnErr));
+    }
 
     return Err;
   }
@@ -3586,17 +3592,22 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
       UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
 
-Error AMDGPUEventTy::releaseTimingSignal() {
-  if (!TimingSignal)
-    return Plugin::success();
-
+Error AMDGPUEventTy::releaseTimingSignal(AMDGPUSignalTy **ReusableSignalPtr) {
   AMDGPUSignalTy *Signal = TimingSignal;
   TimingSignal = nullptr;
 
-  if (Signal->decreaseUseCount())
-    return Device.getSignalManager().returnResource(Signal);
+  if (!Signal)
+    return Plugin::success();
 
-  return Plugin::success();
+  if (!Signal->decreaseUseCount())
+    return Plugin::success();
+
+  if (ReusableSignalPtr && !(*ReusableSignalPtr)) {
+    *ReusableSignalPtr = Signal;
+    return Plugin::success();
+  }
+
+  return Device.getSignalManager().returnResource(Signal);
 }
 
 Expected<float> AMDGPUEventTy::getElapsedTime(AMDGPUEventTy &EndEvent) {

>From 3a745b83c16a0017230afdab26156cad031b6ff7 Mon Sep 17 00:00:00 2001
From: "Leandro A. Lacerda Campos" <leandrolcampos at yahoo.com.br>
Date: Mon, 30 Mar 2026 22:45:29 -0300
Subject: [PATCH 16/16] Add braces to outer nested ifs per LLVM coding
 standards

Signed-off-by: Leandro A. Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/liboffload/src/OffloadImpl.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 3bc64032e2604..77933e6291f4a 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -925,10 +925,11 @@ Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) {
 
   if (auto Err = Queue->Device->Device->recordEvent(Event->EventInfo,
                                                     Queue->AsyncInfo)) {
-    if (Event->EventInfo)
+    if (Event->EventInfo) {
       if (auto DestroyErr =
               Queue->Device->Device->destroyEvent(Event->EventInfo))
         return joinErrors(std::move(Err), std::move(DestroyErr));
+    }
 
     return Err;
   }



More information about the llvm-commits mailing list