[llvm] [Offload] Add support for measuring elapsed time between events (PR #186856)

Sat Mar 21 11:05:11 PDT 2026

https://github.com/leandrolcampos updated https://github.com/llvm/llvm-project/pull/186856

>From 1788404f910795fcfd1183393e06e184904b1aac Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:30:59 -0300
Subject: [PATCH 1/7] Add offload event timing

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/liboffload/API/Event.td               |  17 +-
 offload/liboffload/src/OffloadImpl.cpp        |  60 ++++---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |   7 +
 .../common/include/PluginInterface.h          |  10 ++
 .../common/src/PluginInterface.cpp            |  21 +++
 .../cuda/dynamic_cuda/cuda.cpp                |   1 +
 .../plugins-nextgen/cuda/dynamic_cuda/cuda.h  |   1 +
 offload/plugins-nextgen/cuda/src/rtl.cpp      |  10 ++
 offload/plugins-nextgen/host/src/rtl.cpp      |   6 +
 .../level_zero/include/L0Device.h             |   6 +
 offload/unittests/OffloadAPI/CMakeLists.txt   |   1 +
 .../event/olGetEventElapsedTime.cpp           | 146 ++++++++++++++++++
 12 files changed, 266 insertions(+), 20 deletions(-)
 create mode 100644 offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp

diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
index 075bf5bafaa64..be77500562a19 100644
--- a/offload/liboffload/API/Event.td
+++ b/offload/liboffload/API/Event.td
@@ -13,7 +13,8 @@
 def olCreateEvent : Function {
     let desc = "Enqueue an event to `Queue` and return it.";
     let details = [
-      "This event can be used with `olSyncEvent` and `olWaitEvents` and will be complete once all enqueued work prior to the `olCreateEvent` call is complete.",
+      "This event can be used with `olSyncEvent`, `olWaitEvents`, and `olGetEventElapsedTime`.",
+      "It will be complete once all enqueued work prior to the `olCreateEvent` call is complete.",
     ];
     let params = [
         Param<"ol_queue_handle_t", "Queue", "queue to create the event for", PARAM_IN>,
@@ -40,6 +41,20 @@ def olSyncEvent : Function {
     let returns = [];
 }
 
+def olGetEventElapsedTime : Function {
+    let desc = "Get the elapsed time in milliseconds between two events.";
+    let details = [
+        "The elapsed time is returned in milliseconds.",
+        "The queues associated with `StartEvent` and `EndEvent` must belong to the same device."
+    ];
+    let params = [
+        Param<"ol_event_handle_t", "StartEvent", "handle of the start event", PARAM_IN>,
+        Param<"ol_event_handle_t", "EndEvent", "handle of the end event", PARAM_IN>,
+        Param<"float*", "ElapsedTime", "output pointer for the elapsed time in milliseconds", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
 def ol_event_info_t : Enum {
   let desc = "Supported event info.";
   let is_typed = 1;
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index dd3ec0f61b4da..48748074c79cf 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -175,8 +175,8 @@ struct ol_event_impl_t {
                   ol_queue_handle_t Queue)
       : EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) {
   }
-  // EventInfo may be null, in which case the event should be considered always
-  // complete
+  // Opaque backend-specific event state. This is expected to be non-null for
+  // backends that materialize real events.
   void *EventInfo;
   ol_device_handle_t Device;
   size_t QueueId;
@@ -794,7 +794,8 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events,
       return Plugin::error(ErrorCode::INVALID_NULL_HANDLE,
                            "olWaitEvents asked to wait on a NULL event");
 
-    // Do nothing if the event is for this queue or the event is always complete
+    // Do nothing if the event is for this queue or the backend does not
+    // materialize event state for it.
     if (Event->QueueId == Queue->Id || !Event->EventInfo)
       continue;
 
@@ -839,7 +840,8 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName,
 }
 
 Error olSyncEvent_impl(ol_event_handle_t Event) {
-  // No event info means that this event was complete on creation
+  // Some backends do not materialize backend event state. Treat such events as
+  // trivially complete.
   if (!Event->EventInfo)
     return Plugin::success();
 
@@ -849,6 +851,26 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
   return Error::success();
 }
 
+Error olGetEventElapsedTime_impl(ol_event_handle_t StartEvent,
+                                 ol_event_handle_t EndEvent,
+                                 float *ElapsedTime) {
+  if (!StartEvent || !EndEvent)
+    return createOffloadError(ErrorCode::INVALID_NULL_HANDLE,
+                              "olGetEventElapsedTime was given a NULL event");
+
+  if (!ElapsedTime)
+    return createOffloadError(ErrorCode::INVALID_NULL_POINTER,
+                              "ElapsedTime is null");
+
+  if (StartEvent->Device != EndEvent->Device)
+    return createOffloadError(
+        ErrorCode::INVALID_DEVICE,
+        "StartEvent and EndEvent must belong to the same device");
+
+  return StartEvent->Device->Device->getEventElapsedTime(
+      StartEvent->EventInfo, EndEvent->EventInfo, ElapsedTime);
+}
+
 Error olDestroyEvent_impl(ol_event_handle_t Event) {
   if (Event->EventInfo)
     if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo))
@@ -867,7 +889,8 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event,
   case OL_EVENT_INFO_QUEUE:
     return Info.write<ol_queue_handle_t>(Queue);
   case OL_EVENT_INFO_IS_COMPLETE: {
-    // No event info means that this event was complete on creation
+    // Some backends do not materialize backend event state. Treat such events as
+    // trivially complete.
     if (!Event->EventInfo)
       return Info.write<bool>(true);
 
@@ -898,24 +921,23 @@ Error olGetEventInfoSize_impl(ol_event_handle_t Event, ol_event_info_t PropName,
 }
 
 Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) {
-  auto Pending = Queue->Device->Device->hasPendingWork(Queue->AsyncInfo);
-  if (auto Err = Pending.takeError())
-    return Err;
+  auto Event = std::make_unique<ol_event_impl_t>(nullptr, Queue->Device, Queue);
 
-  *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue);
-  if (!*Pending)
-    // Queue is empty, don't record an event and consider the event always
-    // complete
-    return Plugin::success();
+  if (auto Err = Queue->Device->Device->createEvent(&Event->EventInfo))
+    return Err;
 
-  if (auto Res = Queue->Device->Device->createEvent(&(*EventOut)->EventInfo))
-    return Res;
+  if (auto Err = Queue->Device->Device->recordEvent(Event->EventInfo,
+                                                    Queue->AsyncInfo)) {
+    if (Event->EventInfo)
+      if (auto DestroyErr =
+              Queue->Device->Device->destroyEvent(Event->EventInfo))
+        return joinErrors(std::move(Err), std::move(DestroyErr));
 
-  if (auto Res = Queue->Device->Device->recordEvent((*EventOut)->EventInfo,
-                                                    Queue->AsyncInfo))
-    return Res;
+    return Err;
+  }
 
-  return Plugin::success();
+  *EventOut = Event.release();
+  return Error::success();
 }
 
 Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 3f9064eaf00b3..70aff9f43c5b0 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2871,6 +2871,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Event->sync();
   }
 
+  /// Get the elapsed time in milliseconds between two events.
+  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                float *ElapsedTime) override {
+    return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet",
+                         __func__);
+  }
+
   /// Print information about the device.
   Expected<InfoTreeNode> obtainInfoImpl() override {
     char TmpChar[1000];
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index b6a54f05b1dcc..04a4a78a24ba9 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -994,6 +994,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error syncEvent(void *EventPtr);
   virtual Error syncEventImpl(void *EventPtr) = 0;
 
+  /// Get the elapsed time in milliseconds between two events.
+  Error getEventElapsedTime(void *StartEventPtr, void *EndEventPtr,
+                            float *ElapsedTime);
+  virtual Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                        float *ElapsedTime) = 0;
+
   /// Obtain information about the device.
   Expected<InfoTreeNode> obtainInfo();
   virtual Expected<InfoTreeNode> obtainInfoImpl() = 0;
@@ -1543,6 +1549,10 @@ struct GenericPluginTy {
   /// Synchronize execution until an event is done.
   int32_t sync_event(int32_t DeviceId, void *EventPtr);
 
+  /// Get the elapsed time in milliseconds between two events.
+  int32_t get_event_elapsed_time(int32_t DeviceId, void *StartEventPtr,
+                                 void *EndEventPtr, float *ElapsedTime);
+
   /// Remove the event from the plugin.
   int32_t destroy_event(int32_t DeviceId, void *EventPtr);
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 4093d08044bc3..6cc462e6162dd 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1568,6 +1568,12 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
   return syncEventImpl(EventPtr);
 }
 
+Error GenericDeviceTy::getEventElapsedTime(void *StartEventPtr,
+                                           void *EndEventPtr,
+                                           float *ElapsedTime) {
+  return getEventElapsedTimeImpl(StartEventPtr, EndEventPtr, ElapsedTime);
+}
+
 bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
 
 Expected<bool> GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
@@ -2087,6 +2093,21 @@ int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
   return OFFLOAD_SUCCESS;
 }
 
+int32_t GenericPluginTy::get_event_elapsed_time(int32_t DeviceId,
+                                                void *StartEventPtr,
+                                                void *EndEventPtr,
+                                                float *ElapsedTime) {
+  auto Err = getDevice(DeviceId).getEventElapsedTime(StartEventPtr, EndEventPtr,
+                                                     ElapsedTime);
+  if (Err) {
+    REPORT() << "Failure to get elapsed time between events " << StartEventPtr
+             << " and " << EndEventPtr << ": " << toString(std::move(Err));
+    return OFFLOAD_FAIL;
+  }
+
+  return OFFLOAD_SUCCESS;
+}
+
 int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
   auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
   if (Err) {
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 80e3e418ae3fa..8fc8d0e43fab2 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -97,6 +97,7 @@ DLWRAP(cuEventRecord, 2)
 DLWRAP(cuEventQuery, 1)
 DLWRAP(cuStreamWaitEvent, 3)
 DLWRAP(cuEventSynchronize, 1)
+DLWRAP(cuEventElapsedTime, 3)
 DLWRAP(cuEventDestroy, 1)
 
 DLWRAP_FINALIZE()
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index fa4f4634ecec3..dd47fb98dc03a 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -371,6 +371,7 @@ CUresult cuEventRecord(CUevent, CUstream);
 CUresult cuEventQuery(CUevent);
 CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
 CUresult cuEventSynchronize(CUevent);
+CUresult cuEventElapsedTime(float *, CUevent, CUevent);
 CUresult cuEventDestroy(CUevent);
 
 CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 4de754265ea77..6d61da4fdc17b 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1086,6 +1086,16 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "error in cuEventSynchronize: %s");
   }
 
+  /// Get the elapsed time in milliseconds between two events.
+  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                float *ElapsedTime) override {
+    CUevent StartEvent = reinterpret_cast<CUevent>(StartEventPtr);
+    CUevent EndEvent = reinterpret_cast<CUevent>(EndEventPtr);
+
+    CUresult Res = cuEventElapsedTime(ElapsedTime, StartEvent, EndEvent);
+    return Plugin::check(Res, "error in cuEventElapsedTime: %s");
+  }
+
   /// Print information about the device.
   Expected<InfoTreeNode> obtainInfoImpl() override {
     char TmpChar[1000];
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 077dd14b959e0..b90478a520b39 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -360,6 +360,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return true;
   }
   Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
+  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                float *ElapsedTime) override {
+    if (ElapsedTime)
+      *ElapsedTime = 0.0f;
+    return Plugin::success();
+  }
 
   /// Print information about the device.
   Expected<InfoTreeNode> obtainInfoImpl() override {
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 001a41ba77d7b..19a88ca0b70ad 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -626,6 +626,12 @@ class L0DeviceTy final : public GenericDeviceTy {
                          __func__);
   }
 
+  Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+                                float *ElapsedTime) override {
+    return Plugin::error(error::ErrorCode::UNIMPLEMENTED,
+                         "%s not implemented yet\n", __func__);
+  }
+
   Expected<InfoTreeNode> obtainInfoImpl() override;
   uint64_t getClockFrequency() const override { return getClockRate(); }
   uint64_t getHardwareParallelism() const override { return getTotalThreads(); }
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 031dbea660fbc..39863391f27d6 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -13,6 +13,7 @@ add_offload_unittest("event"
     event/olCreateEvent.cpp
     event/olDestroyEvent.cpp
     event/olSyncEvent.cpp
+    event/olGetEventElapsedTime.cpp
     event/olGetEventInfo.cpp
     event/olGetEventInfoSize.cpp)
 
diff --git a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
new file mode 100644
index 0000000000000..13da46a798140
--- /dev/null
+++ b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
@@ -0,0 +1,146 @@
+//===------- Offload API tests - olGetEventElapsedTime --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include "llvm/Support/MemoryBuffer.h"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+struct olGetEventElapsedTimeTest : OffloadQueueTest {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
+
+    ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin));
+    ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
+                                   DeviceBin->getBufferSize(), &Program));
+    ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &Kernel));
+
+    LaunchArgs.Dimensions = 1;
+    LaunchArgs.GroupSize = {64, 1, 1};
+    LaunchArgs.NumGroups = {1, 1, 1};
+    LaunchArgs.DynSharedMemory = 0;
+
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                              LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+  }
+
+  void TearDown() override {
+    if (Mem)
+      ASSERT_SUCCESS(olMemFree(Mem));
+    if (Program)
+      ASSERT_SUCCESS(olDestroyProgram(Program));
+    RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
+  }
+
+  void launchFoo() {
+    struct {
+      void *Mem;
+    } Args{Mem};
+
+    ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                  &LaunchArgs));
+  }
+
+  std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+  ol_program_handle_t Program = nullptr;
+  ol_symbol_handle_t Kernel = nullptr;
+  ol_kernel_launch_size_args_t LaunchArgs{};
+  void *Mem = nullptr;
+};
+
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetEventElapsedTimeTest);
+
+TEST_P(olGetEventElapsedTimeTest, Success) {
+  ol_event_handle_t StartEvent = nullptr;
+  ol_event_handle_t EndEvent = nullptr;
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+  ASSERT_NE(StartEvent, nullptr);
+
+  ASSERT_NO_FATAL_FAILURE(launchFoo()); // Stop if the launch failed.
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+  ASSERT_NE(EndEvent, nullptr);
+
+  ASSERT_SUCCESS(olSyncEvent(EndEvent));
+
+  float ElapsedTime = -1.0f;
+
+  ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTime));
+  ASSERT_GE(ElapsedTime, 0.0f);
+
+  ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+  ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, SuccessMultipleCalls) {
+  ol_event_handle_t StartEvent = nullptr;
+  ol_event_handle_t EndEvent = nullptr;
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+  ASSERT_NE(StartEvent, nullptr);
+
+  ASSERT_NO_FATAL_FAILURE(launchFoo()); // Stop if the launch failed.
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+  ASSERT_NE(EndEvent, nullptr);
+
+  ASSERT_SUCCESS(olSyncEvent(EndEvent));
+
+  float ElapsedTimeA = -1.0f;
+  float ElapsedTimeB = -1.0f;
+
+  ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTimeA));
+  ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTimeB));
+
+  ASSERT_GE(ElapsedTimeA, 0.0f);
+  ASSERT_GE(ElapsedTimeB, 0.0f);
+
+  ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+  ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullStartEvent) {
+  ol_event_handle_t EndEvent = nullptr;
+  ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+
+  float ElapsedTime = 0.0f;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olGetEventElapsedTime(nullptr, EndEvent, &ElapsedTime));
+
+  ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullEndEvent) {
+  ol_event_handle_t StartEvent = nullptr;
+  ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+
+  float ElapsedTime = 0.0f;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olGetEventElapsedTime(StartEvent, nullptr, &ElapsedTime));
+
+  ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullElapsedTime) {
+  ol_event_handle_t StartEvent = nullptr;
+  ol_event_handle_t EndEvent = nullptr;
+
+  ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+  ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olGetEventElapsedTime(StartEvent, EndEvent, nullptr));
+
+  ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+  ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+} // namespace
\ No newline at end of file

>From e1743262c9667c0306526baae0c1ae524b27c47c Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:36:12 -0300
Subject: [PATCH 2/7] Add event elapsed-time support for AMDGPU plugin

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 .../amdgpu/dynamic_hsa/hsa.cpp                |   2 +
 .../plugins-nextgen/amdgpu/dynamic_hsa/hsa.h  |   2 +
 .../amdgpu/dynamic_hsa/hsa_ext_amd.h          |  12 +
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 258 ++++++++++++++++--
 .../event/olGetEventElapsedTime.cpp           |   2 +-
 5 files changed, 249 insertions(+), 27 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index 37d12861eb387..279a296dd1618 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -70,6 +70,8 @@ DLWRAP(hsa_amd_register_system_event_handler, 2)
 DLWRAP(hsa_amd_signal_create, 5)
 DLWRAP(hsa_amd_signal_async_handler, 5)
 DLWRAP(hsa_amd_pointer_info, 5)
+DLWRAP(hsa_amd_profiling_get_dispatch_time, 3)
+DLWRAP(hsa_amd_profiling_set_profiler_enabled, 2)
 DLWRAP(hsa_code_object_reader_create_from_memory, 3)
 DLWRAP(hsa_code_object_reader_destroy, 1)
 DLWRAP(hsa_executable_load_agent_code_object, 5)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index ad135f72fff12..f6e3337ddb3f4 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -99,6 +99,8 @@ typedef enum {
 typedef enum {
   HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
   HSA_SYSTEM_INFO_VERSION_MINOR = 1,
+  HSA_SYSTEM_INFO_TIMESTAMP = 2,
+  HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3,
 } hsa_system_info_t;
 
 typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index ddfa65c76cf2d..7ff77f8e2a2fa 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -169,6 +169,18 @@ hsa_status_t hsa_amd_pointer_info(const void* ptr,
                                           uint32_t* num_agents_accessible,
                                           hsa_agent_t** accessible);
 
+typedef struct hsa_amd_profiling_dispatch_time_s {
+  uint64_t start;
+  uint64_t end;
+} hsa_amd_profiling_dispatch_time_t;
+
+hsa_status_t
+hsa_amd_profiling_get_dispatch_time(hsa_agent_t agent, hsa_signal_t signal,
+                                    hsa_amd_profiling_dispatch_time_t *time);
+
+hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t *queue,
+                                                    int enable);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 70aff9f43c5b0..58dabbc374486 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -714,16 +714,32 @@ using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
 /// Class holding an HSA queue to submit kernel and barrier packets.
 struct AMDGPUQueueTy {
   /// Create an empty queue.
-  AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
+  AMDGPUQueueTy()
+      : Queue(nullptr), ProfilingEnabled(false), Mutex(), NumUsers(0) {}
 
   /// Lazily initialize a new queue belonging to a specific agent.
   Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
     if (Queue)
       return Plugin::success();
+
     hsa_status_t Status =
         hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
                          &Device, UINT32_MAX, UINT32_MAX, &Queue);
-    return Plugin::check(Status, "error in hsa_queue_create: %s");
+    if (auto Err = Plugin::check(Status, "error in hsa_queue_create: %s"))
+      return Err;
+
+    // Keep queue profiling enabled from creation time onward, as HIP/ROCclr
+    // does. Elapsed-time queries rely on queue-level hardware profiling support
+    // to retrieve packet timing.
+    //
+    // If enabling profiling fails, leave the queue usable and preserve existing
+    // event behavior; only elapsed-time queries will later report that timing
+    // is unavailable.
+    Status = hsa_amd_profiling_set_profiler_enabled(Queue, 1);
+    if (Status == HSA_STATUS_SUCCESS)
+      ProfilingEnabled = true;
+
+    return Plugin::success();
   }
 
   /// Deinitialize the queue and destroy its resources.
@@ -731,10 +747,16 @@ struct AMDGPUQueueTy {
     std::lock_guard<std::mutex> Lock(Mutex);
     if (!Queue)
       return Plugin::success();
+
+    ProfilingEnabled = false;
+
     hsa_status_t Status = hsa_queue_destroy(Queue);
     return Plugin::check(Status, "error in hsa_queue_destroy: %s");
   }
 
+  /// Returns whether profiling is enabled on the underlying HSA queue.
+  bool isProfilingEnabled() const { return ProfilingEnabled; }
+
   /// Returns the number of streams, this queue is currently assigned to.
   bool getUserCount() const { return NumUsers; }
 
@@ -917,6 +939,9 @@ struct AMDGPUQueueTy {
   /// The HSA queue.
   hsa_queue_t *Queue;
 
+  /// Indicates whether profiling is enabled on the underlying HSA queue.
+  bool ProfilingEnabled;
+
   /// Mutex to protect the acquiring and publishing of packets. For the moment,
   /// we need this mutex to prevent publishing packets that are not ready to be
   /// published in a multi-thread scenario. Without a queue lock, a thread T1
@@ -1144,6 +1169,18 @@ struct AMDGPUStreamTy {
     return {Curr, InputSignal};
   }
 
+  /// Roll back the last consumed slot after a submission failure so the stream
+  /// does not retain a slot for an operation that was never enqueued.
+  void rollbackConsumedSlot(uint32_t Slot) {
+    assert(NextSlot > 0 && "cannot roll back an empty stream");
+    assert(Slot + 1 == NextSlot && "can only roll back the last consumed slot");
+
+    Slots[Slot].Signal = nullptr;
+    Slots[Slot].Callbacks.clear();
+    Slots[Slot].ActionArgs.clear();
+    --NextSlot;
+  }
+
   /// Complete all pending post actions and reset the stream after synchronizing
   /// or positively querying the stream.
   Error complete() {
@@ -1643,8 +1680,8 @@ struct AMDGPUStreamTy {
 
   const AMDGPUQueueTy *getQueue() const { return Queue; }
 
-  /// Record the state of the stream on an event.
-  Error recordEvent(AMDGPUEventTy &Event) const;
+  /// Record an event by enqueuing a barrier marker packet on the stream.
+  Error recordEvent(AMDGPUEventTy &Event);
 
   /// Make the stream wait on an event.
   Error waitEvent(const AMDGPUEventTy &Event);
@@ -1652,25 +1689,47 @@ struct AMDGPUStreamTy {
   friend struct AMDGPUStreamManagerTy;
 };
 
-/// Class representing an event on AMDGPU. The event basically stores some
-/// information regarding the state of the recorded stream.
+/// Class representing an event on AMDGPU. The event stores the recorded stream
+/// point and retained timing state.
 struct AMDGPUEventTy {
   /// Create an empty event.
   AMDGPUEventTy(AMDGPUDeviceTy &Device)
-      : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
+      : Device(Device), RecordedStream(nullptr), RecordedSlot(-1),
+        RecordedSyncCycle(-1), TimingSignal(nullptr), TimingAgent({0}) {}
 
   /// Initialize and deinitialize.
-  Error init() { return Plugin::success(); }
-  Error deinit() { return Plugin::success(); }
+  Error init() { return resetState(); }
+  Error deinit() { return resetState(); }
+
+  /// Clear the current recording and retained timing state.
+  Error resetState() {
+    if (auto Err = releaseTimingSignal())
+      return Err;
+
+    RecordedStream = nullptr;
+    RecordedSlot = -1;
+    RecordedSyncCycle = -1;
+    TimingAgent = {0};
+
+    return Plugin::success();
+  }
 
   /// Record the state of a stream on the event.
   Error record(AMDGPUStreamTy &Stream) {
     std::lock_guard<std::mutex> Lock(Mutex);
 
     // Ignore the last recorded stream.
+    if (auto Err = resetState())
+      return Err;
+
     RecordedStream = &Stream;
 
-    return Stream.recordEvent(*this);
+    if (auto Err = Stream.recordEvent(*this)) {
+      llvm::consumeError(resetState());
+      return Err;
+    }
+
+    return Plugin::success();
   }
 
   /// Make a stream wait on the current event.
@@ -1708,38 +1767,80 @@ struct AMDGPUEventTy {
     return RecordedStream->synchronizeOn(*this);
   }
 
+  /// Return the elapsed time in milliseconds between this event and EndEvent.
+  Error elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime);
+
 protected:
+  /// Release the retained timing signal, if any, back to the signal manager.
+  Error releaseTimingSignal();
+
+  /// The device that owns this event.
+  AMDGPUDeviceTy &Device;
+
   /// The stream registered in this event.
   AMDGPUStreamTy *RecordedStream;
 
-  /// The recordered operation on the recorded stream.
+  /// The recorded operation on the recorded stream.
   int64_t RecordedSlot;
 
   /// The sync cycle when the stream was recorded. Used to detect stale events.
   int64_t RecordedSyncCycle;
 
+  /// The signal of the recorded barrier marker used for timing. Null means
+  /// timing is unavailable for the current recording.
+  AMDGPUSignalTy *TimingSignal;
+
+  /// The agent that owns the queue where the timing marker was recorded. A zero
+  /// handle means timing is unavailable for the current recording.
+  hsa_agent_t TimingAgent;
+
   /// Mutex to safely access event fields.
   mutable std::mutex Mutex;
 
   friend struct AMDGPUStreamTy;
 };
 
-Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
-  std::lock_guard<std::mutex> Lock(Mutex);
+Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
+  if (Queue == nullptr)
+    return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
+                         "target queue was nullptr");
+
+  // Retrieve an available signal for the operation's output.
+  AMDGPUSignalTy *OutputSignal = nullptr;
+  if (auto Err = SignalManager.getResource(OutputSignal))
+    return Err;
+  OutputSignal->reset();
+  OutputSignal->increaseUseCount();
 
-  if (size() > 0) {
-    // Record the synchronize identifier (to detect stale recordings) and
-    // the last valid stream's operation.
-    Event.RecordedSyncCycle = SyncCycle;
-    Event.RecordedSlot = last();
+  std::lock_guard<std::mutex> StreamLock(Mutex);
 
-    assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
-    assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
+  // Consume stream slot and compute dependencies.
+  auto [Curr, InputSignal] = consume(OutputSignal);
+
+  // Materialize the event as a real marker on the queue. Elapsed-time queries
+  // need a packet-backed completion signal to retrieve dispatch timing.
+  if (auto Err = Queue->pushBarrier(OutputSignal, InputSignal, nullptr)) {
+    rollbackConsumedSlot(Curr);
+    if (OutputSignal->decreaseUseCount())
+      llvm::consumeError(SignalManager.returnResource(OutputSignal));
+    return Err;
+  }
+
+  Event.RecordedSyncCycle = SyncCycle;
+  Event.RecordedSlot = Curr;
+
+  if (Queue->isProfilingEnabled()) {
+    OutputSignal->increaseUseCount();
+    Event.TimingSignal = OutputSignal;
+    Event.TimingAgent = Agent;
   } else {
-    // The stream is empty, everything already completed, record nothing.
-    Event.RecordedSyncCycle = -1;
-    Event.RecordedSlot = -1;
+    Event.TimingSignal = nullptr;
+    Event.TimingAgent = {0};
   }
+
+  assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
+  assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
+
   return Plugin::success();
 }
 
@@ -2124,6 +2225,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                          ClockFrequency) != HSA_STATUS_SUCCESS)
       ClockFrequency = 0;
 
+    // Retrieve the HSA system timestamp frequency for this runtime. A zero
+    // value means the frequency is unavailable.
+    if (hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
+                            &SystemTimestampFrequency) != HSA_STATUS_SUCCESS)
+      SystemTimestampFrequency = 0;
+
     // Load the grid values depending on the wavefront.
     if (WavefrontSize == 32)
       GridValues = getAMDGPUGridValues<32>();
@@ -2333,6 +2440,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Returns the clock frequency for the given AMDGPU device.
   uint64_t getClockFrequency() const override { return ClockFrequency; }
 
+  /// Returns the HSA system timestamp frequency. Zero means unavailable.
+  uint64_t getSystemTimestampFrequency() const {
+    return SystemTimestampFrequency;
+  }
+
   /// Allocate and construct an AMDGPU kernel.
   Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the AMDGPU kernel.
@@ -2813,12 +2925,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Create an event.
   Error createEventImpl(void **EventPtrStorage) override {
     AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
-    return AMDGPUEventManager.getResource(*Event);
+    if (auto Err = AMDGPUEventManager.getResource(*Event))
+      return Err;
+    return (*Event)->resetState();
   }
 
   /// Destroy a previously created event.
   Error destroyEventImpl(void *EventPtr) override {
     AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
+    assert(Event && "Invalid event");
+
+    if (auto Err = Event->resetState())
+      return Err;
+
     return AMDGPUEventManager.returnResource(Event);
   }
 
@@ -2874,8 +2993,18 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Get the elapsed time in milliseconds between two events.
   Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
                                 float *ElapsedTime) override {
-    return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet",
-                         __func__);
+    if (!ElapsedTime)
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                           "elapsed time output pointer is null");
+
+    AMDGPUEventTy *StartEvent =
+        reinterpret_cast<AMDGPUEventTy *>(StartEventPtr);
+    AMDGPUEventTy *EndEvent = reinterpret_cast<AMDGPUEventTy *>(EndEventPtr);
+
+    if (!StartEvent || !EndEvent)
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid event handle");
+
+    return StartEvent->elapsedTime(*EndEvent, *ElapsedTime);
   }
 
   /// Print information about the device.
@@ -3354,6 +3483,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// The frequency of the steady clock inside the device.
   uint64_t ClockFrequency;
 
+  /// The HSA system timestamp frequency reported by the runtime. Zero means
+  /// unavailable.
+  uint64_t SystemTimestampFrequency = 0;
+
   /// The total number of concurrent work items that can be running on the GPU.
   uint64_t HardwareParallelism;
 
@@ -3460,6 +3593,79 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
       UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
 
+Error AMDGPUEventTy::releaseTimingSignal() {
+  if (!TimingSignal)
+    return Plugin::success();
+
+  AMDGPUSignalTy *Signal = TimingSignal;
+  TimingSignal = nullptr;
+
+  if (Signal->decreaseUseCount())
+    return Device.getSignalManager().returnResource(Signal);
+
+  return Plugin::success();
+}
+
+Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
+  if (this == &EndEvent) {
+    std::lock_guard<std::mutex> Lock(Mutex);
+
+    if (!TimingSignal)
+      return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                           "event timing is not available");
+
+    if (TimingSignal->load())
+      return Plugin::error(ErrorCode::UNKNOWN, "event timing is not ready");
+
+    ElapsedTime = 0.0f;
+    return Plugin::success();
+  }
+
+  std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, EndEvent.Mutex);
+
+  if (&Device != &EndEvent.Device)
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "events belong to different devices");
+
+  if (!TimingSignal || !EndEvent.TimingSignal)
+    return Plugin::error(
+        ErrorCode::INVALID_ARGUMENT,
+        "timing information is not available for one or both events");
+
+  if (TimingSignal->load() || EndEvent.TimingSignal->load())
+    return Plugin::error(
+        ErrorCode::UNKNOWN,
+        "timing information is not ready for one or both events");
+
+  const uint64_t Frequency = Device.getSystemTimestampFrequency();
+  if (Frequency == 0)
+    return Plugin::error(ErrorCode::UNSUPPORTED,
+                         "HSA system timestamp frequency is unavailable");
+
+  hsa_amd_profiling_dispatch_time_t StartTime = {};
+  hsa_amd_profiling_dispatch_time_t StopTime = {};
+
+  hsa_status_t Status = hsa_amd_profiling_get_dispatch_time(
+      TimingAgent, TimingSignal->get(), &StartTime);
+  if (auto Err = Plugin::check(
+          Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
+    return Err;
+
+  Status = hsa_amd_profiling_get_dispatch_time(
+      EndEvent.TimingAgent, EndEvent.TimingSignal->get(), &StopTime);
+  if (auto Err = Plugin::check(
+          Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
+    return Err;
+
+  const int64_t DeltaTicks =
+      static_cast<int64_t>(StopTime.end) - static_cast<int64_t>(StartTime.end);
+
+  ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) * 1000.0 /
+                                   static_cast<double>(Frequency));
+
+  return Plugin::success();
+}
+
 /// Class implementing the AMDGPU-specific functionalities of the global
 /// handler.
 struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
diff --git a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
index 13da46a798140..aca2dccff72fe 100644
--- a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
+++ b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
@@ -143,4 +143,4 @@ TEST_P(olGetEventElapsedTimeTest, InvalidNullElapsedTime) {
   ASSERT_SUCCESS(olDestroyEvent(EndEvent));
 }
 
-} // namespace
\ No newline at end of file
+} // namespace

>From beb718615a06a87b515e4bcd90951b2fe9f5443b Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:53:56 -0300
Subject: [PATCH 3/7] Apply clang-format fixes

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/liboffload/src/OffloadImpl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 48748074c79cf..3b794092a1d23 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -889,8 +889,8 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event,
   case OL_EVENT_INFO_QUEUE:
     return Info.write<ol_queue_handle_t>(Queue);
   case OL_EVENT_INFO_IS_COMPLETE: {
-    // Some backends do not materialize backend event state. Treat such events as
-    // trivially complete.
+    // Some backends do not materialize backend event state. Treat such events
+    // as trivially complete.
     if (!Event->EventInfo)
       return Info.write<bool>(true);
 

>From 2702292e1ed1b4e0135448f98c9f18ae4b4e9dfc Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 18:57:20 -0300
Subject: [PATCH 4/7] Remove redundant validation from
 olGetEventElapsedTime_impl

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/liboffload/src/OffloadImpl.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 3b794092a1d23..0e280586f746b 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -854,14 +854,6 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
 Error olGetEventElapsedTime_impl(ol_event_handle_t StartEvent,
                                  ol_event_handle_t EndEvent,
                                  float *ElapsedTime) {
-  if (!StartEvent || !EndEvent)
-    return createOffloadError(ErrorCode::INVALID_NULL_HANDLE,
-                              "olGetEventElapsedTime was given a NULL event");
-
-  if (!ElapsedTime)
-    return createOffloadError(ErrorCode::INVALID_NULL_POINTER,
-                              "ElapsedTime is null");
-
   if (StartEvent->Device != EndEvent->Device)
     return createOffloadError(
         ErrorCode::INVALID_DEVICE,

>From 6ac00d38d75cae0097ad749dfc97ef937d40f3f4 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Tue, 17 Mar 2026 16:36:46 -0300
Subject: [PATCH 5/7] Use clearer names in elapsed-time conversion

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 58dabbc374486..6bddfe65f38ee 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3637,8 +3637,8 @@ Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
         ErrorCode::UNKNOWN,
         "timing information is not ready for one or both events");
 
-  const uint64_t Frequency = Device.getSystemTimestampFrequency();
-  if (Frequency == 0)
+  const uint64_t TicksPerSecond = Device.getSystemTimestampFrequency();
+  if (TicksPerSecond == 0)
     return Plugin::error(ErrorCode::UNSUPPORTED,
                          "HSA system timestamp frequency is unavailable");
 
@@ -3659,9 +3659,11 @@ Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
 
   const int64_t DeltaTicks =
       static_cast<int64_t>(StopTime.end) - static_cast<int64_t>(StartTime.end);
+  constexpr double MillisecondsPerSecond = 1000.0;
 
-  ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) * 1000.0 /
-                                   static_cast<double>(Frequency));
+  ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) *
+                                   MillisecondsPerSecond /
+                                   static_cast<double>(TicksPerSecond));
 
   return Plugin::success();
 }

>From 218a76626371c22883ad28d860925cfcbcb5b083 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 19 Mar 2026 13:34:30 -0300
Subject: [PATCH 6/7] Clarify event timing comments

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 6bddfe65f38ee..5072185fa072f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1718,7 +1718,7 @@ struct AMDGPUEventTy {
   Error record(AMDGPUStreamTy &Stream) {
     std::lock_guard<std::mutex> Lock(Mutex);
 
-    // Ignore the last recorded stream.
+    // Discard the previous recording and retained timing state.
     if (auto Err = resetState())
       return Err;
 
@@ -1786,12 +1786,12 @@ struct AMDGPUEventTy {
   /// The sync cycle when the stream was recorded. Used to detect stale events.
   int64_t RecordedSyncCycle;
 
-  /// The signal of the recorded barrier marker used for timing. Null means
-  /// timing is unavailable for the current recording.
+  /// The signal of the recorded timing barrier. Null means timing is
+  /// unavailable for the current recording.
   AMDGPUSignalTy *TimingSignal;
 
-  /// The agent that owns the queue where the timing marker was recorded. A zero
-  /// handle means timing is unavailable for the current recording.
+  /// The agent that owns the queue where the timing barrier was recorded. A
+  /// zero handle means timing is unavailable for the current recording.
   hsa_agent_t TimingAgent;
 
   /// Mutex to safely access event fields.

>From 10ff6db6174e57ec000ae3cc9265e9a4a0cc111a Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 19 Mar 2026 13:47:55 -0300
Subject: [PATCH 7/7] Clarify event recording comments

Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 5072185fa072f..c9aa3c66a45f8 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1703,18 +1703,18 @@ struct AMDGPUEventTy {
 
   /// Clear the current recording and retained timing state.
   Error resetState() {
-    if (auto Err = releaseTimingSignal())
-      return Err;
-
     RecordedStream = nullptr;
     RecordedSlot = -1;
     RecordedSyncCycle = -1;
     TimingAgent = {0};
 
+    if (auto Err = releaseTimingSignal())
+      return Err;
+
     return Plugin::success();
   }
 
-  /// Record the state of a stream on the event.
+  /// Record the current stream point on the event.
   Error record(AMDGPUStreamTy &Stream) {
     std::lock_guard<std::mutex> Lock(Mutex);
 
@@ -1826,8 +1826,8 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
     return Err;
   }
 
-  Event.RecordedSyncCycle = SyncCycle;
   Event.RecordedSlot = Curr;
+  Event.RecordedSyncCycle = SyncCycle;
 
   if (Queue->isProfilingEnabled()) {
     OutputSignal->increaseUseCount();