[llvm] [Offload] Add support for measuring elapsed time between events (PR #186856)
Leandro Lacerda via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 21 11:05:11 PDT 2026
https://github.com/leandrolcampos updated https://github.com/llvm/llvm-project/pull/186856
>From 1788404f910795fcfd1183393e06e184904b1aac Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:30:59 -0300
Subject: [PATCH 1/7] Add offload event timing
Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
offload/liboffload/API/Event.td | 17 +-
offload/liboffload/src/OffloadImpl.cpp | 60 ++++---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 7 +
.../common/include/PluginInterface.h | 10 ++
.../common/src/PluginInterface.cpp | 21 +++
.../cuda/dynamic_cuda/cuda.cpp | 1 +
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 1 +
offload/plugins-nextgen/cuda/src/rtl.cpp | 10 ++
offload/plugins-nextgen/host/src/rtl.cpp | 6 +
.../level_zero/include/L0Device.h | 6 +
offload/unittests/OffloadAPI/CMakeLists.txt | 1 +
.../event/olGetEventElapsedTime.cpp | 146 ++++++++++++++++++
12 files changed, 266 insertions(+), 20 deletions(-)
create mode 100644 offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
diff --git a/offload/liboffload/API/Event.td b/offload/liboffload/API/Event.td
index 075bf5bafaa64..be77500562a19 100644
--- a/offload/liboffload/API/Event.td
+++ b/offload/liboffload/API/Event.td
@@ -13,7 +13,8 @@
def olCreateEvent : Function {
let desc = "Enqueue an event to `Queue` and return it.";
let details = [
- "This event can be used with `olSyncEvent` and `olWaitEvents` and will be complete once all enqueued work prior to the `olCreateEvent` call is complete.",
+ "This event can be used with `olSyncEvent`, `olWaitEvents`, and `olGetEventElapsedTime`.",
+ "It will be complete once all enqueued work prior to the `olCreateEvent` call is complete.",
];
let params = [
Param<"ol_queue_handle_t", "Queue", "queue to create the event for", PARAM_IN>,
@@ -40,6 +41,20 @@ def olSyncEvent : Function {
let returns = [];
}
+def olGetEventElapsedTime : Function {
+ let desc = "Get the elapsed time in milliseconds between two events.";
+ let details = [
+ "Both events should have completed (for example via `olSyncEvent`) before calling this function; otherwise timing may not be available.",
+ "The queues associated with `StartEvent` and `EndEvent` must belong to the same device."
+ ];
+ let params = [
+ Param<"ol_event_handle_t", "StartEvent", "handle of the start event", PARAM_IN>,
+ Param<"ol_event_handle_t", "EndEvent", "handle of the end event", PARAM_IN>,
+ Param<"float*", "ElapsedTime", "output pointer for the elapsed time in milliseconds", PARAM_OUT>
+ ];
+ let returns = [];
+}
+
def ol_event_info_t : Enum {
let desc = "Supported event info.";
let is_typed = 1;
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index dd3ec0f61b4da..48748074c79cf 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -175,8 +175,8 @@ struct ol_event_impl_t {
ol_queue_handle_t Queue)
: EventInfo(EventInfo), Device(Device), QueueId(Queue->Id), Queue(Queue) {
}
- // EventInfo may be null, in which case the event should be considered always
- // complete
+ // Opaque backend-specific event state. This is expected to be non-null for
+ // backends that materialize real events.
void *EventInfo;
ol_device_handle_t Device;
size_t QueueId;
@@ -794,7 +794,8 @@ Error olWaitEvents_impl(ol_queue_handle_t Queue, ol_event_handle_t *Events,
return Plugin::error(ErrorCode::INVALID_NULL_HANDLE,
"olWaitEvents asked to wait on a NULL event");
- // Do nothing if the event is for this queue or the event is always complete
+ // Do nothing if the event is for this queue or the backend does not
+ // materialize event state for it.
if (Event->QueueId == Queue->Id || !Event->EventInfo)
continue;
@@ -839,7 +840,8 @@ Error olGetQueueInfoSize_impl(ol_queue_handle_t Queue, ol_queue_info_t PropName,
}
Error olSyncEvent_impl(ol_event_handle_t Event) {
- // No event info means that this event was complete on creation
+ // Some backends do not materialize backend event state. Treat such events as
+ // trivially complete.
if (!Event->EventInfo)
return Plugin::success();
@@ -849,6 +851,26 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
return Error::success();
}
+Error olGetEventElapsedTime_impl(ol_event_handle_t StartEvent,
+ ol_event_handle_t EndEvent,
+ float *ElapsedTime) {
+ if (!StartEvent || !EndEvent)
+ return createOffloadError(ErrorCode::INVALID_NULL_HANDLE,
+ "olGetEventElapsedTime was given a NULL event");
+
+ if (!ElapsedTime)
+ return createOffloadError(ErrorCode::INVALID_NULL_POINTER,
+ "ElapsedTime is null");
+
+ if (StartEvent->Device != EndEvent->Device)
+ return createOffloadError(
+ ErrorCode::INVALID_DEVICE,
+ "StartEvent and EndEvent must belong to the same device");
+
+ return StartEvent->Device->Device->getEventElapsedTime(
+ StartEvent->EventInfo, EndEvent->EventInfo, ElapsedTime);
+}
+
Error olDestroyEvent_impl(ol_event_handle_t Event) {
if (Event->EventInfo)
if (auto Res = Event->Device->Device->destroyEvent(Event->EventInfo))
@@ -867,7 +889,8 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event,
case OL_EVENT_INFO_QUEUE:
return Info.write<ol_queue_handle_t>(Queue);
case OL_EVENT_INFO_IS_COMPLETE: {
- // No event info means that this event was complete on creation
+ // Some backends do not materialize backend event state. Treat such events as
+ // trivially complete.
if (!Event->EventInfo)
return Info.write<bool>(true);
@@ -898,24 +921,23 @@ Error olGetEventInfoSize_impl(ol_event_handle_t Event, ol_event_info_t PropName,
}
Error olCreateEvent_impl(ol_queue_handle_t Queue, ol_event_handle_t *EventOut) {
- auto Pending = Queue->Device->Device->hasPendingWork(Queue->AsyncInfo);
- if (auto Err = Pending.takeError())
- return Err;
+ auto Event = std::make_unique<ol_event_impl_t>(nullptr, Queue->Device, Queue);
- *EventOut = new ol_event_impl_t(nullptr, Queue->Device, Queue);
- if (!*Pending)
- // Queue is empty, don't record an event and consider the event always
- // complete
- return Plugin::success();
+ if (auto Err = Queue->Device->Device->createEvent(&Event->EventInfo))
+ return Err;
- if (auto Res = Queue->Device->Device->createEvent(&(*EventOut)->EventInfo))
- return Res;
+ if (auto Err = Queue->Device->Device->recordEvent(Event->EventInfo,
+ Queue->AsyncInfo)) {
+ if (Event->EventInfo)
+ if (auto DestroyErr =
+ Queue->Device->Device->destroyEvent(Event->EventInfo))
+ return joinErrors(std::move(Err), std::move(DestroyErr));
- if (auto Res = Queue->Device->Device->recordEvent((*EventOut)->EventInfo,
- Queue->AsyncInfo))
- return Res;
+ return Err;
+ }
- return Plugin::success();
+ *EventOut = Event.release();
+ return Error::success();
}
Error olMemcpy_impl(ol_queue_handle_t Queue, void *DstPtr,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 3f9064eaf00b3..70aff9f43c5b0 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2871,6 +2871,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Event->sync();
}
+ /// Get the elapsed time in milliseconds between two events.
+ Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+ float *ElapsedTime) override {
+ return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet",
+ __func__);
+ }
+
/// Print information about the device.
Expected<InfoTreeNode> obtainInfoImpl() override {
char TmpChar[1000];
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index b6a54f05b1dcc..04a4a78a24ba9 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -994,6 +994,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error syncEvent(void *EventPtr);
virtual Error syncEventImpl(void *EventPtr) = 0;
+ /// Get the elapsed time in milliseconds between two events.
+ Error getEventElapsedTime(void *StartEventPtr, void *EndEventPtr,
+ float *ElapsedTime);
+ virtual Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+ float *ElapsedTime) = 0;
+
/// Obtain information about the device.
Expected<InfoTreeNode> obtainInfo();
virtual Expected<InfoTreeNode> obtainInfoImpl() = 0;
@@ -1543,6 +1549,10 @@ struct GenericPluginTy {
/// Synchronize execution until an event is done.
int32_t sync_event(int32_t DeviceId, void *EventPtr);
+ /// Get the elapsed time in milliseconds between two events.
+ int32_t get_event_elapsed_time(int32_t DeviceId, void *StartEventPtr,
+ void *EndEventPtr, float *ElapsedTime);
+
/// Remove the event from the plugin.
int32_t destroy_event(int32_t DeviceId, void *EventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 4093d08044bc3..6cc462e6162dd 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1568,6 +1568,12 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
return syncEventImpl(EventPtr);
}
+Error GenericDeviceTy::getEventElapsedTime(void *StartEventPtr,
+ void *EndEventPtr,
+ float *ElapsedTime) {
+ return getEventElapsedTimeImpl(StartEventPtr, EndEventPtr, ElapsedTime);
+}
+
bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
Expected<bool> GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
@@ -2087,6 +2093,21 @@ int32_t GenericPluginTy::sync_event(int32_t DeviceId, void *EventPtr) {
return OFFLOAD_SUCCESS;
}
+int32_t GenericPluginTy::get_event_elapsed_time(int32_t DeviceId,
+ void *StartEventPtr,
+ void *EndEventPtr,
+ float *ElapsedTime) {
+ auto Err = getDevice(DeviceId).getEventElapsedTime(StartEventPtr, EndEventPtr,
+ ElapsedTime);
+ if (Err) {
+ REPORT() << "Failure to get elapsed time between events " << StartEventPtr
+ << " and " << EndEventPtr << ": " << toString(std::move(Err));
+ return OFFLOAD_FAIL;
+ }
+
+ return OFFLOAD_SUCCESS;
+}
+
int32_t GenericPluginTy::destroy_event(int32_t DeviceId, void *EventPtr) {
auto Err = getDevice(DeviceId).destroyEvent(EventPtr);
if (Err) {
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 80e3e418ae3fa..8fc8d0e43fab2 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -97,6 +97,7 @@ DLWRAP(cuEventRecord, 2)
DLWRAP(cuEventQuery, 1)
DLWRAP(cuStreamWaitEvent, 3)
DLWRAP(cuEventSynchronize, 1)
+DLWRAP(cuEventElapsedTime, 3)
DLWRAP(cuEventDestroy, 1)
DLWRAP_FINALIZE()
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index fa4f4634ecec3..dd47fb98dc03a 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -371,6 +371,7 @@ CUresult cuEventRecord(CUevent, CUstream);
CUresult cuEventQuery(CUevent);
CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
CUresult cuEventSynchronize(CUevent);
+CUresult cuEventElapsedTime(float *, CUevent, CUevent);
CUresult cuEventDestroy(CUevent);
CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 4de754265ea77..6d61da4fdc17b 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1086,6 +1086,16 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::check(Res, "error in cuEventSynchronize: %s");
}
+ /// Get the elapsed time in milliseconds between two events.
+ Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+ float *ElapsedTime) override {
+ CUevent StartEvent = reinterpret_cast<CUevent>(StartEventPtr);
+ CUevent EndEvent = reinterpret_cast<CUevent>(EndEventPtr);
+
+ CUresult Res = cuEventElapsedTime(ElapsedTime, StartEvent, EndEvent);
+ return Plugin::check(Res, "error in cuEventElapsedTime: %s");
+ }
+
/// Print information about the device.
Expected<InfoTreeNode> obtainInfoImpl() override {
char TmpChar[1000];
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 077dd14b959e0..b90478a520b39 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -360,6 +360,12 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return true;
}
Error syncEventImpl(void *EventPtr) override { return Plugin::success(); }
+ Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+ float *ElapsedTime) override {
+ if (ElapsedTime)
+ *ElapsedTime = 0.0f;
+ return Plugin::success();
+ }
/// Print information about the device.
Expected<InfoTreeNode> obtainInfoImpl() override {
diff --git a/offload/plugins-nextgen/level_zero/include/L0Device.h b/offload/plugins-nextgen/level_zero/include/L0Device.h
index 001a41ba77d7b..19a88ca0b70ad 100644
--- a/offload/plugins-nextgen/level_zero/include/L0Device.h
+++ b/offload/plugins-nextgen/level_zero/include/L0Device.h
@@ -626,6 +626,12 @@ class L0DeviceTy final : public GenericDeviceTy {
__func__);
}
+ Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
+ float *ElapsedTime) override {
+ return Plugin::error(error::ErrorCode::UNIMPLEMENTED,
+ "%s not implemented yet\n", __func__);
+ }
+
Expected<InfoTreeNode> obtainInfoImpl() override;
uint64_t getClockFrequency() const override { return getClockRate(); }
uint64_t getHardwareParallelism() const override { return getTotalThreads(); }
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 031dbea660fbc..39863391f27d6 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -13,6 +13,7 @@ add_offload_unittest("event"
event/olCreateEvent.cpp
event/olDestroyEvent.cpp
event/olSyncEvent.cpp
+ event/olGetEventElapsedTime.cpp
event/olGetEventInfo.cpp
event/olGetEventInfoSize.cpp)
diff --git a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
new file mode 100644
index 0000000000000..13da46a798140
--- /dev/null
+++ b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
@@ -0,0 +1,146 @@
+//===------- Offload API tests - olGetEventElapsedTime --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include "llvm/Support/MemoryBuffer.h"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+namespace {
+
+struct olGetEventElapsedTimeTest : OffloadQueueTest {
+ void SetUp() override {
+ RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
+
+ ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin));
+ ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
+ DeviceBin->getBufferSize(), &Program));
+ ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &Kernel));
+
+ LaunchArgs.Dimensions = 1;
+ LaunchArgs.GroupSize = {64, 1, 1};
+ LaunchArgs.NumGroups = {1, 1, 1};
+ LaunchArgs.DynSharedMemory = 0;
+
+ ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+ LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
+ }
+
+ void TearDown() override {
+ if (Mem)
+ ASSERT_SUCCESS(olMemFree(Mem));
+ if (Program)
+ ASSERT_SUCCESS(olDestroyProgram(Program));
+ RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
+ }
+
+ void launchFoo() {
+ struct {
+ void *Mem;
+ } Args{Mem};
+
+ ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+ &LaunchArgs));
+ }
+
+ std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+ ol_program_handle_t Program = nullptr;
+ ol_symbol_handle_t Kernel = nullptr;
+ ol_kernel_launch_size_args_t LaunchArgs{};
+ void *Mem = nullptr;
+};
+
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetEventElapsedTimeTest);
+
+TEST_P(olGetEventElapsedTimeTest, Success) {
+ ol_event_handle_t StartEvent = nullptr;
+ ol_event_handle_t EndEvent = nullptr;
+
+ ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+ ASSERT_NE(StartEvent, nullptr);
+
+ ASSERT_NO_FATAL_FAILURE(launchFoo());
+
+ ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+ ASSERT_NE(EndEvent, nullptr);
+
+ ASSERT_SUCCESS(olSyncEvent(EndEvent));
+
+ float ElapsedTime = -1.0f;
+
+ ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTime));
+ ASSERT_GE(ElapsedTime, 0.0f);
+
+ ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+ ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, SuccessMultipleCalls) {
+ ol_event_handle_t StartEvent = nullptr;
+ ol_event_handle_t EndEvent = nullptr;
+
+ ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+ ASSERT_NE(StartEvent, nullptr);
+
+ ASSERT_NO_FATAL_FAILURE(launchFoo());
+
+ ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+ ASSERT_NE(EndEvent, nullptr);
+
+ ASSERT_SUCCESS(olSyncEvent(EndEvent));
+
+ float ElapsedTimeA = -1.0f;
+ float ElapsedTimeB = -1.0f;
+
+ ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTimeA));
+ ASSERT_SUCCESS(olGetEventElapsedTime(StartEvent, EndEvent, &ElapsedTimeB));
+
+ ASSERT_GE(ElapsedTimeA, 0.0f);
+ ASSERT_GE(ElapsedTimeB, 0.0f);
+
+ ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+ ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullStartEvent) {
+ ol_event_handle_t EndEvent = nullptr;
+ ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+
+ float ElapsedTime = 0.0f;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olGetEventElapsedTime(nullptr, EndEvent, &ElapsedTime));
+
+ ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullEndEvent) {
+ ol_event_handle_t StartEvent = nullptr;
+ ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+
+ float ElapsedTime = 0.0f;
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+ olGetEventElapsedTime(StartEvent, nullptr, &ElapsedTime));
+
+ ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+}
+
+TEST_P(olGetEventElapsedTimeTest, InvalidNullElapsedTime) {
+ ol_event_handle_t StartEvent = nullptr;
+ ol_event_handle_t EndEvent = nullptr;
+
+ ASSERT_SUCCESS(olCreateEvent(Queue, &StartEvent));
+ ASSERT_SUCCESS(olCreateEvent(Queue, &EndEvent));
+
+ ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+ olGetEventElapsedTime(StartEvent, EndEvent, nullptr));
+
+ ASSERT_SUCCESS(olDestroyEvent(StartEvent));
+ ASSERT_SUCCESS(olDestroyEvent(EndEvent));
+}
+
+} // namespace
\ No newline at end of file
>From e1743262c9667c0306526baae0c1ae524b27c47c Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:36:12 -0300
Subject: [PATCH 2/7] Add event elapsed-time support for AMDGPU plugin
Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
.../amdgpu/dynamic_hsa/hsa.cpp | 2 +
.../plugins-nextgen/amdgpu/dynamic_hsa/hsa.h | 2 +
.../amdgpu/dynamic_hsa/hsa_ext_amd.h | 12 +
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 258 ++++++++++++++++--
.../event/olGetEventElapsedTime.cpp | 2 +-
5 files changed, 249 insertions(+), 27 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
index 37d12861eb387..279a296dd1618 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp
@@ -70,6 +70,8 @@ DLWRAP(hsa_amd_register_system_event_handler, 2)
DLWRAP(hsa_amd_signal_create, 5)
DLWRAP(hsa_amd_signal_async_handler, 5)
DLWRAP(hsa_amd_pointer_info, 5)
+DLWRAP(hsa_amd_profiling_get_dispatch_time, 3)
+DLWRAP(hsa_amd_profiling_set_profiler_enabled, 2)
DLWRAP(hsa_code_object_reader_create_from_memory, 3)
DLWRAP(hsa_code_object_reader_destroy, 1)
DLWRAP(hsa_executable_load_agent_code_object, 5)
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
index ad135f72fff12..f6e3337ddb3f4 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h
@@ -99,6 +99,8 @@ typedef enum {
typedef enum {
HSA_SYSTEM_INFO_VERSION_MAJOR = 0,
HSA_SYSTEM_INFO_VERSION_MINOR = 1,
+ HSA_SYSTEM_INFO_TIMESTAMP = 2,
+ HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY = 3,
} hsa_system_info_t;
typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index ddfa65c76cf2d..7ff77f8e2a2fa 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -169,6 +169,18 @@ hsa_status_t hsa_amd_pointer_info(const void* ptr,
uint32_t* num_agents_accessible,
hsa_agent_t** accessible);
+typedef struct hsa_amd_profiling_dispatch_time_s {
+ uint64_t start;
+ uint64_t end;
+} hsa_amd_profiling_dispatch_time_t;
+
+hsa_status_t
+hsa_amd_profiling_get_dispatch_time(hsa_agent_t agent, hsa_signal_t signal,
+ hsa_amd_profiling_dispatch_time_t *time);
+
+hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t *queue,
+ int enable);
+
#ifdef __cplusplus
}
#endif
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 70aff9f43c5b0..58dabbc374486 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -714,16 +714,32 @@ using AMDGPUSignalManagerTy = GenericDeviceResourceManagerTy<AMDGPUSignalRef>;
/// Class holding an HSA queue to submit kernel and barrier packets.
struct AMDGPUQueueTy {
/// Create an empty queue.
- AMDGPUQueueTy() : Queue(nullptr), Mutex(), NumUsers(0) {}
+ AMDGPUQueueTy()
+ : Queue(nullptr), ProfilingEnabled(false), Mutex(), NumUsers(0) {}
/// Lazily initialize a new queue belonging to a specific agent.
Error init(GenericDeviceTy &Device, hsa_agent_t Agent, int32_t QueueSize) {
if (Queue)
return Plugin::success();
+
hsa_status_t Status =
hsa_queue_create(Agent, QueueSize, HSA_QUEUE_TYPE_MULTI, callbackError,
&Device, UINT32_MAX, UINT32_MAX, &Queue);
- return Plugin::check(Status, "error in hsa_queue_create: %s");
+ if (auto Err = Plugin::check(Status, "error in hsa_queue_create: %s"))
+ return Err;
+
+ // Keep queue profiling enabled from creation time onward, as HIP/ROCclr
+ // does. Elapsed-time queries rely on queue-level hardware profiling support
+ // to retrieve packet timing.
+ //
+ // If enabling profiling fails, leave the queue usable and preserve existing
+ // event behavior; only elapsed-time queries will later report that timing
+ // is unavailable.
+ Status = hsa_amd_profiling_set_profiler_enabled(Queue, 1);
+ if (Status == HSA_STATUS_SUCCESS)
+ ProfilingEnabled = true;
+
+ return Plugin::success();
}
/// Deinitialize the queue and destroy its resources.
@@ -731,10 +747,16 @@ struct AMDGPUQueueTy {
std::lock_guard<std::mutex> Lock(Mutex);
if (!Queue)
return Plugin::success();
+
+ ProfilingEnabled = false;
+
hsa_status_t Status = hsa_queue_destroy(Queue);
return Plugin::check(Status, "error in hsa_queue_destroy: %s");
}
+ /// Returns whether profiling is enabled on the underlying HSA queue.
+ bool isProfilingEnabled() const { return ProfilingEnabled; }
+
/// Returns the number of streams, this queue is currently assigned to.
bool getUserCount() const { return NumUsers; }
@@ -917,6 +939,9 @@ struct AMDGPUQueueTy {
/// The HSA queue.
hsa_queue_t *Queue;
+ /// Indicates whether profiling is enabled on the underlying HSA queue.
+ bool ProfilingEnabled;
+
/// Mutex to protect the acquiring and publishing of packets. For the moment,
/// we need this mutex to prevent publishing packets that are not ready to be
/// published in a multi-thread scenario. Without a queue lock, a thread T1
@@ -1144,6 +1169,18 @@ struct AMDGPUStreamTy {
return {Curr, InputSignal};
}
+ /// Roll back the last consumed slot after a submission failure so the stream
+ /// does not retain a slot for an operation that was never enqueued.
+ void rollbackConsumedSlot(uint32_t Slot) {
+ assert(NextSlot > 0 && "cannot roll back an empty stream");
+ assert(Slot + 1 == NextSlot && "can only roll back the last consumed slot");
+
+ Slots[Slot].Signal = nullptr;
+ Slots[Slot].Callbacks.clear();
+ Slots[Slot].ActionArgs.clear();
+ --NextSlot;
+ }
+
/// Complete all pending post actions and reset the stream after synchronizing
/// or positively querying the stream.
Error complete() {
@@ -1643,8 +1680,8 @@ struct AMDGPUStreamTy {
const AMDGPUQueueTy *getQueue() const { return Queue; }
- /// Record the state of the stream on an event.
- Error recordEvent(AMDGPUEventTy &Event) const;
+ /// Record an event by enqueuing a barrier marker packet on the stream.
+ Error recordEvent(AMDGPUEventTy &Event);
/// Make the stream wait on an event.
Error waitEvent(const AMDGPUEventTy &Event);
@@ -1652,25 +1689,47 @@ struct AMDGPUStreamTy {
friend struct AMDGPUStreamManagerTy;
};
-/// Class representing an event on AMDGPU. The event basically stores some
-/// information regarding the state of the recorded stream.
+/// Class representing an event on AMDGPU. The event stores the recorded stream
+/// point and retained timing state.
struct AMDGPUEventTy {
/// Create an empty event.
AMDGPUEventTy(AMDGPUDeviceTy &Device)
- : RecordedStream(nullptr), RecordedSlot(-1), RecordedSyncCycle(-1) {}
+ : Device(Device), RecordedStream(nullptr), RecordedSlot(-1),
+ RecordedSyncCycle(-1), TimingSignal(nullptr), TimingAgent({0}) {}
/// Initialize and deinitialize.
- Error init() { return Plugin::success(); }
- Error deinit() { return Plugin::success(); }
+ Error init() { return resetState(); }
+ Error deinit() { return resetState(); }
+
+ /// Clear the current recording and retained timing state.
+ Error resetState() {
+ if (auto Err = releaseTimingSignal())
+ return Err;
+
+ RecordedStream = nullptr;
+ RecordedSlot = -1;
+ RecordedSyncCycle = -1;
+ TimingAgent = {0};
+
+ return Plugin::success();
+ }
/// Record the state of a stream on the event.
Error record(AMDGPUStreamTy &Stream) {
std::lock_guard<std::mutex> Lock(Mutex);
// Ignore the last recorded stream.
+ if (auto Err = resetState())
+ return Err;
+
RecordedStream = &Stream;
- return Stream.recordEvent(*this);
+ if (auto Err = Stream.recordEvent(*this)) {
+ llvm::consumeError(resetState());
+ return Err;
+ }
+
+ return Plugin::success();
}
/// Make a stream wait on the current event.
@@ -1708,38 +1767,80 @@ struct AMDGPUEventTy {
return RecordedStream->synchronizeOn(*this);
}
+ /// Return the elapsed time in milliseconds between this event and EndEvent.
+ Error elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime);
+
protected:
+ /// Release the retained timing signal, if any, back to the signal manager.
+ Error releaseTimingSignal();
+
+ /// The device that owns this event.
+ AMDGPUDeviceTy &Device;
+
/// The stream registered in this event.
AMDGPUStreamTy *RecordedStream;
- /// The recordered operation on the recorded stream.
+ /// The recorded operation on the recorded stream.
int64_t RecordedSlot;
/// The sync cycle when the stream was recorded. Used to detect stale events.
int64_t RecordedSyncCycle;
+ /// The signal of the recorded barrier marker used for timing. Null means
+ /// timing is unavailable for the current recording.
+ AMDGPUSignalTy *TimingSignal;
+
+ /// The agent that owns the queue where the timing marker was recorded. A zero
+ /// handle means timing is unavailable for the current recording.
+ hsa_agent_t TimingAgent;
+
/// Mutex to safely access event fields.
mutable std::mutex Mutex;
friend struct AMDGPUStreamTy;
};
-Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
- std::lock_guard<std::mutex> Lock(Mutex);
+Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
+ if (Queue == nullptr)
+ return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
+ "target queue was nullptr");
+
+ // Retrieve an available signal for the operation's output.
+ AMDGPUSignalTy *OutputSignal = nullptr;
+ if (auto Err = SignalManager.getResource(OutputSignal))
+ return Err;
+ OutputSignal->reset();
+ OutputSignal->increaseUseCount();
- if (size() > 0) {
- // Record the synchronize identifier (to detect stale recordings) and
- // the last valid stream's operation.
- Event.RecordedSyncCycle = SyncCycle;
- Event.RecordedSlot = last();
+ std::lock_guard<std::mutex> StreamLock(Mutex);
- assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
- assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
+ // Consume stream slot and compute dependencies.
+ auto [Curr, InputSignal] = consume(OutputSignal);
+
+ // Materialize the event as a real marker on the queue. Elapsed-time queries
+ // need a packet-backed completion signal to retrieve dispatch timing.
+ if (auto Err = Queue->pushBarrier(OutputSignal, InputSignal, nullptr)) {
+ rollbackConsumedSlot(Curr);
+ if (OutputSignal->decreaseUseCount())
+ llvm::consumeError(SignalManager.returnResource(OutputSignal));
+ return Err;
+ }
+
+ Event.RecordedSyncCycle = SyncCycle;
+ Event.RecordedSlot = Curr;
+
+ if (Queue->isProfilingEnabled()) {
+ OutputSignal->increaseUseCount();
+ Event.TimingSignal = OutputSignal;
+ Event.TimingAgent = Agent;
} else {
- // The stream is empty, everything already completed, record nothing.
- Event.RecordedSyncCycle = -1;
- Event.RecordedSlot = -1;
+ Event.TimingSignal = nullptr;
+ Event.TimingAgent = {0};
}
+
+ assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
+ assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
+
return Plugin::success();
}
@@ -2124,6 +2225,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
ClockFrequency) != HSA_STATUS_SUCCESS)
ClockFrequency = 0;
+ // Retrieve the HSA system timestamp frequency for this runtime. A zero
+ // value means the frequency is unavailable.
+ if (hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
+ &SystemTimestampFrequency) != HSA_STATUS_SUCCESS)
+ SystemTimestampFrequency = 0;
+
// Load the grid values depending on the wavefront.
if (WavefrontSize == 32)
GridValues = getAMDGPUGridValues<32>();
@@ -2333,6 +2440,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Returns the clock frequency for the given AMDGPU device.
uint64_t getClockFrequency() const override { return ClockFrequency; }
+ /// Returns the HSA system timestamp frequency. Zero means unavailable.
+ uint64_t getSystemTimestampFrequency() const {
+ return SystemTimestampFrequency;
+ }
+
/// Allocate and construct an AMDGPU kernel.
Expected<GenericKernelTy &> constructKernel(const char *Name) override {
// Allocate and construct the AMDGPU kernel.
@@ -2813,12 +2925,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Create an event.
Error createEventImpl(void **EventPtrStorage) override {
AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
- return AMDGPUEventManager.getResource(*Event);
+ if (auto Err = AMDGPUEventManager.getResource(*Event))
+ return Err;
+ return (*Event)->resetState();
}
/// Destroy a previously created event.
Error destroyEventImpl(void *EventPtr) override {
AMDGPUEventTy *Event = reinterpret_cast<AMDGPUEventTy *>(EventPtr);
+ assert(Event && "Invalid event");
+
+ if (auto Err = Event->resetState())
+ return Err;
+
return AMDGPUEventManager.returnResource(Event);
}
@@ -2874,8 +2993,18 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Get the elapsed time in milliseconds between two events.
Error getEventElapsedTimeImpl(void *StartEventPtr, void *EndEventPtr,
float *ElapsedTime) override {
- return Plugin::error(ErrorCode::UNIMPLEMENTED, "%s not implemented yet",
- __func__);
+ if (!ElapsedTime)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "elapsed time output pointer is null");
+
+ AMDGPUEventTy *StartEvent =
+ reinterpret_cast<AMDGPUEventTy *>(StartEventPtr);
+ AMDGPUEventTy *EndEvent = reinterpret_cast<AMDGPUEventTy *>(EndEventPtr);
+
+ if (!StartEvent || !EndEvent)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT, "invalid event handle");
+
+ return StartEvent->elapsedTime(*EndEvent, *ElapsedTime);
}
/// Print information about the device.
@@ -3354,6 +3483,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The frequency of the steady clock inside the device.
uint64_t ClockFrequency;
+ /// The HSA system timestamp frequency reported by the runtime. Zero means
+ /// unavailable.
+ uint64_t SystemTimestampFrequency = 0;
+
/// The total number of concurrent work items that can be running on the GPU.
uint64_t HardwareParallelism;
@@ -3460,6 +3593,79 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
+Error AMDGPUEventTy::releaseTimingSignal() {
+ if (!TimingSignal)
+ return Plugin::success();
+
+ AMDGPUSignalTy *Signal = TimingSignal;
+ TimingSignal = nullptr;
+
+ if (Signal->decreaseUseCount())
+ return Device.getSignalManager().returnResource(Signal);
+
+ return Plugin::success();
+}
+
+Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
+ if (this == &EndEvent) {
+ std::lock_guard<std::mutex> Lock(Mutex);
+
+ if (!TimingSignal)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "event timing is not available");
+
+ if (TimingSignal->load())
+ return Plugin::error(ErrorCode::UNKNOWN, "event timing is not ready");
+
+ ElapsedTime = 0.0f;
+ return Plugin::success();
+ }
+
+ std::scoped_lock<std::mutex, std::mutex> Lock(Mutex, EndEvent.Mutex);
+
+ if (&Device != &EndEvent.Device)
+ return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+ "events belong to different devices");
+
+ if (!TimingSignal || !EndEvent.TimingSignal)
+ return Plugin::error(
+ ErrorCode::INVALID_ARGUMENT,
+ "timing information is not available for one or both events");
+
+ if (TimingSignal->load() || EndEvent.TimingSignal->load())
+ return Plugin::error(
+ ErrorCode::UNKNOWN,
+ "timing information is not ready for one or both events");
+
+ const uint64_t Frequency = Device.getSystemTimestampFrequency();
+ if (Frequency == 0)
+ return Plugin::error(ErrorCode::UNSUPPORTED,
+ "HSA system timestamp frequency is unavailable");
+
+ hsa_amd_profiling_dispatch_time_t StartTime = {};
+ hsa_amd_profiling_dispatch_time_t StopTime = {};
+
+ hsa_status_t Status = hsa_amd_profiling_get_dispatch_time(
+ TimingAgent, TimingSignal->get(), &StartTime);
+ if (auto Err = Plugin::check(
+ Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
+ return Err;
+
+ Status = hsa_amd_profiling_get_dispatch_time(
+ EndEvent.TimingAgent, EndEvent.TimingSignal->get(), &StopTime);
+ if (auto Err = Plugin::check(
+ Status, "error in hsa_amd_profiling_get_dispatch_time: %s"))
+ return Err;
+
+ const int64_t DeltaTicks =
+ static_cast<int64_t>(StopTime.end) - static_cast<int64_t>(StartTime.end);
+
+ ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) * 1000.0 /
+ static_cast<double>(Frequency));
+
+ return Plugin::success();
+}
+
/// Class implementing the AMDGPU-specific functionalities of the global
/// handler.
struct AMDGPUGlobalHandlerTy final : public GenericGlobalHandlerTy {
diff --git a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
index 13da46a798140..aca2dccff72fe 100644
--- a/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
+++ b/offload/unittests/OffloadAPI/event/olGetEventElapsedTime.cpp
@@ -143,4 +143,4 @@ TEST_P(olGetEventElapsedTimeTest, InvalidNullElapsedTime) {
ASSERT_SUCCESS(olDestroyEvent(EndEvent));
}
-} // namespace
\ No newline at end of file
+} // namespace
>From beb718615a06a87b515e4bcd90951b2fe9f5443b Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 16:53:56 -0300
Subject: [PATCH 3/7] Apply clang-format fixes
Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
offload/liboffload/src/OffloadImpl.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 48748074c79cf..3b794092a1d23 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -889,8 +889,8 @@ Error olGetEventInfoImplDetail(ol_event_handle_t Event,
case OL_EVENT_INFO_QUEUE:
return Info.write<ol_queue_handle_t>(Queue);
case OL_EVENT_INFO_IS_COMPLETE: {
- // Some backends do not materialize backend event state. Treat such events as
- // trivially complete.
+ // Some backends do not materialize backend event state. Treat such events
+ // as trivially complete.
if (!Event->EventInfo)
return Info.write<bool>(true);
>From 2702292e1ed1b4e0135448f98c9f18ae4b4e9dfc Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Mon, 16 Mar 2026 18:57:20 -0300
Subject: [PATCH 4/7] Remove redundant validation from
olGetEventElapsedTime_impl
Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
offload/liboffload/src/OffloadImpl.cpp | 8 --------
1 file changed, 8 deletions(-)
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 3b794092a1d23..0e280586f746b 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -854,14 +854,6 @@ Error olSyncEvent_impl(ol_event_handle_t Event) {
Error olGetEventElapsedTime_impl(ol_event_handle_t StartEvent,
ol_event_handle_t EndEvent,
float *ElapsedTime) {
- if (!StartEvent || !EndEvent)
- return createOffloadError(ErrorCode::INVALID_NULL_HANDLE,
- "olGetEventElapsedTime was given a NULL event");
-
- if (!ElapsedTime)
- return createOffloadError(ErrorCode::INVALID_NULL_POINTER,
- "ElapsedTime is null");
-
if (StartEvent->Device != EndEvent->Device)
return createOffloadError(
ErrorCode::INVALID_DEVICE,
>From 6ac00d38d75cae0097ad749dfc97ef937d40f3f4 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Tue, 17 Mar 2026 16:36:46 -0300
Subject: [PATCH 5/7] Use clearer names in elapsed-time conversion
Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 58dabbc374486..6bddfe65f38ee 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3637,8 +3637,8 @@ Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
ErrorCode::UNKNOWN,
"timing information is not ready for one or both events");
- const uint64_t Frequency = Device.getSystemTimestampFrequency();
- if (Frequency == 0)
+ const uint64_t TicksPerSecond = Device.getSystemTimestampFrequency();
+ if (TicksPerSecond == 0)
return Plugin::error(ErrorCode::UNSUPPORTED,
"HSA system timestamp frequency is unavailable");
@@ -3659,9 +3659,11 @@ Error AMDGPUEventTy::elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime) {
const int64_t DeltaTicks =
static_cast<int64_t>(StopTime.end) - static_cast<int64_t>(StartTime.end);
+ constexpr double MillisecondsPerSecond = 1000.0;
- ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) * 1000.0 /
- static_cast<double>(Frequency));
+ ElapsedTime = static_cast<float>(static_cast<double>(DeltaTicks) *
+ MillisecondsPerSecond /
+ static_cast<double>(TicksPerSecond));
return Plugin::success();
}
>From 218a76626371c22883ad28d860925cfcbcb5b083 Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 19 Mar 2026 13:34:30 -0300
Subject: [PATCH 6/7] Clarify event timing comments
Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 6bddfe65f38ee..5072185fa072f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1718,7 +1718,7 @@ struct AMDGPUEventTy {
Error record(AMDGPUStreamTy &Stream) {
std::lock_guard<std::mutex> Lock(Mutex);
- // Ignore the last recorded stream.
+ // Discard the previous recording and retained timing state.
if (auto Err = resetState())
return Err;
@@ -1786,12 +1786,12 @@ struct AMDGPUEventTy {
/// The sync cycle when the stream was recorded. Used to detect stale events.
int64_t RecordedSyncCycle;
- /// The signal of the recorded barrier marker used for timing. Null means
- /// timing is unavailable for the current recording.
+ /// The signal of the recorded timing barrier. Null means timing is
+ /// unavailable for the current recording.
AMDGPUSignalTy *TimingSignal;
- /// The agent that owns the queue where the timing marker was recorded. A zero
- /// handle means timing is unavailable for the current recording.
+ /// The agent that owns the queue where the timing barrier was recorded. A
+ /// zero handle means timing is unavailable for the current recording.
hsa_agent_t TimingAgent;
/// Mutex to safely access event fields.
>From 10ff6db6174e57ec000ae3cc9265e9a4a0cc111a Mon Sep 17 00:00:00 2001
From: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
Date: Thu, 19 Mar 2026 13:47:55 -0300
Subject: [PATCH 7/7] Clarify event recording comments
Signed-off-by: Leandro Augusto Lacerda Campos <leandrolcampos at yahoo.com.br>
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 5072185fa072f..c9aa3c66a45f8 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1703,18 +1703,18 @@ struct AMDGPUEventTy {
/// Clear the current recording and retained timing state.
Error resetState() {
- if (auto Err = releaseTimingSignal())
- return Err;
-
RecordedStream = nullptr;
RecordedSlot = -1;
RecordedSyncCycle = -1;
TimingAgent = {0};
+ if (auto Err = releaseTimingSignal())
+ return Err;
+
return Plugin::success();
}
- /// Record the state of a stream on the event.
+ /// Record the current stream point on the event.
Error record(AMDGPUStreamTy &Stream) {
std::lock_guard<std::mutex> Lock(Mutex);
@@ -1826,8 +1826,8 @@ Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
return Err;
}
- Event.RecordedSyncCycle = SyncCycle;
Event.RecordedSlot = Curr;
+ Event.RecordedSyncCycle = SyncCycle;
if (Queue->isProfilingEnabled()) {
OutputSignal->increaseUseCount();
More information about the llvm-commits
mailing list