[llvm] [Offload] Add support for measuring elapsed time between events (PR #186856)

Thu Mar 26 21:21:08 PDT 2026

================
@@ -1708,38 +1767,80 @@ struct AMDGPUEventTy {
     return RecordedStream->synchronizeOn(*this);
   }
 
+  /// Return the elapsed time in milliseconds between this event and EndEvent.
+  Error elapsedTime(AMDGPUEventTy &EndEvent, float &ElapsedTime);
+
 protected:
+  /// Release the retained timing signal, if any, back to the signal manager.
+  Error releaseTimingSignal();
+
+  /// The device that owns this event.
+  AMDGPUDeviceTy &Device;
+
   /// The stream registered in this event.
   AMDGPUStreamTy *RecordedStream;
 
-  /// The recordered operation on the recorded stream.
+  /// The recorded operation on the recorded stream.
   int64_t RecordedSlot;
 
   /// The sync cycle when the stream was recorded. Used to detect stale events.
   int64_t RecordedSyncCycle;
 
+  /// The signal of the recorded timing barrier. Null means timing is
+  /// unavailable for the current recording.
+  AMDGPUSignalTy *TimingSignal;
+
+  /// The agent that owns the queue where the timing barrier was recorded. A
+  /// zero handle means timing is unavailable for the current recording.
+  hsa_agent_t TimingAgent;
+
   /// Mutex to safely access event fields.
   mutable std::mutex Mutex;
 
   friend struct AMDGPUStreamTy;
 };
 
-Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
-  std::lock_guard<std::mutex> Lock(Mutex);
+Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
+  if (Queue == nullptr)
+    return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
+                         "target queue was nullptr");
+
+  // Retrieve an available signal for the operation's output.
+  AMDGPUSignalTy *OutputSignal = nullptr;
+  if (auto Err = SignalManager.getResource(OutputSignal))
+    return Err;
+  OutputSignal->reset();
+  OutputSignal->increaseUseCount();
+
+  std::lock_guard<std::mutex> StreamLock(Mutex);
+
+  // Consume stream slot and compute dependencies.
+  auto [Curr, InputSignal] = consume(OutputSignal);
+
+  // Materialize the event as a real marker on the queue. Elapsed-time queries
+  // need a packet-backed completion signal to retrieve dispatch timing.
+  if (auto Err = Queue->pushBarrier(OutputSignal, InputSignal, nullptr)) {
+    rollbackConsumedSlot(Curr);
+    if (OutputSignal->decreaseUseCount())
+      llvm::consumeError(SignalManager.returnResource(OutputSignal));
+    return Err;
+  }
 
-  if (size() > 0) {
-    // Record the synchronize identifier (to detect stale recordings) and
-    // the last valid stream's operation.
-    Event.RecordedSyncCycle = SyncCycle;
-    Event.RecordedSlot = last();
+  Event.RecordedSlot = Curr;
+  Event.RecordedSyncCycle = SyncCycle;
 
-    assert(Event.RecordedSyncCycle >= 0 && "Invalid recorded sync cycle");
-    assert(Event.RecordedSlot >= 0 && "Invalid recorded slot");
+  if (Queue->isProfilingEnabled()) {
+    OutputSignal->increaseUseCount();
+    Event.TimingSignal = OutputSignal;
+    Event.TimingAgent = Agent;
----------------
leandrolcampos wrote:

Yes, this is expected to be the same agent here.

Previously, `TimingAgent` stored the agent of the queue where the timing barrier was recorded, which came from the associated stream. After looking at the code more closely, that agent is expected to match the agent of the event's owning device, and the event already retains its `AMDGPUDeviceTy`.

So I removed `TimingAgent` from the event state and now use `Device.getAgent()` directly. This relies on the existing implicit invariant that an event is recorded on a stream from the same device.

https://github.com/llvm/llvm-project/pull/186856