[llvm] [Offload] Add support for measuring elapsed time between events (PR #186856)

Mon Mar 30 22:59:46 PDT 2026

================
@@ -1708,38 +1767,77 @@ struct AMDGPUEventTy {
     return RecordedStream->synchronizeOn(*this);
   }
 
+  /// Return the elapsed time in milliseconds between this event and EndEvent.
+  Expected<float> getElapsedTime(AMDGPUEventTy &EndEvent);
+
 protected:
+  /// Release the retained timing signal, if any, back to the signal manager.
+  Error releaseTimingSignal();
+
+  /// The device that owns this event.
+  AMDGPUDeviceTy &Device;
+
   /// The stream registered in this event.
   AMDGPUStreamTy *RecordedStream;
 
-  /// The recordered operation on the recorded stream.
+  /// The recorded operation on the recorded stream.
   int64_t RecordedSlot;
 
   /// The sync cycle when the stream was recorded. Used to detect stale events.
   int64_t RecordedSyncCycle;
 
+  /// The signal of the recorded timing barrier. Null means timing is
+  /// unavailable for the current recording.
+  AMDGPUSignalTy *TimingSignal;
+
   /// Mutex to safely access event fields.
   mutable std::mutex Mutex;
 
   friend struct AMDGPUStreamTy;
 };
 
-Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) const {
-  std::lock_guard<std::mutex> Lock(Mutex);
+Error AMDGPUStreamTy::recordEvent(AMDGPUEventTy &Event) {
+  if (Queue == nullptr)
+    return Plugin::error(ErrorCode::INVALID_NULL_POINTER,
+                         "target queue was nullptr");
+
+  // Retrieve an available signal for the operation's output.
+  AMDGPUSignalTy *OutputSignal = nullptr;
+  if (auto Err = SignalManager.getResource(OutputSignal))
+    return Err;
+  OutputSignal->reset();
+  OutputSignal->increaseUseCount();
+
+  std::lock_guard<std::mutex> StreamLock(Mutex);
+
+  // Consume stream slot and compute dependencies.
+  auto [Curr, InputSignal] = consume(OutputSignal);
+
+  // Materialize the event as a real marker on the queue. Elapsed-time queries
+  // need a packet-backed completion signal to retrieve dispatch timing.
+  if (auto Err = Queue->pushBarrier(OutputSignal, InputSignal, nullptr)) {
----------------
kevinsala wrote:

Thanks for the explanation. We will monitor a couple of applications on AMDGPU to verify it does not impact performance.

https://github.com/llvm/llvm-project/pull/186856