[libc-commits] [libc] [llvm] [libc] Support AMDGPU device interrupts for the RPC interface (PR #188067)

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Mon Mar 23 19:04:41 PDT 2026


https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/188067

>From 693e5eedd34443f54f7a2c518a30d2e195ee7ee3 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 23 Mar 2026 11:09:28 -0500
Subject: [PATCH] [libc] Support AMDGPU device interrupts for the RPC interface

Summary:
One of the main disadvantages to using the RPC interface is that it
requires a server thread to spin on the mailboxes checking for work.
The vast majority of the time, there will be no work and work will come
in large bursts.

The HSA / KFD interface supports device-side interrupts and already has
handling for binding these events to an HSA signal. This means that we
can send interrupts from the GPU to wake a sleeping thread on the CPU.
The sleeping thread will be descheduled with a blocking HSA wait call
and woken up when its event ID is raised through the kernel driver's
interrupt.

This is very target-specific handling, but I believe it is valuable
enough to warrant it being in the protocol. It is completely optional,
as it is ignored if uninitialized. This should bring this support to
parity with the interface HIP expects.
---
 libc/shared/rpc.h                             | 70 +++++++++++++++++--
 libc/shared/rpc_util.h                        |  9 +++
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    | 39 +++++++++++
 .../common/include/PluginInterface.h          |  9 +++
 offload/plugins-nextgen/common/include/RPC.h  | 16 ++++-
 .../common/src/PluginInterface.cpp            |  2 +-
 offload/plugins-nextgen/common/src/RPC.cpp    | 35 +++++++---
 7 files changed, 160 insertions(+), 20 deletions(-)

diff --git a/libc/shared/rpc.h b/libc/shared/rpc.h
index f162c4cfae5e2..81cf9fc2c6d3f 100644
--- a/libc/shared/rpc.h
+++ b/libc/shared/rpc.h
@@ -31,6 +31,10 @@ namespace rpc {
   __atomic_fetch_or(src, val, ord)
 #define __scoped_atomic_fetch_and(src, val, ord, scp)                          \
   __atomic_fetch_and(src, val, ord)
+#define __scoped_atomic_fetch_add(src, val, ord, scp)                          \
+  __atomic_fetch_add(src, val, ord)
+#define __scoped_atomic_fetch_sub(src, val, ord, scp)                          \
+  __atomic_fetch_sub(src, val, ord)
 #endif
 #if !__has_builtin(__scoped_atomic_thread_fence)
 #define __scoped_atomic_thread_fence(ord, scp) __atomic_thread_fence(ord)
@@ -49,6 +53,14 @@ struct Buffer {
 };
 static_assert(sizeof(Buffer) == 64, "Buffer size mismatch");
 
+/// A target-specific struct containing a doorbell to wake the server-side
+/// thread.
+struct alignas(64) Doorbell {
+  uint64_t *value;
+  uint64_t *mailbox;
+  uint32_t event_id;
+};
+
 /// The information associated with a packet. This indicates which operations to
 /// perform and which threads are active in the slots.
 struct Header {
@@ -80,6 +92,7 @@ template <bool Invert> struct Process {
   RPC_ATTRS ~Process() = default;
 
   const uint32_t port_count = 0;
+  Doorbell *const doorbell = nullptr;
   const uint32_t *const inbox = nullptr;
   uint32_t *const outbox = nullptr;
   Header *const header = nullptr;
@@ -89,8 +102,10 @@ template <bool Invert> struct Process {
   uint32_t lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
 
   RPC_ATTRS Process(uint32_t port_count, void *buffer)
-      : port_count(port_count), inbox(reinterpret_cast<uint32_t *>(
-                                    advance(buffer, inbox_offset(port_count)))),
+      : port_count(port_count), doorbell(reinterpret_cast<Doorbell *>(
+                                    advance(buffer, doorbell_offset()))),
+        inbox(reinterpret_cast<uint32_t *>(
+            advance(buffer, inbox_offset(port_count)))),
         outbox(reinterpret_cast<uint32_t *>(
             advance(buffer, outbox_offset(port_count)))),
         header(reinterpret_cast<Header *>(
@@ -102,6 +117,7 @@ template <bool Invert> struct Process {
   /// representation in memory.
   ///
   /// struct Equivalent {
+  ///   Doorbell doorbell;
   ///   Atomic<uint32_t> primary[port_count];
   ///   Atomic<uint32_t> secondary[port_count];
   ///   Header header[port_count];
@@ -112,6 +128,34 @@ template <bool Invert> struct Process {
     return buffer_offset(port_count) + buffer_bytes(port_count, lane_size);
   }
 
+  /// Ring the doorbell if the protocol was configured with one.
+  RPC_ATTRS void notify(uint64_t lane_mask) const {
+    if (!doorbell->value)
+      return;
+
+    uint32_t event_id = rpc::broadcast_value(lane_mask, doorbell->event_id);
+    if (rpc::is_first_lane(lane_mask)) {
+      if (!__scoped_atomic_fetch_add(doorbell->value, 1UL, __ATOMIC_RELAXED,
+                                     __MEMORY_SCOPE_SYSTEM)) {
+        __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
+        __scoped_atomic_store_n(doorbell->mailbox,
+                                static_cast<uint64_t>(doorbell->event_id),
+                                __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
+        signal_interrupt(event_id);
+      }
+    }
+  }
+
+  /// Decrement the doorbell signal if the protocol is using one.
+  RPC_ATTRS void finish(uint64_t lane_mask) const {
+    if (!doorbell->value)
+      return;
+
+    if (rpc::is_first_lane(lane_mask))
+      __scoped_atomic_fetch_sub(doorbell->value, 1UL, __ATOMIC_RELAXED,
+                                __MEMORY_SCOPE_SYSTEM);
+  }
+
   /// Retrieve the inbox state from memory shared between processes.
   RPC_ATTRS uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
     return rpc::broadcast_value(
@@ -235,19 +279,23 @@ template <bool Invert> struct Process {
     return port_count * lane_size * sizeof(Buffer);
   }
 
+  /// The offset to the doorbell interface.
+  RPC_ATTRS static constexpr uint64_t doorbell_offset() { return 0; }
+
   /// Offset of the inbox in memory. This is the same as the outbox if inverted.
   RPC_ATTRS static constexpr uint64_t inbox_offset(uint32_t port_count) {
-    return Invert ? mailbox_bytes(port_count) : 0;
+    return sizeof(Doorbell) + (Invert ? mailbox_bytes(port_count) : 0);
   }
 
   /// Offset of the outbox in memory. This is the same as the inbox if inverted.
   RPC_ATTRS static constexpr uint64_t outbox_offset(uint32_t port_count) {
-    return Invert ? 0 : mailbox_bytes(port_count);
+    return sizeof(Doorbell) + (Invert ? 0 : mailbox_bytes(port_count));
   }
 
   /// Offset of the header containing the opcode and mask after the mailboxes.
   RPC_ATTRS static constexpr uint64_t header_offset(uint32_t port_count) {
-    return align_up(2 * mailbox_bytes(port_count), alignof(Header));
+    return align_up(sizeof(Doorbell) + 2 * mailbox_bytes(port_count),
+                    alignof(Header));
   }
 
   /// Offset of the buffer containing the packets after the inbox and outbox.
@@ -310,7 +358,7 @@ template <bool T> struct Port {
 
   friend struct Client;
   friend struct Server;
-  friend class rpc::optional<Port<T>>;
+  friend struct rpc::optional<Port<T>>;
 
 public:
   template <typename U> RPC_ATTRS void recv(U use);
@@ -345,6 +393,8 @@ template <bool T> struct Port {
     if (owns_buffer && T)
       out = process.invert_outbox(lane_mask, index, out);
     process.unlock(lane_mask, index);
+    if constexpr (!T)
+      process.finish(lane_mask);
   }
 
   Process<T> &process;
@@ -392,6 +442,10 @@ struct Server {
     return Process<true>::allocation_size(port_count, lane_size);
   }
 
+  RPC_ATTRS static constexpr uint64_t doorbell_offset() {
+    return Process<true>::doorbell_offset();
+  }
+
 private:
   Process<true> process;
 };
@@ -581,6 +635,8 @@ template <uint32_t opcode> RPC_ATTRS Client::Port Client::open() {
       process.header[index].mask = lane_mask;
     }
     rpc::sync_lane(lane_mask);
+
+    process.notify(lane_mask);
     return Port(process, lane_mask, rpc::get_num_lanes(), index, out);
   }
 }
@@ -626,6 +682,8 @@ Server::try_open(uint32_t lane_size, uint32_t start) {
 #undef __scoped_atomic_store_n
 #undef __scoped_atomic_fetch_or
 #undef __scoped_atomic_fetch_and
+#undef __scoped_atomic_fetch_add
+#undef __scoped_atomic_fetch_sub
 #endif
 #if !__has_builtin(__scoped_atomic_thread_fence)
 #undef __scoped_atomic_thread_fence
diff --git a/libc/shared/rpc_util.h b/libc/shared/rpc_util.h
index 8dcdce0e13a21..c003db828fd74 100644
--- a/libc/shared/rpc_util.h
+++ b/libc/shared/rpc_util.h
@@ -417,6 +417,15 @@ RPC_ATTRS uint64_t ballot([[maybe_unused]] uint64_t lane_mask, bool x) {
 #endif
 }
 
+/// Signal an interrupt from the device to wake the server. Only supported for
+/// AMDGPU targets currently.
+RPC_ATTRS void signal_interrupt([[maybe_unused]] uint32_t event_id) {
+#ifdef __AMDGPU__
+  constexpr uint32_t MSG_INTERRUPT = 1;
+  __builtin_amdgcn_s_sendmsg(MSG_INTERRUPT, event_id);
+#endif
+}
+
 /// Return \p val aligned "upwards" according to \p align.
 template <typename V, typename A>
 RPC_ATTRS constexpr V align_up(V val, A align) {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index b25d3c9ab7214..7aeedab8932bc 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3583,6 +3583,10 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     if (auto Err = HostDevice->init())
       return std::move(Err);
 
+    // Set up the signal used to communicate with the RPC server.
+    if (auto Err = RPCSignal.init(0))
+      return Err;
+
     return NumDevices;
   }
 
@@ -3666,6 +3670,38 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     return KernelAgents;
   }
 
+  /// Create an HSA signal for the RPC doorbell and return the fields needed
+  /// for the GPU to fire interrupts that wake the server thread.
+  Error initRPCDoorbell(uint64_t *&Value, uint64_t *&Mailbox,
+                        uint32_t &EventID) override {
+    // Pull out the necessary fields to communicate with the signal from the
+    // device. These are not exposed but are unlikely to be changed.
+    struct AMDSignal {
+      int64_t kind;
+      int64_t value;
+      uint64_t event_mailbox_ptr;
+      uint32_t event_id;
+    };
+    auto *Doorbell = reinterpret_cast<AMDSignal *>(RPCSignal.get().handle);
+
+    // The event ID corresponds to the HSA signal's slot in the interrupt list.
+    Value = reinterpret_cast<uint64_t *>(&Doorbell->value);
+    Mailbox = reinterpret_cast<uint64_t *>(Doorbell->event_mailbox_ptr);
+    EventID = Doorbell->event_id;
+
+    hsa_signal_t Signal = RPCSignal.get();
+    getRPCServer().setSleepFunction(
+        [Signal]() {
+          hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_NE, 0,
+                                    /*timeout_hint=*/0, HSA_WAIT_STATE_BLOCKED);
+        },
+        [Signal]() { hsa_signal_store_screlease(Signal, 1); });
+
+    return Plugin::success();
+  }
+
+  Error deinitRPCDoorbell() override { return RPCSignal.deinit(); }
+
 private:
   /// Event handler that will be called by ROCr if an event is detected.
   static hsa_status_t eventHandler(const hsa_amd_event_t *Event,
@@ -3752,6 +3788,9 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
   /// only iterating functions. We cache the agents here for convenience.
   llvm::SmallVector<hsa_agent_t> KernelAgents;
 
+  /// HSA signal used as the RPC doorbell for GPU-to-host interrupts.
+  AMDGPUSignalTy RPCSignal;
+
   /// The device representing all HSA host agents.
   AMDHostDeviceTy *HostDevice;
 };
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index b6a54f05b1dcc..20f8279dfee2d 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -1359,6 +1359,15 @@ struct GenericPluginTy {
     return *RPCServer;
   }
 
+  /// Initialize the RPC doorbell if used by the target.
+  virtual Error initRPCDoorbell(uint64_t *&Value, uint64_t *&Mailbox,
+                                uint32_t &EventID) {
+    return Plugin::success();
+  }
+
+  /// Tear down any target-specific doorbell resources.
+  virtual Error deinitRPCDoorbell() { return Plugin::success(); }
+
   /// Get a reference to the record and replay interface for the plugin.
   RecordReplayTy &getRecordReplay() {
     assert(RecordReplay && "RR interface not initialized");
diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index a4c6008ea5794..298baf99b3c37 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -23,6 +23,7 @@
 #include <atomic>
 #include <condition_variable>
 #include <cstdint>
+#include <functional>
 #include <mutex>
 #include <thread>
 
@@ -45,7 +46,7 @@ struct RPCServerTy {
   RPCServerTy(plugin::GenericPluginTy &Plugin);
 
   /// Deinitialize the associated memory and resources.
-  llvm::Error shutDown();
+  llvm::Error shutDown(plugin::GenericPluginTy &Plugin);
 
   /// Initialize the worker thread.
   llvm::Error startThread();
@@ -71,6 +72,10 @@ struct RPCServerTy {
   /// Register a custom callback for the RPC server to manage.
   void registerCallback(RPCServerCallbackTy FnPtr);
 
+  /// Set the sleep/wake functions for interrupt-driven RPC serving.
+  void setSleepFunction(std::function<void()> Sleep,
+                        std::function<void()> Wake);
+
 private:
   /// Array from this device's identifier to its attached devices.
   std::unique_ptr<void *[]> Buffers;
@@ -114,13 +119,20 @@ struct RPCServerTy {
     /// A reference to the associated generic device for the buffer.
     llvm::ArrayRef<plugin::GenericDeviceTy *> Devices;
 
+    /// Sleep and wake functions used while the server is idle.
+    std::function<void()> SleepFunction;
+    std::function<void()> WakeFunction;
+
     /// Initialize the worker thread to run in the background.
     ServerThread(void *Buffers[], plugin::GenericDeviceTy *Devices[],
                  size_t Length, std::mutex &BufferMutex,
                  llvm::SmallSetVector<RPCServerCallbackTy, 0> &Callbacks)
         : Running(false), NumUsers(0), CV(), Mutex(), BufferMutex(BufferMutex),
           Callbacks(Callbacks), Buffers(Buffers, Length),
-          Devices(Devices, Length) {}
+          Devices(Devices, Length), SleepFunction([]() {
+            std::this_thread::sleep_for(std::chrono::microseconds(250));
+          }),
+          WakeFunction([]() {}) {}
 
     ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 4093d08044bc3..8a32e4177d3d0 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1623,7 +1623,7 @@ Error GenericPluginTy::deinit() {
     delete GlobalHandler;
 
   if (RPCServer) {
-    if (Error Err = RPCServer->shutDown())
+    if (Error Err = RPCServer->shutDown(*this))
       return Err;
     delete RPCServer;
   }
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 83cde630ebf40..b37be170484bc 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -136,16 +136,15 @@ void RPCServerTy::ServerThread::shutDown() {
     std::lock_guard<decltype(Mutex)> Lock(Mutex);
     CV.notify_all();
   }
+  if (WakeFunction)
+    WakeFunction();
   if (Worker.joinable())
     Worker.join();
 }
 
 void RPCServerTy::ServerThread::run() {
-  static constexpr auto IdleTime = std::chrono::microseconds(25);
-  static constexpr auto IdleSleep = std::chrono::microseconds(250);
   std::unique_lock<decltype(Mutex)> Lock(Mutex);
 
-  auto LastUse = std::chrono::steady_clock::now();
   for (;;) {
     CV.wait(Lock, [&]() {
       return NumUsers.load(std::memory_order_acquire) > 0 ||
@@ -160,12 +159,8 @@ void RPCServerTy::ServerThread::run() {
     while (NumUsers.load(std::memory_order_relaxed) > 0 &&
            Running.load(std::memory_order_relaxed)) {
 
-      // Suspend this thread briefly if there is no current work.
-      auto Now = std::chrono::steady_clock::now();
-      if (!ClientInUse && Now - LastUse >= IdleTime)
-        std::this_thread::sleep_for(IdleSleep);
-      else if (ClientInUse)
-        LastUse = Now;
+      if (!ClientInUse)
+        SleepFunction();
 
       ClientInUse = false;
       std::lock_guard<decltype(Mutex)> Lock(BufferMutex);
@@ -196,9 +191,9 @@ llvm::Error RPCServerTy::startThread() {
   return Error::success();
 }
 
-llvm::Error RPCServerTy::shutDown() {
+llvm::Error RPCServerTy::shutDown(plugin::GenericPluginTy &Plugin) {
   Thread->shutDown();
-  return Error::success();
+  return Plugin.deinitRPCDoorbell();
 }
 
 llvm::Expected<bool>
@@ -225,6 +220,17 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
         error::ErrorCode::UNKNOWN,
         "failed to initialize RPC server for device %d", Device.getDeviceId());
 
+  // The doorbell is used by AMDGPU targets to let the server thread be
+  // descheduled. It is optional and will be ignored if the fields are null.
+  rpc::Doorbell Doorbell{};
+  if (auto Err = Device.Plugin.initRPCDoorbell(Doorbell.value, Doorbell.mailbox,
+                                               Doorbell.event_id))
+    return Err;
+
+  auto *DoorbellPtr = reinterpret_cast<rpc::Doorbell *>(
+      static_cast<uint8_t *>(RPCBuffer) + rpc::Server::doorbell_offset());
+  std::memcpy(DoorbellPtr, &Doorbell, sizeof(rpc::Doorbell));
+
   // Get the address of the RPC client from the device.
   plugin::GlobalTy ClientGlobal("__llvm_rpc_client", sizeof(rpc::Client));
   if (auto Err =
@@ -255,3 +261,10 @@ void RPCServerTy::registerCallback(RPCServerCallbackTy FnPtr) {
   std::lock_guard<decltype(BufferMutex)> Lock(BufferMutex);
   Callbacks.insert(FnPtr);
 }
+
+void RPCServerTy::setSleepFunction(std::function<void()> Sleep,
+                                   std::function<void()> Wake) {
+  std::lock_guard<decltype(BufferMutex)> Lock(BufferMutex);
+  Thread->SleepFunction = std::move(Sleep);
+  Thread->WakeFunction = std::move(Wake);
+}



More information about the libc-commits mailing list