[libc-commits] [libc] [llvm] [libc] Remove RPC server API and use the header directly (PR #117075)

Thu Nov 21 08:13:03 PST 2024

https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/117075

>From 3e8b0f8ef848325d79165285aeeb33b2e9614402 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 20 Nov 2024 13:24:04 -0600
Subject: [PATCH 1/2] [libc] Move RPC interface to `libc/shared` to export it

Summary:
Previous patches have made the `rpc.h` header independent of the `libc`
internals. This allows us to include it directly rather than providing
an indirect C API. This patch only does the work to move the header. A
future patch will pull out the `rpc_server` interface and simply replace
it with a single function that handles the opcodes.
---
 libc/{src/__support/RPC => shared}/rpc.h      | 162 +++++++++---------
 libc/{src/__support/RPC => shared}/rpc_util.h | 109 ++++++------
 libc/src/__support/RPC/CMakeLists.txt         |  15 --
 libc/src/__support/RPC/rpc_client.cpp         |   2 +-
 libc/src/__support/RPC/rpc_client.h           |   8 +-
 .../startup/gpu/rpc_interface_test.cpp        |  50 ++++--
 .../startup/gpu/rpc_stream_test.cpp           |   6 +-
 .../test/integration/startup/gpu/rpc_test.cpp |  14 +-
 libc/utils/gpu/server/rpc_server.cpp          |  52 ++++--
 9 files changed, 222 insertions(+), 196 deletions(-)
 rename libc/{src/__support/RPC => shared}/rpc.h (80%)
 rename libc/{src/__support/RPC => shared}/rpc_util.h (61%)

diff --git a/libc/src/__support/RPC/rpc.h b/libc/shared/rpc.h
similarity index 80%
rename from libc/src/__support/RPC/rpc.h
rename to libc/shared/rpc.h
index 30dd2c1a8125d74..489a8cebfb807c4 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/shared/rpc.h
@@ -15,16 +15,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
-#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
+#ifndef LLVM_LIBC_SHARED_RPC_H
+#define LLVM_LIBC_SHARED_RPC_H
 
 #include "rpc_util.h"
-#include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
 
 #include <stdint.h>
 
-namespace LIBC_NAMESPACE_DECL {
+#ifndef RPC_INLINE
+#define RPC_INLINE inline
+#endif
+
 namespace rpc {
 
 /// Use scoped atomic variants if they are available for the target.
@@ -70,12 +71,12 @@ constexpr static uint64_t MAX_PORT_COUNT = 4096;
 ///   - The server will always start with a 'recv' operation.
 ///   - Every 'send' or 'recv' call is mirrored by the other process.
 template <bool Invert> struct Process {
-  LIBC_INLINE Process() = default;
-  LIBC_INLINE Process(const Process &) = delete;
-  LIBC_INLINE Process &operator=(const Process &) = delete;
-  LIBC_INLINE Process(Process &&) = default;
-  LIBC_INLINE Process &operator=(Process &&) = default;
-  LIBC_INLINE ~Process() = default;
+  RPC_INLINE Process() = default;
+  RPC_INLINE Process(const Process &) = delete;
+  RPC_INLINE Process &operator=(const Process &) = delete;
+  RPC_INLINE Process(Process &&) = default;
+  RPC_INLINE Process &operator=(Process &&) = default;
+  RPC_INLINE ~Process() = default;
 
   uint32_t port_count = 0;
   uint32_t *inbox = nullptr;
@@ -86,7 +87,7 @@ template <bool Invert> struct Process {
   static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
   uint32_t lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
 
-  LIBC_INLINE Process(uint32_t port_count, void *buffer)
+  RPC_INLINE Process(uint32_t port_count, void *buffer)
       : port_count(port_count), inbox(reinterpret_cast<uint32_t *>(
                                     advance(buffer, inbox_offset(port_count)))),
         outbox(reinterpret_cast<uint32_t *>(
@@ -105,20 +106,20 @@ template <bool Invert> struct Process {
   ///   Header header[port_count];
   ///   Buffer packet[port_count][lane_size];
   /// };
-  LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count,
-                                                        uint32_t lane_size) {
+  RPC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count,
+                                                       uint32_t lane_size) {
     return buffer_offset(port_count) + buffer_bytes(port_count, lane_size);
   }
 
   /// Retrieve the inbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
+  RPC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
     return rpc::broadcast_value(
         lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED,
                                           __MEMORY_SCOPE_SYSTEM));
   }
 
   /// Retrieve the outbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
+  RPC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
     return rpc::broadcast_value(
         lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED,
                                           __MEMORY_SCOPE_SYSTEM));
@@ -128,7 +129,7 @@ template <bool Invert> struct Process {
   /// Equivalent to loading outbox followed by store of the inverted value
   /// The outbox is write only by this warp and tracking the value locally is
   /// cheaper than calling load_outbox to get the value to store.
-  LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
+  RPC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
     uint32_t inverted_outbox = !current_outbox;
     __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
     __scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED,
@@ -138,8 +139,8 @@ template <bool Invert> struct Process {
 
   // Given the current outbox and inbox values, wait until the inbox changes
   // to indicate that this thread owns the buffer element.
-  LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
-                                      uint32_t outbox, uint32_t in) {
+  RPC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
+                                     uint32_t outbox, uint32_t in) {
     while (buffer_unavailable(in, outbox)) {
       sleep_briefly();
       in = load_inbox(lane_mask, index);
@@ -150,14 +151,14 @@ template <bool Invert> struct Process {
   /// The packet is a linearly allocated array of buffers used to communicate
   /// with the other process. This function returns the appropriate slot in this
   /// array such that the process can operate on an entire warp or wavefront.
-  LIBC_INLINE Buffer *get_packet(uint32_t index, uint32_t lane_size) {
+  RPC_INLINE Buffer *get_packet(uint32_t index, uint32_t lane_size) {
     return &packet[index * lane_size];
   }
 
   /// Determines if this process needs to wait for ownership of the buffer. We
   /// invert the condition on one of the processes to indicate that if one
   /// process owns the buffer then the other does not.
-  LIBC_INLINE static bool buffer_unavailable(uint32_t in, uint32_t out) {
+  RPC_INLINE static bool buffer_unavailable(uint32_t in, uint32_t out) {
     bool cond = in != out;
     return Invert ? !cond : cond;
   }
@@ -166,7 +167,7 @@ template <bool Invert> struct Process {
   /// lane_mask is a bitmap of the threads in the warp that would hold the
   /// single lock on success, e.g. the result of rpc::get_lane_mask()
   /// The lock is held when the n-th bit of the lock bitfield is set.
-  LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
+  RPC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
     // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
     // On volta, need to handle differences between the threads running and
     // the threads that were detected in the previous call to get_lane_mask()
@@ -206,7 +207,7 @@ template <bool Invert> struct Process {
 
   /// Unlock the lock at index. We need a lane sync to keep this function
   /// convergent, otherwise the compiler will sink the store and deadlock.
-  LIBC_INLINE void unlock(uint64_t lane_mask, uint32_t index) {
+  RPC_INLINE void unlock(uint64_t lane_mask, uint32_t index) {
     // Do not move any writes past the unlock.
     __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_DEVICE);
 
@@ -219,40 +220,40 @@ template <bool Invert> struct Process {
   }
 
   /// Number of bytes to allocate for an inbox or outbox.
-  LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
+  RPC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
     return port_count * sizeof(uint32_t);
   }
 
   /// Number of bytes to allocate for the buffer containing the packets.
-  LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count,
-                                                     uint32_t lane_size) {
+  RPC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count,
+                                                    uint32_t lane_size) {
     return port_count * lane_size * sizeof(Buffer);
   }
 
   /// Offset of the inbox in memory. This is the same as the outbox if inverted.
-  LIBC_INLINE static constexpr uint64_t inbox_offset(uint32_t port_count) {
+  RPC_INLINE static constexpr uint64_t inbox_offset(uint32_t port_count) {
     return Invert ? mailbox_bytes(port_count) : 0;
   }
 
   /// Offset of the outbox in memory. This is the same as the inbox if inverted.
-  LIBC_INLINE static constexpr uint64_t outbox_offset(uint32_t port_count) {
+  RPC_INLINE static constexpr uint64_t outbox_offset(uint32_t port_count) {
     return Invert ? 0 : mailbox_bytes(port_count);
   }
 
   /// Offset of the buffer containing the packets after the inbox and outbox.
-  LIBC_INLINE static constexpr uint64_t header_offset(uint32_t port_count) {
+  RPC_INLINE static constexpr uint64_t header_offset(uint32_t port_count) {
     return align_up(2 * mailbox_bytes(port_count), alignof(Header));
   }
 
   /// Offset of the buffer containing the packets after the inbox and outbox.
-  LIBC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
+  RPC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
     return align_up(header_offset(port_count) + port_count * sizeof(Header),
                     alignof(Buffer));
   }
 
   /// Conditionally set the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t set_nth(uint32_t *bits, uint32_t index,
-                                                bool cond) {
+  RPC_INLINE static constexpr uint32_t set_nth(uint32_t *bits, uint32_t index,
+                                               bool cond) {
     uint32_t slot = index / NUM_BITS_IN_WORD;
     uint32_t bit = index % NUM_BITS_IN_WORD;
     return __scoped_atomic_fetch_or(&bits[slot],
@@ -262,8 +263,8 @@ template <bool Invert> struct Process {
   }
 
   /// Conditionally clear the n-th bit in the atomic bitfield.
-  LIBC_INLINE static constexpr uint32_t clear_nth(uint32_t *bits,
-                                                  uint32_t index, bool cond) {
+  RPC_INLINE static constexpr uint32_t clear_nth(uint32_t *bits, uint32_t index,
+                                                 bool cond) {
     uint32_t slot = index / NUM_BITS_IN_WORD;
     uint32_t bit = index % NUM_BITS_IN_WORD;
     return __scoped_atomic_fetch_and(&bits[slot],
@@ -275,8 +276,8 @@ template <bool Invert> struct Process {
 
 /// Invokes a function accross every active buffer across the total lane size.
 template <typename F>
-LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
-                                   uint64_t lane_mask, Buffer *slot) {
+RPC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
+                                  uint64_t lane_mask, Buffer *slot) {
   if constexpr (is_process_gpu()) {
     fn(&slot[rpc::get_lane_id()], rpc::get_lane_id());
   } else {
@@ -290,40 +291,40 @@ LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
 /// processes. A port is conceptually an index into the memory provided by the
 /// underlying process that is guarded by a lock bit.
 template <bool T> struct Port {
-  LIBC_INLINE Port(Process<T> &process, uint64_t lane_mask, uint32_t lane_size,
-                   uint32_t index, uint32_t out)
+  RPC_INLINE Port(Process<T> &process, uint64_t lane_mask, uint32_t lane_size,
+                  uint32_t index, uint32_t out)
       : process(process), lane_mask(lane_mask), lane_size(lane_size),
         index(index), out(out), receive(false), owns_buffer(true) {}
-  LIBC_INLINE ~Port() = default;
+  RPC_INLINE ~Port() = default;
 
 private:
-  LIBC_INLINE Port(const Port &) = delete;
-  LIBC_INLINE Port &operator=(const Port &) = delete;
-  LIBC_INLINE Port(Port &&) = default;
-  LIBC_INLINE Port &operator=(Port &&) = default;
+  RPC_INLINE Port(const Port &) = delete;
+  RPC_INLINE Port &operator=(const Port &) = delete;
+  RPC_INLINE Port(Port &&) = default;
+  RPC_INLINE Port &operator=(Port &&) = default;
 
   friend struct Client;
   friend struct Server;
   friend class rpc::optional<Port<T>>;
 
 public:
-  template <typename U> LIBC_INLINE void recv(U use);
-  template <typename F> LIBC_INLINE void send(F fill);
+  template <typename U> RPC_INLINE void recv(U use);
+  template <typename F> RPC_INLINE void send(F fill);
   template <typename F, typename U>
-  LIBC_INLINE void send_and_recv(F fill, U use);
-  template <typename W> LIBC_INLINE void recv_and_send(W work);
-  LIBC_INLINE void send_n(const void *const *src, uint64_t *size);
-  LIBC_INLINE void send_n(const void *src, uint64_t size);
+  RPC_INLINE void send_and_recv(F fill, U use);
+  template <typename W> RPC_INLINE void recv_and_send(W work);
+  RPC_INLINE void send_n(const void *const *src, uint64_t *size);
+  RPC_INLINE void send_n(const void *src, uint64_t size);
   template <typename A>
-  LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
+  RPC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
 
-  LIBC_INLINE uint32_t get_opcode() const {
+  RPC_INLINE uint32_t get_opcode() const {
     return process.header[index].opcode;
   }
 
-  LIBC_INLINE uint32_t get_index() const { return index; }
+  RPC_INLINE uint32_t get_index() const { return index; }
 
-  LIBC_INLINE void close() {
+  RPC_INLINE void close() {
     // Wait for all lanes to finish using the port.
     rpc::sync_lane(lane_mask);
 
@@ -346,16 +347,16 @@ template <bool T> struct Port {
 
 /// The RPC client used to make requests to the server.
 struct Client {
-  LIBC_INLINE Client() = default;
-  LIBC_INLINE Client(const Client &) = delete;
-  LIBC_INLINE Client &operator=(const Client &) = delete;
-  LIBC_INLINE ~Client() = default;
+  RPC_INLINE Client() = default;
+  RPC_INLINE Client(const Client &) = delete;
+  RPC_INLINE Client &operator=(const Client &) = delete;
+  RPC_INLINE ~Client() = default;
 
-  LIBC_INLINE Client(uint32_t port_count, void *buffer)
+  RPC_INLINE Client(uint32_t port_count, void *buffer)
       : process(port_count, buffer) {}
 
   using Port = rpc::Port<false>;
-  template <uint32_t opcode> LIBC_INLINE Port open();
+  template <uint32_t opcode> RPC_INLINE Port open();
 
 private:
   Process<false> process;
@@ -363,21 +364,21 @@ struct Client {
 
 /// The RPC server used to respond to the client.
 struct Server {
-  LIBC_INLINE Server() = default;
-  LIBC_INLINE Server(const Server &) = delete;
-  LIBC_INLINE Server &operator=(const Server &) = delete;
-  LIBC_INLINE ~Server() = default;
+  RPC_INLINE Server() = default;
+  RPC_INLINE Server(const Server &) = delete;
+  RPC_INLINE Server &operator=(const Server &) = delete;
+  RPC_INLINE ~Server() = default;
 
-  LIBC_INLINE Server(uint32_t port_count, void *buffer)
+  RPC_INLINE Server(uint32_t port_count, void *buffer)
       : process(port_count, buffer) {}
 
   using Port = rpc::Port<true>;
-  LIBC_INLINE rpc::optional<Port> try_open(uint32_t lane_size,
-                                           uint32_t start = 0);
-  LIBC_INLINE Port open(uint32_t lane_size);
+  RPC_INLINE rpc::optional<Port> try_open(uint32_t lane_size,
+                                          uint32_t start = 0);
+  RPC_INLINE Port open(uint32_t lane_size);
 
-  LIBC_INLINE static uint64_t allocation_size(uint32_t lane_size,
-                                              uint32_t port_count) {
+  RPC_INLINE static uint64_t allocation_size(uint32_t lane_size,
+                                             uint32_t port_count) {
     return Process<true>::allocation_size(port_count, lane_size);
   }
 
@@ -386,7 +387,7 @@ struct Server {
 };
 
 /// Applies \p fill to the shared buffer and initiates a send operation.
-template <bool T> template <typename F> LIBC_INLINE void Port<T>::send(F fill) {
+template <bool T> template <typename F> RPC_INLINE void Port<T>::send(F fill) {
   uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
 
   // We need to wait until we own the buffer before sending.
@@ -401,7 +402,7 @@ template <bool T> template <typename F> LIBC_INLINE void Port<T>::send(F fill) {
 }
 
 /// Applies \p use to the shared buffer and acknowledges the send.
-template <bool T> template <typename U> LIBC_INLINE void Port<T>::recv(U use) {
+template <bool T> template <typename U> RPC_INLINE void Port<T>::recv(U use) {
   // We only exchange ownership of the buffer during a receive if we are waiting
   // for a previous receive to finish.
   if (receive) {
@@ -424,7 +425,7 @@ template <bool T> template <typename U> LIBC_INLINE void Port<T>::recv(U use) {
 /// Combines a send and receive into a single function.
 template <bool T>
 template <typename F, typename U>
-LIBC_INLINE void Port<T>::send_and_recv(F fill, U use) {
+RPC_INLINE void Port<T>::send_and_recv(F fill, U use) {
   send(fill);
   recv(use);
 }
@@ -434,7 +435,7 @@ LIBC_INLINE void Port<T>::send_and_recv(F fill, U use) {
 /// the copy back.
 template <bool T>
 template <typename W>
-LIBC_INLINE void Port<T>::recv_and_send(W work) {
+RPC_INLINE void Port<T>::recv_and_send(W work) {
   recv(work);
   send([](Buffer *, uint32_t) { /* no-op */ });
 }
@@ -442,7 +443,7 @@ LIBC_INLINE void Port<T>::recv_and_send(W work) {
 /// Helper routine to simplify the interface when sending from the GPU using
 /// thread private pointers to the underlying value.
 template <bool T>
-LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
+RPC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
   const void **src_ptr = &src;
   uint64_t *size_ptr = &size;
   send_n(src_ptr, size_ptr);
@@ -451,7 +452,7 @@ LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
 /// Sends an arbitrarily sized data buffer \p src across the shared channel in
 /// multiples of the packet length.
 template <bool T>
-LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
+RPC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
   uint64_t num_sends = 0;
   send([&](Buffer *buffer, uint32_t id) {
     reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
@@ -482,7 +483,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
 /// size of the data so that we can initialize the size of the \p dst buffer.
 template <bool T>
 template <typename A>
-LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
+RPC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
   uint64_t num_recvs = 0;
   recv([&](Buffer *buffer, uint32_t id) {
     lane_value(size, id) = reinterpret_cast<uint64_t *>(buffer->data)[0];
@@ -516,7 +517,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 /// port. Each port instance uses an associated \p opcode to tell the server
 /// what to do. The Client interface provides the appropriate lane size to the
 /// port using the platform's returned value.
-template <uint32_t opcode> LIBC_INLINE Client::Port Client::open() {
+template <uint32_t opcode> RPC_INLINE Client::Port Client::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
   for (uint32_t index = 0;; ++index) {
@@ -550,7 +551,7 @@ template <uint32_t opcode> LIBC_INLINE Client::Port Client::open() {
 
 /// Attempts to open a port to use as the server. The server can only open a
 /// port if it has a pending receive operation
-LIBC_INLINE rpc::optional<typename Server::Port>
+RPC_INLINE rpc::optional<typename Server::Port>
 Server::try_open(uint32_t lane_size, uint32_t start) {
   // Perform a naive linear scan for a port that has a pending request.
   for (uint32_t index = start; index < process.port_count; ++index) {
@@ -580,7 +581,7 @@ Server::try_open(uint32_t lane_size, uint32_t start) {
   return rpc::nullopt;
 }
 
-LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
+RPC_INLINE Server::Port Server::open(uint32_t lane_size) {
   for (;;) {
     if (rpc::optional<Server::Port> p = try_open(lane_size))
       return rpc::move(p.value());
@@ -599,6 +600,5 @@ LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
 #endif
 
 } // namespace rpc
-} // namespace LIBC_NAMESPACE_DECL
 
-#endif
+#endif // LLVM_LIBC_SHARED_RPC_H
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/shared/rpc_util.h
similarity index 61%
rename from libc/src/__support/RPC/rpc_util.h
rename to libc/shared/rpc_util.h
index 7067dfc974eb31f..502014d839ae949 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/shared/rpc_util.h
@@ -6,11 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
-#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
-
-#include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
+#ifndef LLVM_LIBC_SHARED_RPC_UTIL_H
+#define LLVM_LIBC_SHARED_RPC_UTIL_H
 
 #include <stddef.h>
 #include <stdint.h>
@@ -20,7 +17,10 @@
 #define RPC_TARGET_IS_GPU
 #endif
 
-namespace LIBC_NAMESPACE_DECL {
+#ifndef RPC_INLINE
+#define RPC_INLINE inline
+#endif
+
 namespace rpc {
 
 template <typename T> struct type_identity {
@@ -40,26 +40,26 @@ template <class T> struct is_const<const T> : type_constant<bool, true> {};
 
 /// Freestanding implementation of std::move.
 template <class T>
-LIBC_INLINE constexpr typename remove_reference<T>::type &&move(T &&t) {
+RPC_INLINE constexpr typename remove_reference<T>::type &&move(T &&t) {
   return static_cast<typename remove_reference<T>::type &&>(t);
 }
 
 /// Freestanding implementation of std::forward.
 template <typename T>
-LIBC_INLINE constexpr T &&forward(typename remove_reference<T>::type &value) {
+RPC_INLINE constexpr T &&forward(typename remove_reference<T>::type &value) {
   return static_cast<T &&>(value);
 }
 template <typename T>
-LIBC_INLINE constexpr T &&forward(typename remove_reference<T>::type &&value) {
+RPC_INLINE constexpr T &&forward(typename remove_reference<T>::type &&value) {
   return static_cast<T &&>(value);
 }
 
 struct in_place_t {
-  LIBC_INLINE explicit in_place_t() = default;
+  RPC_INLINE explicit in_place_t() = default;
 };
 
 struct nullopt_t {
-  LIBC_INLINE constexpr explicit nullopt_t() = default;
+  RPC_INLINE constexpr explicit nullopt_t() = default;
 };
 
 constexpr inline in_place_t in_place{};
@@ -75,15 +75,15 @@ template <typename T> class optional {
 
     bool in_use = false;
 
-    LIBC_INLINE ~OptionalStorage() { reset(); }
+    RPC_INLINE ~OptionalStorage() { reset(); }
 
-    LIBC_INLINE constexpr OptionalStorage() : empty() {}
+    RPC_INLINE constexpr OptionalStorage() : empty() {}
 
     template <typename... Args>
-    LIBC_INLINE constexpr explicit OptionalStorage(in_place_t, Args &&...args)
+    RPC_INLINE constexpr explicit OptionalStorage(in_place_t, Args &&...args)
         : stored_value(forward<Args>(args)...) {}
 
-    LIBC_INLINE constexpr void reset() {
+    RPC_INLINE constexpr void reset() {
       if (in_use)
         stored_value.~U();
       in_use = false;
@@ -93,60 +93,54 @@ template <typename T> class optional {
   OptionalStorage<T> storage;
 
 public:
-  LIBC_INLINE constexpr optional() = default;
-  LIBC_INLINE constexpr optional(nullopt_t) {}
+  RPC_INLINE constexpr optional() = default;
+  RPC_INLINE constexpr optional(nullopt_t) {}
 
-  LIBC_INLINE constexpr optional(const T &t) : storage(in_place, t) {
+  RPC_INLINE constexpr optional(const T &t) : storage(in_place, t) {
     storage.in_use = true;
   }
-  LIBC_INLINE constexpr optional(const optional &) = default;
+  RPC_INLINE constexpr optional(const optional &) = default;
 
-  LIBC_INLINE constexpr optional(T &&t) : storage(in_place, move(t)) {
+  RPC_INLINE constexpr optional(T &&t) : storage(in_place, move(t)) {
     storage.in_use = true;
   }
-  LIBC_INLINE constexpr optional(optional &&O) = default;
+  RPC_INLINE constexpr optional(optional &&O) = default;
 
-  LIBC_INLINE constexpr optional &operator=(T &&t) {
+  RPC_INLINE constexpr optional &operator=(T &&t) {
     storage = move(t);
     return *this;
   }
-  LIBC_INLINE constexpr optional &operator=(optional &&) = default;
+  RPC_INLINE constexpr optional &operator=(optional &&) = default;
 
-  LIBC_INLINE constexpr optional &operator=(const T &t) {
+  RPC_INLINE constexpr optional &operator=(const T &t) {
     storage = t;
     return *this;
   }
-  LIBC_INLINE constexpr optional &operator=(const optional &) = default;
+  RPC_INLINE constexpr optional &operator=(const optional &) = default;
 
-  LIBC_INLINE constexpr void reset() { storage.reset(); }
+  RPC_INLINE constexpr void reset() { storage.reset(); }
 
-  LIBC_INLINE constexpr const T &value() const & {
-    return storage.stored_value;
-  }
+  RPC_INLINE constexpr const T &value() const & { return storage.stored_value; }
 
-  LIBC_INLINE constexpr T &value() & { return storage.stored_value; }
+  RPC_INLINE constexpr T &value() & { return storage.stored_value; }
 
-  LIBC_INLINE constexpr explicit operator bool() const {
-    return storage.in_use;
-  }
-  LIBC_INLINE constexpr bool has_value() const { return storage.in_use; }
-  LIBC_INLINE constexpr const T *operator->() const {
+  RPC_INLINE constexpr explicit operator bool() const { return storage.in_use; }
+  RPC_INLINE constexpr bool has_value() const { return storage.in_use; }
+  RPC_INLINE constexpr const T *operator->() const {
     return &storage.stored_value;
   }
-  LIBC_INLINE constexpr T *operator->() { return &storage.stored_value; }
-  LIBC_INLINE constexpr const T &operator*() const & {
+  RPC_INLINE constexpr T *operator->() { return &storage.stored_value; }
+  RPC_INLINE constexpr const T &operator*() const & {
     return storage.stored_value;
   }
-  LIBC_INLINE constexpr T &operator*() & { return storage.stored_value; }
+  RPC_INLINE constexpr T &operator*() & { return storage.stored_value; }
 
-  LIBC_INLINE constexpr T &&value() && { return move(storage.stored_value); }
-  LIBC_INLINE constexpr T &&operator*() && {
-    return move(storage.stored_value);
-  }
+  RPC_INLINE constexpr T &&value() && { return move(storage.stored_value); }
+  RPC_INLINE constexpr T &&operator*() && { return move(storage.stored_value); }
 };
 
 /// Suspend the thread briefly to assist the thread scheduler during busy loops.
-LIBC_INLINE void sleep_briefly() {
+RPC_INLINE void sleep_briefly() {
 #if defined(LIBC_TARGET_ARCH_IS_NVPTX)
   if (__nvvm_reflect("__CUDA_ARCH") >= 700)
     asm("nanosleep.u32 64;" ::: "memory");
@@ -164,7 +158,7 @@ LIBC_INLINE void sleep_briefly() {
 }
 
 /// Conditional to indicate if this process is running on the GPU.
-LIBC_INLINE constexpr bool is_process_gpu() {
+RPC_INLINE constexpr bool is_process_gpu() {
 #ifdef RPC_TARGET_IS_GPU
   return true;
 #else
@@ -173,14 +167,14 @@ LIBC_INLINE constexpr bool is_process_gpu() {
 }
 
 /// Wait for all lanes in the group to complete.
-LIBC_INLINE void sync_lane(uint64_t lane_mask) {
+RPC_INLINE void sync_lane(uint64_t lane_mask) {
 #ifdef RPC_TARGET_IS_GPU
   return __gpu_sync_lane(lane_mask);
 #endif
 }
 
 /// Copies the value from the first active thread to the rest.
-LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+RPC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
 #ifdef RPC_TARGET_IS_GPU
   return __gpu_read_first_lane_u32(lane_mask, x);
 #else
@@ -189,7 +183,7 @@ LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
 }
 
 /// Returns the number lanes that participate in the RPC interface.
-LIBC_INLINE uint32_t get_num_lanes() {
+RPC_INLINE uint32_t get_num_lanes() {
 #ifdef RPC_TARGET_IS_GPU
   return __gpu_num_lanes();
 #else
@@ -198,7 +192,7 @@ LIBC_INLINE uint32_t get_num_lanes() {
 }
 
 /// Returns the id of the thread inside of an AMD wavefront executing together.
-LIBC_INLINE uint64_t get_lane_mask() {
+RPC_INLINE uint64_t get_lane_mask() {
 #ifdef RPC_TARGET_IS_GPU
   return __gpu_lane_mask();
 #else
@@ -207,7 +201,7 @@ LIBC_INLINE uint64_t get_lane_mask() {
 }
 
 /// Returns the id of the thread inside of an AMD wavefront executing together.
-LIBC_INLINE uint32_t get_lane_id() {
+RPC_INLINE uint32_t get_lane_id() {
 #ifdef RPC_TARGET_IS_GPU
   return __gpu_lane_id();
 #else
@@ -216,7 +210,7 @@ LIBC_INLINE uint32_t get_lane_id() {
 }
 
 /// Conditional that is only true for a single thread in a lane.
-LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
+RPC_INLINE bool is_first_lane(uint64_t lane_mask) {
 #ifdef RPC_TARGET_IS_GPU
   return __gpu_is_first_in_lane(lane_mask);
 #else
@@ -225,7 +219,7 @@ LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
 }
 
 /// Returns a bitmask of threads in the current lane for which \p x is true.
-LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+RPC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
 #ifdef RPC_TARGET_IS_GPU
   return __gpu_ballot(lane_mask, x);
 #else
@@ -235,7 +229,7 @@ LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
 
 /// Return \p val aligned "upwards" according to \p align.
 template <typename V, typename A>
-LIBC_INLINE constexpr V align_up(V val, A align) {
+RPC_INLINE constexpr V align_up(V val, A align) {
   return ((val + V(align) - 1) / V(align)) * V(align);
 }
 
@@ -243,14 +237,14 @@ LIBC_INLINE constexpr V align_up(V val, A align) {
 /// model. On the GPU stack variables are always private to a lane so we can
 /// simply use the variable passed in. On the CPU we need to allocate enough
 /// space for the whole lane and index into it.
-template <typename V> LIBC_INLINE V &lane_value(V *val, uint32_t id) {
+template <typename V> RPC_INLINE V &lane_value(V *val, uint32_t id) {
   if constexpr (is_process_gpu())
     return *val;
   return val[id];
 }
 
 /// Advance the \p p by \p bytes.
-template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
+template <typename T, typename U> RPC_INLINE T *advance(T *ptr, U bytes) {
   if constexpr (is_const<T>::value)
     return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) +
                                  bytes);
@@ -259,15 +253,14 @@ template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
 }
 
 /// Wrapper around the optimal memory copy implementation for the target.
-LIBC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
+RPC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
   __builtin_memcpy(dst, src, count);
 }
 
-template <class T> LIBC_INLINE constexpr const T &max(const T &a, const T &b) {
+template <class T> RPC_INLINE constexpr const T &max(const T &a, const T &b) {
   return (a < b) ? b : a;
 }
 
 } // namespace rpc
-} // namespace LIBC_NAMESPACE_DECL
 
-#endif // LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
+#endif // LLVM_LIBC_SHARED_RPC_UTIL_H
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
index 183fc6f8683e067..0a7141fb60bf039 100644
--- a/libc/src/__support/RPC/CMakeLists.txt
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -2,20 +2,6 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
   return()
 endif()
 
-add_header_library(
-  rpc
-  HDRS
-    rpc.h
-    rpc_util.h
-  DEPENDS
-    libc.src.__support.common
-    libc.src.__support.CPP.algorithm
-    libc.src.__support.CPP.atomic
-    libc.src.__support.CPP.functional
-    libc.src.__support.CPP.optional
-    libc.src.__support.GPU.utils
-)
-
 add_object_library(
   rpc_client
   SRCS
@@ -25,5 +11,4 @@ add_object_library(
   DEPENDS
     libc.include.gpu_rpc
     libc.src.__support.GPU.utils
-    .rpc
 )
diff --git a/libc/src/__support/RPC/rpc_client.cpp b/libc/src/__support/RPC/rpc_client.cpp
index 232b20d008d1d59..c26cf9ca2ddbe6f 100644
--- a/libc/src/__support/RPC/rpc_client.cpp
+++ b/libc/src/__support/RPC/rpc_client.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "rpc_client.h"
-#include "rpc.h"
+
 #include "src/__support/macros/config.h"
 
 namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
index 7bd6d0b5e00b478..8923e62e0e22a0b 100644
--- a/libc/src/__support/RPC/rpc_client.h
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_CLIENT_H
 #define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_CLIENT_H
 
-#include "rpc.h"
+#include "shared/rpc.h"
 
 #include "include/llvm-libc-types/rpc_opcodes_t.h"
 #include "src/__support/CPP/type_traits.h"
@@ -18,6 +18,12 @@
 namespace LIBC_NAMESPACE_DECL {
 namespace rpc {
 
+using ::rpc::Buffer;
+using ::rpc::Client;
+using ::rpc::Port;
+using ::rpc::Process;
+using ::rpc::Server;
+
 static_assert(cpp::is_trivially_copyable<Client>::value &&
                   sizeof(Process<true>) == sizeof(Process<false>),
               "The client is not trivially copyable from the server");
diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
index 2dafa911783ffc4..b05ffb92699bf79 100644
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -17,27 +17,43 @@ using namespace LIBC_NAMESPACE;
 // as long as they are mirrored.
 static void test_interface(bool end_with_send) {
   uint64_t cnt = 0;
-  rpc::Client::Port port = rpc::client.open<RPC_TEST_INTERFACE>();
-  port.send(
-      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = end_with_send; });
-  port.send(
-      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
-  port.send(
-      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
-  port.send(
-      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
-  port.send(
-      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
-  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+  LIBC_NAMESPACE::rpc::Client::Port port =
+      LIBC_NAMESPACE::rpc::client.open<RPC_TEST_INTERFACE>();
+  port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    buffer->data[0] = end_with_send;
+  });
+  port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    buffer->data[0] = cnt = cnt + 1;
+  });
+  port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    cnt = buffer->data[0];
+  });
+  port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    buffer->data[0] = cnt = cnt + 1;
+  });
+  port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    cnt = buffer->data[0];
+  });
+  port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    buffer->data[0] = cnt = cnt + 1;
+  });
+  port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    buffer->data[0] = cnt = cnt + 1;
+  });
+  port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    cnt = buffer->data[0];
+  });
+  port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    cnt = buffer->data[0];
+  });
   if (end_with_send)
-    port.send([&](rpc::Buffer *buffer, uint32_t) {
+    port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = cnt = cnt + 1;
     });
   else
-    port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+      cnt = buffer->data[0];
+    });
   port.close();
 
   ASSERT_TRUE(cnt == 9 && "Invalid number of increments");
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
index 09a4ae67256e3a6..208130bcfd9a96e 100644
--- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
@@ -34,7 +34,8 @@ static void test_stream() {
 
   inline_memcpy(send_ptr, str, send_size);
   ASSERT_TRUE(inline_memcmp(send_ptr, str, send_size) == 0 && "Data mismatch");
-  rpc::Client::Port port = rpc::client.open<RPC_TEST_STREAM>();
+  LIBC_NAMESPACE::rpc::Client::Port port =
+      LIBC_NAMESPACE::rpc::client.open<RPC_TEST_STREAM>();
   port.send_n(send_ptr, send_size);
   port.recv_n(&recv_ptr, &recv_size,
               [](uint64_t size) { return malloc(size); });
@@ -77,7 +78,8 @@ static void test_divergent() {
   inline_memcpy(buffer, &data[offset], offset);
   ASSERT_TRUE(inline_memcmp(buffer, &data[offset], offset) == 0 &&
               "Data mismatch");
-  rpc::Client::Port port = rpc::client.open<RPC_TEST_STREAM>();
+  LIBC_NAMESPACE::rpc::Client::Port port =
+      LIBC_NAMESPACE::rpc::client.open<RPC_TEST_STREAM>();
   port.send_n(buffer, offset);
   inline_memset(buffer, offset, 0);
   port.recv_n(&recv_ptr, &recv_size, [&](uint64_t) { return buffer; });
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index bec8171180a0550..3deb72b9f85dab7 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -18,12 +18,13 @@ static void test_add_simple() {
       10 + 10 * gpu::get_thread_id() + 10 * gpu::get_block_id();
   uint64_t cnt = 0;
   for (uint32_t i = 0; i < num_additions; ++i) {
-    rpc::Client::Port port = rpc::client.open<RPC_TEST_INCREMENT>();
+    LIBC_NAMESPACE::rpc::Client::Port port =
+        LIBC_NAMESPACE::rpc::client.open<RPC_TEST_INCREMENT>();
     port.send_and_recv(
-        [=](rpc::Buffer *buffer, uint32_t) {
+        [=](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
           reinterpret_cast<uint64_t *>(buffer->data)[0] = cnt;
         },
-        [&](rpc::Buffer *buffer, uint32_t) {
+        [&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
           cnt = reinterpret_cast<uint64_t *>(buffer->data)[0];
         });
     port.close();
@@ -33,8 +34,11 @@ static void test_add_simple() {
 
 // Test to ensure that the RPC mechanism doesn't hang on divergence.
 static void test_noop(uint8_t data) {
-  rpc::Client::Port port = rpc::client.open<RPC_NOOP>();
-  port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = data; });
+  LIBC_NAMESPACE::rpc::Client::Port port =
+      LIBC_NAMESPACE::rpc::client.open<RPC_NOOP>();
+  port.send([=](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+    buffer->data[0] = data;
+  });
   port.close();
 }
 
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 972601aaf1d5e0f..d877cbc25a13d01 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -14,15 +14,16 @@
 // Make sure these are included first so they don't conflict with the system.
 #include <limits.h>
 
+#include "shared/rpc.h"
+
 #include "llvmlibc_rpc_server.h"
 
-#include "src/__support/RPC/rpc.h"
+#include "include/llvm-libc-types/rpc_opcodes_t.h"
 #include "src/__support/arg_list.h"
 #include "src/stdio/printf_core/converter.h"
 #include "src/stdio/printf_core/parser.h"
 #include "src/stdio/printf_core/writer.h"
 
-#include "src/stdio/gpu/file.h"
 #include <algorithm>
 #include <atomic>
 #include <cstdio>
@@ -53,6 +54,26 @@ struct TempStorage {
 };
 } // namespace
 
+enum Stream {
+  File = 0,
+  Stdin = 1,
+  Stdout = 2,
+  Stderr = 3,
+};
+
+// Get the associated stream out of an encoded number.
+LIBC_INLINE ::FILE *to_stream(uintptr_t f) {
+  ::FILE *stream = reinterpret_cast<FILE *>(f & ~0x3ull);
+  Stream type = static_cast<Stream>(f & 0x3ull);
+  if (type == Stdin)
+    return stdin;
+  if (type == Stdout)
+    return stdout;
+  if (type == Stderr)
+    return stderr;
+  return stream;
+}
+
 template <bool packed, uint32_t lane_size>
 static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
   FILE *files[lane_size] = {nullptr};
@@ -260,7 +281,7 @@ rpc_status_t handle_server_impl(
     port->recv([&](rpc::Buffer *buffer, uint32_t id) {
       data[id] = temp_storage.alloc(buffer->data[0]);
       sizes[id] =
-          fread(data[id], 1, buffer->data[0], file::to_stream(buffer->data[1]));
+          fread(data[id], 1, buffer->data[0], to_stream(buffer->data[1]));
     });
     port->send_n(data, sizes);
     port->send([&](rpc::Buffer *buffer, uint32_t id) {
@@ -273,9 +294,8 @@ rpc_status_t handle_server_impl(
     void *data[lane_size] = {nullptr};
     port->recv([&](rpc::Buffer *buffer, uint32_t id) {
       data[id] = temp_storage.alloc(buffer->data[0]);
-      const char *str =
-          fgets(reinterpret_cast<char *>(data[id]), buffer->data[0],
-                file::to_stream(buffer->data[1]));
+      const char *str = fgets(reinterpret_cast<char *>(data[id]),
+                              buffer->data[0], to_stream(buffer->data[1]));
       sizes[id] = !str ? 0 : std::strlen(str) + 1;
     });
     port->send_n(data, sizes);
@@ -335,46 +355,46 @@ rpc_status_t handle_server_impl(
   }
   case RPC_FEOF: {
     port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = feof(file::to_stream(buffer->data[0]));
+      buffer->data[0] = feof(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FERROR: {
     port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = ferror(file::to_stream(buffer->data[0]));
+      buffer->data[0] = ferror(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_CLEARERR: {
     port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
-      clearerr(file::to_stream(buffer->data[0]));
+      clearerr(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FSEEK: {
     port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = fseek(file::to_stream(buffer->data[0]),
-                              static_cast<long>(buffer->data[1]),
-                              static_cast<int>(buffer->data[2]));
+      buffer->data[0] =
+          fseek(to_stream(buffer->data[0]), static_cast<long>(buffer->data[1]),
+                static_cast<int>(buffer->data[2]));
     });
     break;
   }
   case RPC_FTELL: {
     port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = ftell(file::to_stream(buffer->data[0]));
+      buffer->data[0] = ftell(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FFLUSH: {
     port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = fflush(file::to_stream(buffer->data[0]));
+      buffer->data[0] = fflush(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_UNGETC: {
     port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
-      buffer->data[0] = ungetc(static_cast<int>(buffer->data[0]),
-                               file::to_stream(buffer->data[1]));
+      buffer->data[0] =
+          ungetc(static_cast<int>(buffer->data[0]), to_stream(buffer->data[1]));
     });
     break;
   }

>From 1144109437f9575519847005c68ad67acc15e5fd Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 20 Nov 2024 16:45:37 -0600
Subject: [PATCH 2/2] [libc] Remove RPC server API and use the header directly

Summary:
This patch removes much of the `llvmlibc_rpc_server` interface. This
pretty much deletes all of this code and just replaces it with including
`rpc.h` directly. We still maintain the file to let `libc` handle the
opcodes, since those depend on the `printf` impelmentation.

This will need to be cleaned up more, but I don't want to put too much
into a single patch.
---
 libc/shared/rpc.h                             |   7 +
 libc/utils/gpu/loader/Loader.h                | 204 +++++-----
 .../utils/gpu/loader/amdgpu/amdhsa-loader.cpp | 123 +++----
 libc/utils/gpu/loader/nvptx/nvptx-loader.cpp  |  96 ++---
 libc/utils/gpu/server/llvmlibc_rpc_server.h   |  94 +----
 libc/utils/gpu/server/rpc_server.cpp          | 348 +++++-------------
 offload/plugins-nextgen/common/CMakeLists.txt |   1 +
 offload/plugins-nextgen/common/include/RPC.h  |   2 +-
 offload/plugins-nextgen/common/src/RPC.cpp    | 126 +++----
 9 files changed, 339 insertions(+), 662 deletions(-)

diff --git a/libc/shared/rpc.h b/libc/shared/rpc.h
index 489a8cebfb807c4..c5e4277286c3999 100644
--- a/libc/shared/rpc.h
+++ b/libc/shared/rpc.h
@@ -42,6 +42,13 @@ namespace rpc {
 #define __scoped_atomic_thread_fence(ord, scp) __atomic_thread_fence(ord)
 #endif
 
+/// Generic codes that can be used whem implementing the server.
+enum Status {
+  SUCCESS = 0x0,
+  ERROR = 0x1000,
+  UNHANDLED_OPCODE = 0x1001,
+};
+
 /// A fixed size channel used to communicate between the RPC client and server.
 struct Buffer {
   uint64_t data[8];
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 8be8c0d5f85532f..fd5105b34709e6c 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -13,6 +13,7 @@
 
 #include "include/llvm-libc-types/rpc_opcodes_t.h"
 #include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "shared/rpc.h"
 
 #include <cstddef>
 #include <cstdint>
@@ -103,129 +104,90 @@ inline void handle_error_impl(const char *file, int32_t line, const char *msg) {
   fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, msg);
   exit(EXIT_FAILURE);
 }
-
-inline void handle_error_impl(const char *file, int32_t line,
-                              rpc_status_t err) {
-  fprintf(stderr, "%s:%d:0: Error: %d\n", file, line, err);
-  exit(EXIT_FAILURE);
-}
 #define handle_error(X) handle_error_impl(__FILE__, __LINE__, X)
 
-template <uint32_t lane_size>
-inline void register_rpc_callbacks(rpc_device_t device) {
-  static_assert(lane_size == 32 || lane_size == 64, "Invalid Lane size");
-  // Register the ping test for the `libc` tests.
-  rpc_register_callback(
-      device, static_cast<rpc_opcode_t>(RPC_TEST_INCREMENT),
-      [](rpc_port_t port, void *data) {
-        rpc_recv_and_send(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
-            },
-            data);
-      },
-      nullptr);
-
-  // Register the interface test callbacks.
-  rpc_register_callback(
-      device, static_cast<rpc_opcode_t>(RPC_TEST_INTERFACE),
-      [](rpc_port_t port, void *data) {
-        uint64_t cnt = 0;
-        bool end_with_recv;
-        rpc_recv(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              *reinterpret_cast<bool *>(data) = buffer->data[0];
-            },
-            &end_with_recv);
-        rpc_recv(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
-            },
-            &cnt);
-        rpc_send(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
-              buffer->data[0] = cnt = cnt + 1;
-            },
-            &cnt);
-        rpc_recv(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
-            },
-            &cnt);
-        rpc_send(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
-              buffer->data[0] = cnt = cnt + 1;
-            },
-            &cnt);
-        rpc_recv(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
-            },
-            &cnt);
-        rpc_recv(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
-            },
-            &cnt);
-        rpc_send(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
-              buffer->data[0] = cnt = cnt + 1;
-            },
-            &cnt);
-        rpc_send(
-            port,
-            [](rpc_buffer_t *buffer, void *data) {
-              uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
-              buffer->data[0] = cnt = cnt + 1;
-            },
-            &cnt);
-        if (end_with_recv)
-          rpc_recv(
-              port,
-              [](rpc_buffer_t *buffer, void *data) {
-                *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
-              },
-              &cnt);
-        else
-          rpc_send(
-              port,
-              [](rpc_buffer_t *buffer, void *data) {
-                uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
-                buffer->data[0] = cnt = cnt + 1;
-              },
-              &cnt);
-      },
-      nullptr);
-
-  // Register the stream test handler.
-  rpc_register_callback(
-      device, static_cast<rpc_opcode_t>(RPC_TEST_STREAM),
-      [](rpc_port_t port, void *data) {
-        uint64_t sizes[lane_size] = {0};
-        void *dst[lane_size] = {nullptr};
-        rpc_recv_n(
-            port, dst, sizes,
-            [](uint64_t size, void *) -> void * { return new char[size]; },
-            nullptr);
-        rpc_send_n(port, dst, sizes);
-        for (uint64_t i = 0; i < lane_size; ++i) {
-          if (dst[i])
-            delete[] reinterpret_cast<uint8_t *>(dst[i]);
-        }
-      },
-      nullptr);
+template <uint32_t num_lanes, typename Alloc, typename Free>
+inline uint32_t handle_server(rpc::Server &server, uint32_t index,
+                              Alloc &&alloc, Free &&free) {
+  auto port = server.try_open(num_lanes, index);
+  if (!port)
+    return 0;
+  index = port->get_index() + 1;
+
+  switch (port->get_opcode()) {
+  case RPC_TEST_INCREMENT: {
+    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+      reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
+    });
+    break;
+  }
+  case RPC_TEST_INTERFACE: {
+    bool end_with_recv;
+    uint64_t cnt;
+    port->recv([&](rpc::Buffer *buffer, uint32_t) {
+      end_with_recv = buffer->data[0];
+    });
+    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port->send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
+    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port->send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
+    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    port->send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
+    port->send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
+    if (end_with_recv)
+      port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+    else
+      port->send([&](rpc::Buffer *buffer, uint32_t) {
+        buffer->data[0] = cnt = cnt + 1;
+      });
+
+    break;
+  }
+  case RPC_TEST_STREAM: {
+    uint64_t sizes[num_lanes] = {0};
+    void *dst[num_lanes] = {nullptr};
+    port->recv_n(dst, sizes,
+                 [](uint64_t size) -> void * { return new char[size]; });
+    port->send_n(dst, sizes);
+    for (uint64_t i = 0; i < num_lanes; ++i) {
+      if (dst[i])
+        delete[] reinterpret_cast<uint8_t *>(dst[i]);
+    }
+    break;
+  }
+  case RPC_MALLOC: {
+    port->recv_and_send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = reinterpret_cast<uintptr_t>(alloc(buffer->data[0]));
+    });
+    break;
+  }
+  case RPC_FREE: {
+    port->recv([&](rpc::Buffer *buffer, uint32_t) {
+      free(reinterpret_cast<void *>(buffer->data[0]));
+    });
+    break;
+  }
+  default:
+    break;
+  }
+
+  // Handle all of the `libc` specific opcodes.
+  int status = libc_handle_rpc_port(&*port, num_lanes);
+  if (status != rpc::SUCCESS)
+    handle_error("Error handling RPC server");
+
+  port->close();
+
+  return index;
 }
 
 #endif
diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
index cb81a866622f937..4849022e89d3306 100644
--- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -160,7 +160,7 @@ template <typename args_t>
 hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
                            hsa_amd_memory_pool_t kernargs_pool,
                            hsa_amd_memory_pool_t coarsegrained_pool,
-                           hsa_queue_t *queue, rpc_device_t device,
+                           hsa_queue_t *queue, rpc::Server &server,
                            const LaunchParameters &params,
                            const char *kernel_name, args_t kernel_args,
                            bool print_resource_usage) {
@@ -170,37 +170,10 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
           executable, kernel_name, &dev_agent, &symbol))
     return err;
 
-  // Register RPC callbacks for the malloc and free functions on HSA.
-  auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
-  rpc_register_callback(
-      device, RPC_MALLOC,
-      [](rpc_port_t port, void *data) {
-        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
-          auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
-          uint64_t size = buffer->data[0];
-          void *dev_ptr = nullptr;
-          if (hsa_status_t err =
-                  hsa_amd_memory_pool_allocate(pool, size,
-                                               /*flags=*/0, &dev_ptr))
-            dev_ptr = nullptr;
-          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
-          buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
-        };
-        rpc_recv_and_send(port, malloc_handler, data);
-      },
-      &tuple);
-  rpc_register_callback(
-      device, RPC_FREE,
-      [](rpc_port_t port, void *data) {
-        auto free_handler = [](rpc_buffer_t *buffer, void *) {
-          if (hsa_status_t err = hsa_amd_memory_pool_free(
-                  reinterpret_cast<void *>(buffer->data[0])))
-            handle_error(err);
-        };
-        rpc_recv_and_send(port, free_handler, data);
-      },
-      nullptr);
-
+  uint32_t wavefront_size = 0;
+  if (hsa_status_t err = hsa_agent_get_info(
+          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
+    handle_error(err);
   // Retrieve different properties of the kernel symbol used for launch.
   uint64_t kernel;
   uint32_t args_size;
@@ -292,14 +265,38 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
   hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
 
   std::atomic<bool> finished = false;
-  std::thread server(
-      [](std::atomic<bool> *finished, rpc_device_t device) {
-        while (!*finished) {
-          if (rpc_status_t err = rpc_handle_server(device))
+  std::thread server_thread(
+      [](std::atomic<bool> *finished, rpc::Server *server,
+         uint32_t wavefront_size, hsa_agent_t dev_agent,
+         hsa_amd_memory_pool_t coarsegrained_pool) {
+        // Register RPC callbacks for the malloc and free functions on HSA.
+        auto malloc_handler = [&](size_t size) -> void * {
+          void *dev_ptr = nullptr;
+          if (hsa_status_t err =
+                  hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
+                                               /*flags=*/0, &dev_ptr))
+            dev_ptr = nullptr;
+          hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+          return dev_ptr;
+        };
+
+        auto free_handler = [](void *ptr) -> void {
+          if (hsa_status_t err =
+                  hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
             handle_error(err);
+        };
+
+        uint32_t index = 0;
+        while (!*finished) {
+          if (wavefront_size == 32)
+            index =
+                handle_server<32>(*server, index, malloc_handler, free_handler);
+          else
+            index =
+                handle_server<64>(*server, index, malloc_handler, free_handler);
         }
       },
-      &finished, device);
+      &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);
 
   // Wait until the kernel has completed execution on the device. Periodically
   // check the RPC client for work to be performed on the server.
@@ -309,8 +306,8 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
     ;
 
   finished = true;
-  if (server.joinable())
-    server.join();
+  if (server_thread.joinable())
+    server_thread.join();
 
   // Destroy the resources acquired to launch the kernel and return.
   if (hsa_status_t err = hsa_amd_memory_pool_free(args))
@@ -452,34 +449,22 @@ int load(int argc, const char **argv, const char **envp, void *image,
     handle_error(err);
 
   // Set up the RPC server.
-  auto tuple = std::make_tuple(dev_agent, finegrained_pool);
-  auto rpc_alloc = [](uint64_t size, void *data) {
-    auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
-    void *dev_ptr = nullptr;
-    if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
-                                                        /*flags=*/0, &dev_ptr))
-      handle_error(err);
-    hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
-    return dev_ptr;
-  };
-  rpc_device_t device;
-  if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
-                                         wavefront_size, rpc_alloc, &tuple))
+  void *rpc_buffer;
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          finegrained_pool,
+          rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
+          /*flags=*/0, &rpc_buffer))
     handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);
 
-  // Register callbacks for the RPC unit tests.
-  if (wavefront_size == 32)
-    register_rpc_callbacks<32>(device);
-  else if (wavefront_size == 64)
-    register_rpc_callbacks<64>(device);
-  else
-    handle_error("Invalid wavefront size");
+  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
 
   // Initialize the RPC client on the device by copying the local data to the
   // device's internal pointer.
   hsa_executable_symbol_t rpc_client_sym;
   if (hsa_status_t err = hsa_executable_get_symbol_by_name(
-          executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
+          executable, "__llvm_libc_rpc_client", &dev_agent, &rpc_client_sym))
     handle_error(err);
 
   void *rpc_client_host;
@@ -502,19 +487,17 @@ int load(int argc, const char **argv, const char **envp, void *image,
 
   void *rpc_client_buffer;
   if (hsa_status_t err =
-          hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)),
-                              rpc_get_client_size(),
+          hsa_amd_memory_lock(&client, sizeof(rpc::Client),
                               /*agents=*/nullptr, 0, &rpc_client_buffer))
     handle_error(err);
 
   // Copy the RPC client buffer to the address pointed to by the symbol.
   if (hsa_status_t err =
           hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
-                     rpc_client_buffer, host_agent, rpc_get_client_size()))
+                     rpc_client_buffer, host_agent, sizeof(rpc::Client)))
     handle_error(err);
 
-  if (hsa_status_t err = hsa_amd_memory_unlock(
-          const_cast<void *>(rpc_get_client_buffer(device))))
+  if (hsa_status_t err = hsa_amd_memory_unlock(&client))
     handle_error(err);
   if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
     handle_error(err);
@@ -566,7 +549,7 @@ int load(int argc, const char **argv, const char **envp, void *image,
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   begin_args_t init_args = {argc, dev_argv, dev_envp};
   if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
-                                       coarsegrained_pool, queue, device,
+                                       coarsegrained_pool, queue, server,
                                        single_threaded_params, "_begin.kd",
                                        init_args, print_resource_usage))
     handle_error(err);
@@ -574,7 +557,7 @@ int load(int argc, const char **argv, const char **envp, void *image,
   start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
   if (hsa_status_t err = launch_kernel(
           dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
-          device, params, "_start.kd", args, print_resource_usage))
+          server, params, "_start.kd", args, print_resource_usage))
     handle_error(err);
 
   void *host_ret;
@@ -593,14 +576,12 @@ int load(int argc, const char **argv, const char **envp, void *image,
 
   end_args_t fini_args = {ret};
   if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
-                                       coarsegrained_pool, queue, device,
+                                       coarsegrained_pool, queue, server,
                                        single_threaded_params, "_end.kd",
                                        fini_args, print_resource_usage))
     handle_error(err);
 
-  if (rpc_status_t err = rpc_server_shutdown(
-          device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
-          nullptr))
+  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
     handle_error(err);
 
   // Free the memory allocated for the device.
diff --git a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
index 58e5e5f04d0a709..0ba217451feaea9 100644
--- a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
@@ -167,10 +167,9 @@ void print_kernel_resources(CUmodule binary, const char *kernel_name) {
 }
 
 template <typename args_t>
-CUresult launch_kernel(CUmodule binary, CUstream stream,
-                       rpc_device_t rpc_device, const LaunchParameters &params,
-                       const char *kernel_name, args_t kernel_args,
-                       bool print_resource_usage) {
+CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
+                       const LaunchParameters &params, const char *kernel_name,
+                       args_t kernel_args, bool print_resource_usage) {
   // look up the '_start' kernel in the loaded module.
   CUfunction function;
   if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
@@ -181,23 +180,21 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
   void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
+  if (print_resource_usage)
+    print_kernel_resources(binary, kernel_name);
 
-  // Initialize a non-blocking CUDA stream to allocate memory if needed. This
-  // needs to be done on a separate stream or else it will deadlock with the
-  // executing kernel.
+  // Initialize a non-blocking CUDA stream to allocate memory if needed.
+  // This needs to be done on a separate stream or else it will deadlock
+  // with the executing kernel.
   CUstream memory_stream;
   if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
     handle_error(err);
 
-  // Register RPC callbacks for the malloc and free functions on HSA.
-  register_rpc_callbacks<32>(rpc_device);
-
-  rpc_register_callback(
-      rpc_device, RPC_MALLOC,
-      [](rpc_port_t port, void *data) {
-        auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
-          CUstream memory_stream = *static_cast<CUstream *>(data);
-          uint64_t size = buffer->data[0];
+  std::atomic<bool> finished = false;
+  std::thread server_thread(
+      [](std::atomic<bool> *finished, rpc::Server *server,
+         CUstream memory_stream) {
+        auto malloc_handler = [&](size_t size) -> void * {
           CUdeviceptr dev_ptr;
           if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
             dev_ptr = 0UL;
@@ -205,36 +202,22 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
           // Wait until the memory allocation is complete.
           while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
             ;
-          buffer->data[0] = static_cast<uintptr_t>(dev_ptr);
+          return reinterpret_cast<void *>(dev_ptr);
         };
-        rpc_recv_and_send(port, malloc_handler, data);
-      },
-      &memory_stream);
-  rpc_register_callback(
-      rpc_device, RPC_FREE,
-      [](rpc_port_t port, void *data) {
-        auto free_handler = [](rpc_buffer_t *buffer, void *data) {
-          CUstream memory_stream = *static_cast<CUstream *>(data);
-          if (CUresult err = cuMemFreeAsync(
-                  static_cast<CUdeviceptr>(buffer->data[0]), memory_stream))
+
+        auto free_handler = [&](void *ptr) -> void {
+          if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr),
+                                            memory_stream))
             handle_error(err);
         };
-        rpc_recv_and_send(port, free_handler, data);
-      },
-      &memory_stream);
 
-  if (print_resource_usage)
-    print_kernel_resources(binary, kernel_name);
-
-  std::atomic<bool> finished = false;
-  std::thread server(
-      [](std::atomic<bool> *finished, rpc_device_t device) {
+        uint32_t index = 0;
         while (!*finished) {
-          if (rpc_status_t err = rpc_handle_server(device))
-            handle_error(err);
+          index =
+              handle_server<32>(*server, index, malloc_handler, free_handler);
         }
       },
-      &finished, rpc_device);
+      &finished, &server, memory_stream);
 
   // Call the kernel with the given arguments.
   if (CUresult err = cuLaunchKernel(
@@ -247,8 +230,8 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
     handle_error(err);
 
   finished = true;
-  if (server.joinable())
-    server.join();
+  if (server_thread.joinable())
+    server_thread.join();
 
   return CUDA_SUCCESS;
 }
@@ -318,23 +301,20 @@ int load(int argc, const char **argv, const char **envp, void *image,
     handle_error(err);
 
   uint32_t warp_size = 32;
-  auto rpc_alloc = [](uint64_t size, void *) -> void * {
-    void *dev_ptr;
-    if (CUresult err = cuMemAllocHost(&dev_ptr, size))
-      handle_error(err);
-    return dev_ptr;
-  };
-  rpc_device_t rpc_device;
-  if (rpc_status_t err = rpc_server_init(&rpc_device, RPC_MAXIMUM_PORT_COUNT,
-                                         warp_size, rpc_alloc, nullptr))
+  void *rpc_buffer = nullptr;
+  if (CUresult err = cuMemAllocHost(
+          &rpc_buffer,
+          rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
     handle_error(err);
+  rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+  rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
 
   // Initialize the RPC client on the device by copying the local data to the
   // device's internal pointer.
   CUdeviceptr rpc_client_dev = 0;
   uint64_t client_ptr_size = sizeof(void *);
   if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
-                                       binary, rpc_client_symbol_name))
+                                       binary, "__llvm_libc_rpc_client"))
     handle_error(err);
 
   CUdeviceptr rpc_client_host = 0;
@@ -342,20 +322,19 @@ int load(int argc, const char **argv, const char **envp, void *image,
           cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
     handle_error(err);
   if (CUresult err =
-          cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(rpc_device),
-                       rpc_get_client_size()))
+          cuMemcpyHtoD(rpc_client_host, &client, sizeof(rpc::Client)))
     handle_error(err);
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   begin_args_t init_args = {argc, dev_argv, dev_envp};
   if (CUresult err =
-          launch_kernel(binary, stream, rpc_device, single_threaded_params,
+          launch_kernel(binary, stream, server, single_threaded_params,
                         "_begin", init_args, print_resource_usage))
     handle_error(err);
 
   start_args_t args = {argc, dev_argv, dev_envp,
                        reinterpret_cast<void *>(dev_ret)};
-  if (CUresult err = launch_kernel(binary, stream, rpc_device, params, "_start",
+  if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
                                    args, print_resource_usage))
     handle_error(err);
 
@@ -369,8 +348,8 @@ int load(int argc, const char **argv, const char **envp, void *image,
 
   end_args_t fini_args = {host_ret};
   if (CUresult err =
-          launch_kernel(binary, stream, rpc_device, single_threaded_params,
-                        "_end", fini_args, print_resource_usage))
+          launch_kernel(binary, stream, server, single_threaded_params, "_end",
+                        fini_args, print_resource_usage))
     handle_error(err);
 
   // Free the memory allocated for the device.
@@ -380,8 +359,7 @@ int load(int argc, const char **argv, const char **envp, void *image,
     handle_error(err);
   if (CUresult err = cuMemFreeHost(dev_argv))
     handle_error(err);
-  if (rpc_status_t err = rpc_server_shutdown(
-          rpc_device, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr))
+  if (CUresult err = cuMemFreeHost(rpc_buffer))
     handle_error(err);
 
   // Destroy the context and the loaded binary.
diff --git a/libc/utils/gpu/server/llvmlibc_rpc_server.h b/libc/utils/gpu/server/llvmlibc_rpc_server.h
index 98df882afa21cf6..b7f173734345c0a 100644
--- a/libc/utils/gpu/server/llvmlibc_rpc_server.h
+++ b/libc/utils/gpu/server/llvmlibc_rpc_server.h
@@ -15,99 +15,7 @@
 extern "C" {
 #endif
 
-/// The maximum number of ports that can be opened for any server.
-const uint64_t RPC_MAXIMUM_PORT_COUNT = 4096;
-
-/// The symbol name associated with the client for use with the LLVM C library
-/// implementation.
-const char *const rpc_client_symbol_name = "__llvm_libc_rpc_client";
-
-/// status codes.
-typedef enum {
-  RPC_STATUS_SUCCESS = 0x0,
-  RPC_STATUS_CONTINUE = 0x1,
-  RPC_STATUS_ERROR = 0x1000,
-  RPC_STATUS_UNHANDLED_OPCODE = 0x1001,
-  RPC_STATUS_INVALID_LANE_SIZE = 0x1002,
-} rpc_status_t;
-
-/// A struct containing an opaque handle to an RPC port. This is what allows the
-/// server to communicate with the client.
-typedef struct rpc_port_s {
-  uint64_t handle;
-  uint32_t lane_size;
-} rpc_port_t;
-
-/// A fixed-size buffer containing the payload sent from the client.
-typedef struct rpc_buffer_s {
-  uint64_t data[8];
-} rpc_buffer_t;
-
-/// An opaque handle to an RPC server that can be attached to a device.
-typedef struct rpc_device_s {
-  uintptr_t handle;
-} rpc_device_t;
-
-/// A function used to allocate \p bytes for use by the RPC server and client.
-/// The memory should support asynchronous and atomic access from both the
-/// client and server.
-typedef void *(*rpc_alloc_ty)(uint64_t size, void *data);
-
-/// A function used to free the \p ptr previously allocated.
-typedef void (*rpc_free_ty)(void *ptr, void *data);
-
-/// A callback function provided with a \p port to communicate with the RPC
-/// client. This will be called by the server to handle an opcode.
-typedef void (*rpc_opcode_callback_ty)(rpc_port_t port, void *data);
-
-/// A callback function to use the port to receive or send a \p buffer.
-typedef void (*rpc_port_callback_ty)(rpc_buffer_t *buffer, void *data);
-
-/// Initialize the server for a given device and return it in \p device.
-rpc_status_t rpc_server_init(rpc_device_t *rpc_device, uint64_t num_ports,
-                             uint32_t lane_size, rpc_alloc_ty alloc,
-                             void *data);
-
-/// Shut down the server for a given device.
-rpc_status_t rpc_server_shutdown(rpc_device_t rpc_device, rpc_free_ty dealloc,
-                                 void *data);
-
-/// Queries the RPC clients at least once and performs server-side work if there
-/// are any active requests. Runs until all work on the server is completed.
-rpc_status_t rpc_handle_server(rpc_device_t rpc_device);
-
-/// Register a callback to handle an opcode from the RPC client. The associated
-/// data must remain accessible as long as the user intends to handle the server
-/// with this callback.
-rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint32_t opcode,
-                                   rpc_opcode_callback_ty callback, void *data);
-
-/// Obtain a pointer to a local client buffer that can be copied directly to the
-/// other process using the address stored at the rpc client symbol name.
-const void *rpc_get_client_buffer(rpc_device_t device);
-
-/// Returns the size of the client in bytes to be used for a memory copy.
-uint64_t rpc_get_client_size();
-
-/// Use the \p port to send a buffer using the \p callback.
-void rpc_send(rpc_port_t port, rpc_port_callback_ty callback, void *data);
-
-/// Use the \p port to send \p bytes using the \p callback. The input is an
-/// array of at least the configured lane size.
-void rpc_send_n(rpc_port_t port, const void *const *src, uint64_t *size);
-
-/// Use the \p port to recieve a buffer using the \p callback.
-void rpc_recv(rpc_port_t port, rpc_port_callback_ty callback, void *data);
-
-/// Use the \p port to recieve \p bytes using the \p callback. The inputs is an
-/// array of at least the configured lane size. The \p alloc function allocates
-/// memory for the recieved bytes.
-void rpc_recv_n(rpc_port_t port, void **dst, uint64_t *size, rpc_alloc_ty alloc,
-                void *data);
-
-/// Use the \p port to receive and send a buffer using the \p callback.
-void rpc_recv_and_send(rpc_port_t port, rpc_port_callback_ty callback,
-                       void *data);
+int libc_handle_rpc_port(void *port, uint32_t num_lanes);
 
 #ifdef __cplusplus
 }
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index d877cbc25a13d01..1fdbb79df7e3e02 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -37,12 +37,6 @@
 using namespace LIBC_NAMESPACE;
 using namespace LIBC_NAMESPACE::printf_core;
 
-static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
-              "Buffer size mismatch");
-
-static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
-              "Incorrect maximum port count");
-
 namespace {
 struct TempStorage {
   char *alloc(size_t size) {
@@ -74,9 +68,9 @@ LIBC_INLINE ::FILE *to_stream(uintptr_t f) {
   return stream;
 }
 
-template <bool packed, uint32_t lane_size>
+template <bool packed, uint32_t num_lanes>
 static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
-  FILE *files[lane_size] = {nullptr};
+  FILE *files[num_lanes] = {nullptr};
   // Get the appropriate output stream to use.
   if (port.get_opcode() == RPC_PRINTF_TO_STREAM ||
       port.get_opcode() == RPC_PRINTF_TO_STREAM_PACKED)
@@ -85,22 +79,22 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
     });
   else if (port.get_opcode() == RPC_PRINTF_TO_STDOUT ||
            port.get_opcode() == RPC_PRINTF_TO_STDOUT_PACKED)
-    std::fill(files, files + lane_size, stdout);
+    std::fill(files, files + num_lanes, stdout);
   else
-    std::fill(files, files + lane_size, stderr);
+    std::fill(files, files + num_lanes, stderr);
 
-  uint64_t format_sizes[lane_size] = {0};
-  void *format[lane_size] = {nullptr};
+  uint64_t format_sizes[num_lanes] = {0};
+  void *format[num_lanes] = {nullptr};
 
-  uint64_t args_sizes[lane_size] = {0};
-  void *args[lane_size] = {nullptr};
+  uint64_t args_sizes[num_lanes] = {0};
+  void *args[num_lanes] = {nullptr};
 
   // Recieve the format string and arguments from the client.
   port.recv_n(format, format_sizes,
               [&](uint64_t size) { return temp_storage.alloc(size); });
 
   // Parse the format string to get the expected size of the buffer.
-  for (uint32_t lane = 0; lane < lane_size; ++lane) {
+  for (uint32_t lane = 0; lane < num_lanes; ++lane) {
     if (!format[lane])
       continue;
 
@@ -125,9 +119,9 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
 
   // Identify any arguments that are actually pointers to strings on the client.
   // Additionally we want to determine how much buffer space we need to print.
-  std::vector<void *> strs_to_copy[lane_size];
-  int buffer_size[lane_size] = {0};
-  for (uint32_t lane = 0; lane < lane_size; ++lane) {
+  std::vector<void *> strs_to_copy[num_lanes];
+  int buffer_size[num_lanes] = {0};
+  for (uint32_t lane = 0; lane < num_lanes; ++lane) {
     if (!format[lane])
       continue;
 
@@ -159,7 +153,7 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
   }
 
   // Recieve any strings from the client and push them into a buffer.
-  std::vector<void *> copied_strs[lane_size];
+  std::vector<void *> copied_strs[num_lanes];
   while (std::any_of(std::begin(strs_to_copy), std::end(strs_to_copy),
                      [](const auto &v) { return !v.empty() && v.back(); })) {
     port.send([&](rpc::Buffer *buffer, uint32_t id) {
@@ -168,11 +162,11 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
       if (!strs_to_copy[id].empty())
         strs_to_copy[id].pop_back();
     });
-    uint64_t str_sizes[lane_size] = {0};
-    void *strs[lane_size] = {nullptr};
+    uint64_t str_sizes[num_lanes] = {0};
+    void *strs[num_lanes] = {nullptr};
     port.recv_n(strs, str_sizes,
                 [&](uint64_t size) { return temp_storage.alloc(size); });
-    for (uint32_t lane = 0; lane < lane_size; ++lane) {
+    for (uint32_t lane = 0; lane < num_lanes; ++lane) {
       if (!strs[lane])
         continue;
 
@@ -182,8 +176,8 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
   }
 
   // Perform the final formatting and printing using the LLVM C library printf.
-  int results[lane_size] = {0};
-  for (uint32_t lane = 0; lane < lane_size; ++lane) {
+  int results[num_lanes] = {0};
+  for (uint32_t lane = 0; lane < num_lanes; ++lane) {
     if (!format[lane])
       continue;
 
@@ -233,42 +227,34 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
   });
 }
 
-template <uint32_t lane_size>
-rpc_status_t handle_server_impl(
-    rpc::Server &server,
-    const std::unordered_map<uint32_t, rpc_opcode_callback_ty> &callbacks,
-    const std::unordered_map<uint32_t, void *> &callback_data,
-    uint32_t &index) {
-  auto port = server.try_open(lane_size, index);
-  if (!port)
-    return RPC_STATUS_SUCCESS;
-
+template <uint32_t num_lanes>
+rpc::Status handle_port_impl(rpc::Server::Port &port) {
   TempStorage temp_storage;
 
-  switch (port->get_opcode()) {
+  switch (port.get_opcode()) {
   case RPC_WRITE_TO_STREAM:
   case RPC_WRITE_TO_STDERR:
   case RPC_WRITE_TO_STDOUT:
   case RPC_WRITE_TO_STDOUT_NEWLINE: {
-    uint64_t sizes[lane_size] = {0};
-    void *strs[lane_size] = {nullptr};
-    FILE *files[lane_size] = {nullptr};
-    if (port->get_opcode() == RPC_WRITE_TO_STREAM) {
-      port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+    uint64_t sizes[num_lanes] = {0};
+    void *strs[num_lanes] = {nullptr};
+    FILE *files[num_lanes] = {nullptr};
+    if (port.get_opcode() == RPC_WRITE_TO_STREAM) {
+      port.recv([&](rpc::Buffer *buffer, uint32_t id) {
         files[id] = reinterpret_cast<FILE *>(buffer->data[0]);
       });
-    } else if (port->get_opcode() == RPC_WRITE_TO_STDERR) {
-      std::fill(files, files + lane_size, stderr);
+    } else if (port.get_opcode() == RPC_WRITE_TO_STDERR) {
+      std::fill(files, files + num_lanes, stderr);
     } else {
-      std::fill(files, files + lane_size, stdout);
+      std::fill(files, files + num_lanes, stdout);
     }
 
-    port->recv_n(strs, sizes,
-                 [&](uint64_t size) { return temp_storage.alloc(size); });
-    port->send([&](rpc::Buffer *buffer, uint32_t id) {
+    port.recv_n(strs, sizes,
+                [&](uint64_t size) { return temp_storage.alloc(size); });
+    port.send([&](rpc::Buffer *buffer, uint32_t id) {
       flockfile(files[id]);
       buffer->data[0] = fwrite_unlocked(strs[id], 1, sizes[id], files[id]);
-      if (port->get_opcode() == RPC_WRITE_TO_STDOUT_NEWLINE &&
+      if (port.get_opcode() == RPC_WRITE_TO_STDOUT_NEWLINE &&
           buffer->data[0] == sizes[id])
         buffer->data[0] += fwrite_unlocked("\n", 1, 1, files[id]);
       funlockfile(files[id]);
@@ -276,37 +262,37 @@ rpc_status_t handle_server_impl(
     break;
   }
   case RPC_READ_FROM_STREAM: {
-    uint64_t sizes[lane_size] = {0};
-    void *data[lane_size] = {nullptr};
-    port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+    uint64_t sizes[num_lanes] = {0};
+    void *data[num_lanes] = {nullptr};
+    port.recv([&](rpc::Buffer *buffer, uint32_t id) {
       data[id] = temp_storage.alloc(buffer->data[0]);
       sizes[id] =
           fread(data[id], 1, buffer->data[0], to_stream(buffer->data[1]));
     });
-    port->send_n(data, sizes);
-    port->send([&](rpc::Buffer *buffer, uint32_t id) {
+    port.send_n(data, sizes);
+    port.send([&](rpc::Buffer *buffer, uint32_t id) {
       std::memcpy(buffer->data, &sizes[id], sizeof(uint64_t));
     });
     break;
   }
   case RPC_READ_FGETS: {
-    uint64_t sizes[lane_size] = {0};
-    void *data[lane_size] = {nullptr};
-    port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+    uint64_t sizes[num_lanes] = {0};
+    void *data[num_lanes] = {nullptr};
+    port.recv([&](rpc::Buffer *buffer, uint32_t id) {
       data[id] = temp_storage.alloc(buffer->data[0]);
       const char *str = fgets(reinterpret_cast<char *>(data[id]),
                               buffer->data[0], to_stream(buffer->data[1]));
       sizes[id] = !str ? 0 : std::strlen(str) + 1;
     });
-    port->send_n(data, sizes);
+    port.send_n(data, sizes);
     break;
   }
   case RPC_OPEN_FILE: {
-    uint64_t sizes[lane_size] = {0};
-    void *paths[lane_size] = {nullptr};
-    port->recv_n(paths, sizes,
-                 [&](uint64_t size) { return temp_storage.alloc(size); });
-    port->recv_and_send([&](rpc::Buffer *buffer, uint32_t id) {
+    uint64_t sizes[num_lanes] = {0};
+    void *paths[num_lanes] = {nullptr};
+    port.recv_n(paths, sizes,
+                [&](uint64_t size) { return temp_storage.alloc(size); });
+    port.recv_and_send([&](rpc::Buffer *buffer, uint32_t id) {
       FILE *file = fopen(reinterpret_cast<char *>(paths[id]),
                          reinterpret_cast<char *>(buffer->data));
       buffer->data[0] = reinterpret_cast<uintptr_t>(file);
@@ -314,7 +300,7 @@ rpc_status_t handle_server_impl(
     break;
   }
   case RPC_CLOSE_FILE: {
-    port->recv_and_send([&](rpc::Buffer *buffer, uint32_t id) {
+    port.recv_and_send([&](rpc::Buffer *buffer, uint32_t id) {
       FILE *file = reinterpret_cast<FILE *>(buffer->data[0]);
       buffer->data[0] = fclose(file);
     });
@@ -322,8 +308,8 @@ rpc_status_t handle_server_impl(
   }
   case RPC_EXIT: {
     // Send a response to the client to signal that we are ready to exit.
-    port->recv_and_send([](rpc::Buffer *, uint32_t) {});
-    port->recv([](rpc::Buffer *buffer, uint32_t) {
+    port.recv_and_send([](rpc::Buffer *, uint32_t) {});
+    port.recv([](rpc::Buffer *buffer, uint32_t) {
       int status = 0;
       std::memcpy(&status, buffer->data, sizeof(int));
       exit(status);
@@ -332,47 +318,47 @@ rpc_status_t handle_server_impl(
   }
   case RPC_ABORT: {
     // Send a response to the client to signal that we are ready to abort.
-    port->recv_and_send([](rpc::Buffer *, uint32_t) {});
-    port->recv([](rpc::Buffer *, uint32_t) {});
+    port.recv_and_send([](rpc::Buffer *, uint32_t) {});
+    port.recv([](rpc::Buffer *, uint32_t) {});
     abort();
     break;
   }
   case RPC_HOST_CALL: {
-    uint64_t sizes[lane_size] = {0};
-    unsigned long long results[lane_size] = {0};
-    void *args[lane_size] = {nullptr};
-    port->recv_n(args, sizes,
-                 [&](uint64_t size) { return temp_storage.alloc(size); });
-    port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+    uint64_t sizes[num_lanes] = {0};
+    unsigned long long results[num_lanes] = {0};
+    void *args[num_lanes] = {nullptr};
+    port.recv_n(args, sizes,
+                [&](uint64_t size) { return temp_storage.alloc(size); });
+    port.recv([&](rpc::Buffer *buffer, uint32_t id) {
       using func_ptr_t = unsigned long long (*)(void *);
       auto func = reinterpret_cast<func_ptr_t>(buffer->data[0]);
       results[id] = func(args[id]);
     });
-    port->send([&](rpc::Buffer *buffer, uint32_t id) {
+    port.send([&](rpc::Buffer *buffer, uint32_t id) {
       buffer->data[0] = static_cast<uint64_t>(results[id]);
     });
     break;
   }
   case RPC_FEOF: {
-    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+    port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = feof(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FERROR: {
-    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+    port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = ferror(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_CLEARERR: {
-    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+    port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       clearerr(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FSEEK: {
-    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+    port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] =
           fseek(to_stream(buffer->data[0]), static_cast<long>(buffer->data[1]),
                 static_cast<int>(buffer->data[2]));
@@ -380,19 +366,19 @@ rpc_status_t handle_server_impl(
     break;
   }
   case RPC_FTELL: {
-    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+    port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = ftell(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_FFLUSH: {
-    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+    port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] = fflush(to_stream(buffer->data[0]));
     });
     break;
   }
   case RPC_UNGETC: {
-    port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+    port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
       buffer->data[0] =
           ungetc(static_cast<int>(buffer->data[0]), to_stream(buffer->data[1]));
     });
@@ -401,36 +387,36 @@ rpc_status_t handle_server_impl(
   case RPC_PRINTF_TO_STREAM_PACKED:
   case RPC_PRINTF_TO_STDOUT_PACKED:
   case RPC_PRINTF_TO_STDERR_PACKED: {
-    handle_printf<true, lane_size>(*port, temp_storage);
+    handle_printf<true, num_lanes>(port, temp_storage);
     break;
   }
   case RPC_PRINTF_TO_STREAM:
   case RPC_PRINTF_TO_STDOUT:
   case RPC_PRINTF_TO_STDERR: {
-    handle_printf<false, lane_size>(*port, temp_storage);
+    handle_printf<false, num_lanes>(port, temp_storage);
     break;
   }
   case RPC_REMOVE: {
-    uint64_t sizes[lane_size] = {0};
-    void *args[lane_size] = {nullptr};
-    port->recv_n(args, sizes,
-                 [&](uint64_t size) { return temp_storage.alloc(size); });
-    port->send([&](rpc::Buffer *buffer, uint32_t id) {
+    uint64_t sizes[num_lanes] = {0};
+    void *args[num_lanes] = {nullptr};
+    port.recv_n(args, sizes,
+                [&](uint64_t size) { return temp_storage.alloc(size); });
+    port.send([&](rpc::Buffer *buffer, uint32_t id) {
       buffer->data[0] = static_cast<uint64_t>(
           remove(reinterpret_cast<const char *>(args[id])));
     });
     break;
   }
   case RPC_RENAME: {
-    uint64_t oldsizes[lane_size] = {0};
-    uint64_t newsizes[lane_size] = {0};
-    void *oldpath[lane_size] = {nullptr};
-    void *newpath[lane_size] = {nullptr};
-    port->recv_n(oldpath, oldsizes,
-                 [&](uint64_t size) { return temp_storage.alloc(size); });
-    port->recv_n(newpath, newsizes,
-                 [&](uint64_t size) { return temp_storage.alloc(size); });
-    port->send([&](rpc::Buffer *buffer, uint32_t id) {
+    uint64_t oldsizes[num_lanes] = {0};
+    uint64_t newsizes[num_lanes] = {0};
+    void *oldpath[num_lanes] = {nullptr};
+    void *newpath[num_lanes] = {nullptr};
+    port.recv_n(oldpath, oldsizes,
+                [&](uint64_t size) { return temp_storage.alloc(size); });
+    port.recv_n(newpath, newsizes,
+                [&](uint64_t size) { return temp_storage.alloc(size); });
+    port.send([&](rpc::Buffer *buffer, uint32_t id) {
       buffer->data[0] = static_cast<uint64_t>(
           rename(reinterpret_cast<const char *>(oldpath[id]),
                  reinterpret_cast<const char *>(newpath[id])));
@@ -438,168 +424,36 @@ rpc_status_t handle_server_impl(
     break;
   }
   case RPC_SYSTEM: {
-    uint64_t sizes[lane_size] = {0};
-    void *args[lane_size] = {nullptr};
-    port->recv_n(args, sizes,
-                 [&](uint64_t size) { return temp_storage.alloc(size); });
-    port->send([&](rpc::Buffer *buffer, uint32_t id) {
+    uint64_t sizes[num_lanes] = {0};
+    void *args[num_lanes] = {nullptr};
+    port.recv_n(args, sizes,
+                [&](uint64_t size) { return temp_storage.alloc(size); });
+    port.send([&](rpc::Buffer *buffer, uint32_t id) {
       buffer->data[0] = static_cast<uint64_t>(
           system(reinterpret_cast<const char *>(args[id])));
     });
     break;
   }
   case RPC_NOOP: {
-    port->recv([](rpc::Buffer *, uint32_t) {});
+    port.recv([](rpc::Buffer *, uint32_t) {});
     break;
   }
-  default: {
-    auto handler =
-        callbacks.find(static_cast<rpc_opcode_t>(port->get_opcode()));
-
-    // We error out on an unhandled opcode.
-    if (handler == callbacks.end())
-      return RPC_STATUS_UNHANDLED_OPCODE;
-
-    // Invoke the registered callback with a reference to the port.
-    void *data =
-        callback_data.at(static_cast<rpc_opcode_t>(port->get_opcode()));
-    rpc_port_t port_ref{reinterpret_cast<uint64_t>(&*port), lane_size};
-    (handler->second)(port_ref, data);
+  default:
+    return rpc::UNHANDLED_OPCODE;
   }
-  }
-
-  // Increment the index so we start the scan after this port.
-  index = port->get_index() + 1;
-  port->close();
 
-  return RPC_STATUS_CONTINUE;
+  return rpc::SUCCESS;
 }
 
-struct Device {
-  Device(uint32_t lane_size, uint32_t num_ports, void *buffer)
-      : lane_size(lane_size), buffer(buffer), server(num_ports, buffer),
-        client(num_ports, buffer) {}
-
-  rpc_status_t handle_server(uint32_t &index) {
-    switch (lane_size) {
-    case 1:
-      return handle_server_impl<1>(server, callbacks, callback_data, index);
-    case 32:
-      return handle_server_impl<32>(server, callbacks, callback_data, index);
-    case 64:
-      return handle_server_impl<64>(server, callbacks, callback_data, index);
-    default:
-      return RPC_STATUS_INVALID_LANE_SIZE;
-    }
+int libc_handle_rpc_port(void *port, uint32_t num_lanes) {
+  switch (num_lanes) {
+  case 1:
+    return handle_port_impl<1>(*reinterpret_cast<rpc::Server::Port *>(port));
+  case 32:
+    return handle_port_impl<32>(*reinterpret_cast<rpc::Server::Port *>(port));
+  case 64:
+    return handle_port_impl<64>(*reinterpret_cast<rpc::Server::Port *>(port));
+  default:
+    return rpc::ERROR;
   }
-
-  uint32_t lane_size;
-  void *buffer;
-  rpc::Server server;
-  rpc::Client client;
-  std::unordered_map<uint32_t, rpc_opcode_callback_ty> callbacks;
-  std::unordered_map<uint32_t, void *> callback_data;
-};
-
-rpc_status_t rpc_server_init(rpc_device_t *rpc_device, uint64_t num_ports,
-                             uint32_t lane_size, rpc_alloc_ty alloc,
-                             void *data) {
-  if (!rpc_device)
-    return RPC_STATUS_ERROR;
-  if (lane_size != 1 && lane_size != 32 && lane_size != 64)
-    return RPC_STATUS_INVALID_LANE_SIZE;
-
-  uint64_t size = rpc::Server::allocation_size(lane_size, num_ports);
-  void *buffer = alloc(size, data);
-
-  if (!buffer)
-    return RPC_STATUS_ERROR;
-
-  Device *device = new Device(lane_size, num_ports, buffer);
-  if (!device)
-    return RPC_STATUS_ERROR;
-
-  rpc_device->handle = reinterpret_cast<uintptr_t>(device);
-  return RPC_STATUS_SUCCESS;
-}
-
-rpc_status_t rpc_server_shutdown(rpc_device_t rpc_device, rpc_free_ty dealloc,
-                                 void *data) {
-  if (!rpc_device.handle)
-    return RPC_STATUS_ERROR;
-
-  Device *device = reinterpret_cast<Device *>(rpc_device.handle);
-  dealloc(device->buffer, data);
-  delete device;
-
-  return RPC_STATUS_SUCCESS;
-}
-
-rpc_status_t rpc_handle_server(rpc_device_t rpc_device) {
-  if (!rpc_device.handle)
-    return RPC_STATUS_ERROR;
-
-  Device *device = reinterpret_cast<Device *>(rpc_device.handle);
-  uint32_t index = 0;
-  for (;;) {
-    rpc_status_t status = device->handle_server(index);
-    if (status != RPC_STATUS_CONTINUE)
-      return status;
-  }
-}
-
-rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint32_t opcode,
-                                   rpc_opcode_callback_ty callback,
-                                   void *data) {
-  if (!rpc_device.handle)
-    return RPC_STATUS_ERROR;
-
-  Device *device = reinterpret_cast<Device *>(rpc_device.handle);
-
-  device->callbacks[opcode] = callback;
-  device->callback_data[opcode] = data;
-  return RPC_STATUS_SUCCESS;
-}
-
-const void *rpc_get_client_buffer(rpc_device_t rpc_device) {
-  if (!rpc_device.handle)
-    return nullptr;
-  Device *device = reinterpret_cast<Device *>(rpc_device.handle);
-  return &device->client;
-}
-
-uint64_t rpc_get_client_size() { return sizeof(rpc::Client); }
-
-void rpc_send(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
-  auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
-  port->send([=](rpc::Buffer *buffer, uint32_t) {
-    callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
-  });
-}
-
-void rpc_send_n(rpc_port_t ref, const void *const *src, uint64_t *size) {
-  auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
-  port->send_n(src, size);
-}
-
-void rpc_recv(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
-  auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
-  port->recv([=](rpc::Buffer *buffer, uint32_t) {
-    callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
-  });
-}
-
-void rpc_recv_n(rpc_port_t ref, void **dst, uint64_t *size, rpc_alloc_ty alloc,
-                void *data) {
-  auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
-  auto alloc_fn = [=](uint64_t size) { return alloc(size, data); };
-  port->recv_n(dst, size, alloc_fn);
-}
-
-void rpc_recv_and_send(rpc_port_t ref, rpc_port_callback_ty callback,
-                       void *data) {
-  auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
-  port->recv_and_send([=](rpc::Buffer *buffer, uint32_t) {
-    callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
-  });
 }
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index fde4b2f930349e8..3ed5c02ed4a3bbb 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -34,6 +34,7 @@ elseif(${LIBOMPTARGET_GPU_LIBC_SUPPORT})
     # We may need to get the headers directly from the 'libc' source directory.
     target_include_directories(PluginCommon PRIVATE
                                ${CMAKE_SOURCE_DIR}/../libc/utils/gpu/server
+                               ${CMAKE_SOURCE_DIR}/../libc/
                                ${CMAKE_SOURCE_DIR}/../libc/include)
   endif()
 endif()
diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index 01bf539bcb3f32d..5b9b7ffd086b57f 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -61,7 +61,7 @@ struct RPCServerTy {
 
 private:
   /// Array from this device's identifier to its attached devices.
-  llvm::SmallVector<uintptr_t> Handles;
+  llvm::SmallVector<void *> Buffers;
 };
 
 } // namespace llvm::omp::target
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index faa2cbd4f02fe15..48ed612cf853b6d 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -12,9 +12,11 @@
 
 #include "PluginInterface.h"
 
+// TODO: This should be included unconditionally and cleaned up.
 #if defined(LIBOMPTARGET_RPC_SUPPORT)
-#include "llvm-libc-types/rpc_opcodes_t.h"
+#include "include/llvm-libc-types/rpc_opcodes_t.h"
 #include "llvmlibc_rpc_server.h"
+#include "shared/rpc.h"
 #endif
 
 using namespace llvm;
@@ -22,14 +24,14 @@ using namespace omp;
 using namespace target;
 
 RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
-    : Handles(Plugin.getNumDevices()) {}
+    : Buffers(Plugin.getNumDevices()) {}
 
 llvm::Expected<bool>
 RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
                               plugin::GenericGlobalHandlerTy &Handler,
                               plugin::DeviceImageTy &Image) {
 #ifdef LIBOMPTARGET_RPC_SUPPORT
-  return Handler.isSymbolInImage(Device, Image, rpc_client_symbol_name);
+  return Handler.isSymbolInImage(Device, Image, "__llvm_libc_rpc_client");
 #else
   return false;
 #endif
@@ -39,59 +41,18 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
                               plugin::GenericGlobalHandlerTy &Handler,
                               plugin::DeviceImageTy &Image) {
 #ifdef LIBOMPTARGET_RPC_SUPPORT
-  auto Alloc = [](uint64_t Size, void *Data) {
-    plugin::GenericDeviceTy &Device =
-        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-    return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST);
-  };
   uint64_t NumPorts =
-      std::min(Device.requestedRPCPortCount(), RPC_MAXIMUM_PORT_COUNT);
-  rpc_device_t RPCDevice;
-  if (rpc_status_t Err = rpc_server_init(&RPCDevice, NumPorts,
-                                         Device.getWarpSize(), Alloc, &Device))
+      std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
+  void *RPCBuffer = Device.allocate(
+      rpc::Server::allocation_size(Device.getWarpSize(), NumPorts), nullptr,
+      TARGET_ALLOC_HOST);
+  if (!RPCBuffer)
     return plugin::Plugin::error(
-        "Failed to initialize RPC server for device %d: %d",
-        Device.getDeviceId(), Err);
-
-  // Register a custom opcode handler to perform plugin specific allocation.
-  auto MallocHandler = [](rpc_port_t Port, void *Data) {
-    rpc_recv_and_send(
-        Port,
-        [](rpc_buffer_t *Buffer, void *Data) {
-          plugin::GenericDeviceTy &Device =
-              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-          Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
-              Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
-        },
-        Data);
-  };
-  if (rpc_status_t Err =
-          rpc_register_callback(RPCDevice, RPC_MALLOC, MallocHandler, &Device))
-    return plugin::Plugin::error(
-        "Failed to register RPC malloc handler for device %d: %d\n",
-        Device.getDeviceId(), Err);
-
-  // Register a custom opcode handler to perform plugin specific deallocation.
-  auto FreeHandler = [](rpc_port_t Port, void *Data) {
-    rpc_recv(
-        Port,
-        [](rpc_buffer_t *Buffer, void *Data) {
-          plugin::GenericDeviceTy &Device =
-              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-          Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                      TARGET_ALLOC_DEVICE_NON_BLOCKING);
-        },
-        Data);
-  };
-  if (rpc_status_t Err =
-          rpc_register_callback(RPCDevice, RPC_FREE, FreeHandler, &Device))
-    return plugin::Plugin::error(
-        "Failed to register RPC free handler for device %d: %d\n",
-        Device.getDeviceId(), Err);
+        "Failed to initialize RPC server for device %d", Device.getDeviceId());
 
   // Get the address of the RPC client from the device.
   void *ClientPtr;
-  plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));
+  plugin::GlobalTy ClientGlobal("__llvm_libc_rpc_client", sizeof(void *));
   if (auto Err =
           Handler.getGlobalMetadataFromDevice(Device, Image, ClientGlobal))
     return Err;
@@ -100,38 +61,63 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
                                      sizeof(void *), nullptr))
     return Err;
 
-  const void *ClientBuffer = rpc_get_client_buffer(RPCDevice);
-  if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer,
-                                   rpc_get_client_size(), nullptr))
+  rpc::Client client(NumPorts, RPCBuffer);
+  if (auto Err =
+          Device.dataSubmit(ClientPtr, &client, sizeof(rpc::Client), nullptr))
     return Err;
-  Handles[Device.getDeviceId()] = RPCDevice.handle;
+  Buffers[Device.getDeviceId()] = RPCBuffer;
+
+  return Error::success();
+
 #endif
   return Error::success();
 }
 
 Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
 #ifdef LIBOMPTARGET_RPC_SUPPORT
-  rpc_device_t RPCDevice{Handles[Device.getDeviceId()]};
-  if (rpc_status_t Err = rpc_handle_server(RPCDevice))
-    return plugin::Plugin::error(
-        "Error while running RPC server on device %d: %d", Device.getDeviceId(),
-        Err);
+  uint64_t NumPorts =
+      std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
+  rpc::Server Server(NumPorts, Buffers[Device.getDeviceId()]);
+
+  auto port = Server.try_open(Device.getWarpSize());
+  if (!port)
+    return Error::success();
+
+  switch (port->get_opcode()) {
+  case RPC_MALLOC: {
+    port->recv_and_send([&](rpc::Buffer *Buffer, uint32_t) {
+      Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+          Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
+    });
+    break;
+  }
+  case RPC_FREE: {
+    port->recv([&](rpc::Buffer *Buffer, uint32_t) {
+      Device.free(reinterpret_cast<void *>(Buffer->data[0]),
+                  TARGET_ALLOC_DEVICE_NON_BLOCKING);
+    });
+    break;
+  }
+  default:
+    break;
+  }
+
+  // Let the `libc` library handle and unhandled opcodes.
+  int Status = libc_handle_rpc_port(&*port, Device.getWarpSize());
+  if (Status != rpc::SUCCESS)
+    return createStringError("RPC server given invalid opcode!");
+
+  port->close();
+
+  return Error::success();
 #endif
   return Error::success();
 }
 
 Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
 #ifdef LIBOMPTARGET_RPC_SUPPORT
-  rpc_device_t RPCDevice{Handles[Device.getDeviceId()]};
-  auto Dealloc = [](void *Ptr, void *Data) {
-    plugin::GenericDeviceTy &Device =
-        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
-    Device.free(Ptr, TARGET_ALLOC_HOST);
-  };
-  if (rpc_status_t Err = rpc_server_shutdown(RPCDevice, Dealloc, &Device))
-    return plugin::Plugin::error(
-        "Failed to shut down RPC server for device %d: %d",
-        Device.getDeviceId(), Err);
+  Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST);
+  return Error::success();
 #endif
   return Error::success();
 }