[libc-commits] [libc] [libc] Rework the RPC interface to accept runtime wave sizes (PR #80914)

Wed Feb 7 10:26:46 PST 2024

https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/80914

>From b7d1bb79cda07374ec7b315dd9dd66848a29a02c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Tue, 6 Feb 2024 18:16:30 -0600
Subject: [PATCH] [libc] Rework the RPC interface to accept runtime wave sizes

Summary:
The RPC interface needs to handle an entire warp or wavefront at once.
This is currently done by using a compile time constant indicating the
size of the buffer, which right now defaults to some value on the client
(GPU) side. However, there are currently attempts to move the `libc`
library to a single IR build. This is problematic as the size of the
wave fronts changes between ISAs on AMDGPU. The builtin
`__builtin_amdgcn_wavefrontsize()` will return the appropriate value,
but it is only known at runtime now.

In order to support this, this patch restructures the packet. Now
instead of having an array of arrays, we simply have a large array of
buffers and slice it according to the runtime value if we don't know it
ahead of time. This also somewhat has the advantage of making the buffer
contiguous within a page now that the header has been moved out of it.
---
 libc/docs/gpu/rpc.rst                         |   2 +-
 libc/src/__support/GPU/amdgpu/utils.h         |  10 +-
 libc/src/__support/GPU/generic/utils.h        |   4 +-
 libc/src/__support/GPU/nvptx/utils.h          |   7 +-
 libc/src/__support/OSUtil/gpu/io.cpp          |   2 +-
 libc/src/__support/OSUtil/gpu/quick_exit.cpp  |   2 +-
 libc/src/__support/RPC/rpc.h                  | 127 ++++++++++--------
 libc/src/__support/RPC/rpc_client.cpp         |   2 +-
 libc/src/__support/RPC/rpc_client.h           |   2 +-
 libc/src/gpu/rpc_host_call.cpp                |   2 +-
 libc/src/stdio/gpu/clearerr.cpp               |   2 +-
 libc/src/stdio/gpu/fclose.cpp                 |   2 +-
 libc/src/stdio/gpu/feof.cpp                   |   2 +-
 libc/src/stdio/gpu/ferror.cpp                 |   2 +-
 libc/src/stdio/gpu/fflush.cpp                 |   2 +-
 libc/src/stdio/gpu/fgets.cpp                  |   2 +-
 libc/src/stdio/gpu/file.h                     |   4 +-
 libc/src/stdio/gpu/fopen.cpp                  |   2 +-
 libc/src/stdio/gpu/fseek.cpp                  |   2 +-
 libc/src/stdio/gpu/ftell.cpp                  |   2 +-
 libc/src/stdio/gpu/ungetc.cpp                 |   2 +-
 libc/src/stdlib/gpu/abort.cpp                 |   2 +-
 libc/src/stdlib/gpu/free.cpp                  |   2 +-
 libc/src/stdlib/gpu/malloc.cpp                |   2 +-
 .../startup/gpu/rpc_interface_test.cpp        |   2 +-
 .../startup/gpu/rpc_stream_test.cpp           |   4 +-
 .../test/integration/startup/gpu/rpc_test.cpp |   4 +-
 .../test/src/__support/RPC/rpc_smoke_test.cpp |   8 +-
 libc/utils/gpu/server/rpc_server.cpp          |   4 +-
 29 files changed, 110 insertions(+), 102 deletions(-)

diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst
index 78ae778671881a..c58177c9403872 100644
--- a/libc/docs/gpu/rpc.rst
+++ b/libc/docs/gpu/rpc.rst
@@ -125,7 +125,7 @@ done. It can be omitted if asynchronous execution is desired.
 .. code-block:: c++
 
   void rpc_host_call(void *fn, void *data, size_t size) {
-    rpc::Client::Port port = rpc::client.open<RPC_HOST_CALL>();
+    rpc::Client<>::Port port = rpc::client.open<RPC_HOST_CALL>();
     port.send_n(data, size);
     port.send([=](rpc::Buffer *buffer) {
       buffer->data[0] = reinterpret_cast<uintptr_t>(fn);
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 58bbe29cb3a7d7..9432b7b39f7836 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -17,9 +17,6 @@
 namespace LIBC_NAMESPACE {
 namespace gpu {
 
-/// The number of threads that execute in lock-step in a lane.
-constexpr const uint64_t LANE_SIZE = __AMDGCN_WAVEFRONT_SIZE;
-
 /// Type aliases to the address spaces used by the AMDGPU backend.
 template <typename T> using Private = [[clang::opencl_private]] T;
 template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -108,8 +105,11 @@ LIBC_INLINE uint64_t get_thread_id() {
          get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
 }
 
-/// Returns the size of an AMD wavefront. Either 32 or 64 depending on hardware.
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+/// Returns the size of an AMD wavefront, either 32 or 64 depending on hardware
+/// and compilation options.
+LIBC_INLINE uint32_t get_lane_size() {
+  return __builtin_amdgcn_wavefrontsize();
+}
 
 /// Returns the id of the thread inside of an AMD wavefront executing together.
 [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
diff --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
index 00b59837ccc671..58db88dce1ca8c 100644
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -16,8 +16,6 @@
 namespace LIBC_NAMESPACE {
 namespace gpu {
 
-constexpr const uint64_t LANE_SIZE = 1;
-
 template <typename T> using Private = T;
 template <typename T> using Constant = T;
 template <typename T> using Shared = T;
@@ -55,7 +53,7 @@ LIBC_INLINE uint32_t get_thread_id_z() { return 0; }
 
 LIBC_INLINE uint64_t get_thread_id() { return 0; }
 
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+LIBC_INLINE uint32_t get_lane_size() { return 1; }
 
 LIBC_INLINE uint32_t get_lane_id() { return 0; }
 
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index e7e297adf7ecca..6c4bb5a7720a50 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -16,9 +16,6 @@
 namespace LIBC_NAMESPACE {
 namespace gpu {
 
-/// The number of threads that execute in lock-step in a warp.
-constexpr const uint64_t LANE_SIZE = 32;
-
 /// Type aliases to the address spaces used by the NVPTX backend.
 template <typename T> using Private = [[clang::opencl_private]] T;
 template <typename T> using Constant = [[clang::opencl_constant]] T;
@@ -95,8 +92,8 @@ LIBC_INLINE uint64_t get_thread_id() {
          get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
 }
 
-/// Returns the size of a CUDA warp.
-LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
+/// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
+LIBC_INLINE uint32_t get_lane_size() { return 32; }
 
 /// Returns the id of the thread inside of a CUDA warp executing together.
 [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
diff --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp
index fec4d9f7b35d27..6d243e36353f9a 100644
--- a/libc/src/__support/OSUtil/gpu/io.cpp
+++ b/libc/src/__support/OSUtil/gpu/io.cpp
@@ -14,7 +14,7 @@
 namespace LIBC_NAMESPACE {
 
 void write_to_stderr(cpp::string_view msg) {
-  rpc::Client::Port port = rpc::client.open<RPC_WRITE_TO_STDERR>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_WRITE_TO_STDERR>();
   port.send_n(msg.data(), msg.size());
   port.recv([](rpc::Buffer *) { /* void */ });
   port.close();
diff --git a/libc/src/__support/OSUtil/gpu/quick_exit.cpp b/libc/src/__support/OSUtil/gpu/quick_exit.cpp
index 1a03be0ace6728..6cf1395ee62b22 100644
--- a/libc/src/__support/OSUtil/gpu/quick_exit.cpp
+++ b/libc/src/__support/OSUtil/gpu/quick_exit.cpp
@@ -18,7 +18,7 @@ namespace LIBC_NAMESPACE {
 
 void quick_exit(int status) {
   // We want to first make sure the server is listening before we exit.
-  rpc::Client::Port port = rpc::client.open<RPC_EXIT>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_EXIT>();
   port.send_and_recv([](rpc::Buffer *) {}, [](rpc::Buffer *) {});
   port.send([&](rpc::Buffer *buffer) {
     reinterpret_cast<uint32_t *>(buffer->data)[0] = status;
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 7924d4cec2ac84..e657ab274e8278 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -43,22 +43,13 @@ struct Header {
   uint16_t opcode;
 };
 
-/// The data payload for the associated packet. We provide enough space for each
-/// thread in the cooperating lane to have a buffer.
-template <uint32_t lane_size = gpu::LANE_SIZE> struct Payload {
-  Buffer slot[lane_size];
-};
-
-/// A packet used to share data between the client and server across an entire
-/// lane. We use a lane as the minimum granularity for execution.
-template <uint32_t lane_size = gpu::LANE_SIZE> struct alignas(64) Packet {
-  Header header;
-  Payload<lane_size> payload;
-};
-
 /// The maximum number of parallel ports that the RPC interface can support.
 constexpr uint64_t MAX_PORT_COUNT = 4096;
 
+/// The default lane size indicates that the actual lane size is not a compile
+/// time constant and needs to instead be queried from the platform.
+constexpr uint32_t VARIABLE_LANE_SIZE = 0;
+
 /// A common process used to synchronize communication between a client and a
 /// server. The process contains a read-only inbox and a write-only outbox used
 /// for signaling ownership of the shared buffer between both sides. We assign
@@ -71,7 +62,7 @@ constexpr uint64_t MAX_PORT_COUNT = 4096;
 ///   - The client will always start with a 'send' operation.
 ///   - The server will always start with a 'recv' operation.
 ///   - Every 'send' or 'recv' call is mirrored by the other process.
-template <bool Invert, typename Packet> struct Process {
+template <bool Invert, uint32_t lane_size> struct Process {
   LIBC_INLINE Process() = default;
   LIBC_INLINE Process(const Process &) = delete;
   LIBC_INLINE Process &operator=(const Process &) = delete;
@@ -82,7 +73,8 @@ template <bool Invert, typename Packet> struct Process {
   uint32_t port_count = 0;
   cpp::Atomic<uint32_t> *inbox = nullptr;
   cpp::Atomic<uint32_t> *outbox = nullptr;
-  Packet *packet = nullptr;
+  Header *header = nullptr;
+  Buffer *packet = nullptr;
 
   static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
   cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
@@ -92,7 +84,9 @@ template <bool Invert, typename Packet> struct Process {
                                     advance(buffer, inbox_offset(port_count)))),
         outbox(reinterpret_cast<cpp::Atomic<uint32_t> *>(
             advance(buffer, outbox_offset(port_count)))),
-        packet(reinterpret_cast<Packet *>(
+        header(reinterpret_cast<Header *>(
+            advance(buffer, header_offset(port_count)))),
+        packet(reinterpret_cast<Buffer *>(
             advance(buffer, buffer_offset(port_count)))) {}
 
   /// Allocate a memory buffer sufficient to store the following equivalent
@@ -101,7 +95,8 @@ template <bool Invert, typename Packet> struct Process {
   /// struct Equivalent {
   ///   Atomic<uint32_t> primary[port_count];
   ///   Atomic<uint32_t> secondary[port_count];
-  ///   Packet buffer[port_count];
+  ///   Header header[port_count];
+  ///   Buffer packet[port_count][lane_size];
   /// };
   LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count) {
     return buffer_offset(port_count) + buffer_bytes(port_count);
@@ -144,6 +139,17 @@ template <bool Invert, typename Packet> struct Process {
     atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
   }
 
+  /// The packet is a linearly allocated array of buffers used to communicate
+  /// with the other process. This function returns the appropriate slot in this
+  /// array such that the process can operate on an entire warp or wavefront at
+  /// once.
+  LIBC_INLINE Buffer *get_packet(uint32_t index) {
+    if constexpr (lane_size == VARIABLE_LANE_SIZE)
+      return &packet[index * gpu::get_lane_size()];
+    else
+      return &packet[index * lane_size];
+  }
+
   /// Determines if this process needs to wait for ownership of the buffer. We
   /// invert the condition on one of the processes to indicate that if one
   /// process owns the buffer then the other does not.
@@ -220,7 +226,8 @@ template <bool Invert, typename Packet> struct Process {
 
   /// Number of bytes to allocate for the buffer containing the packets.
   LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count) {
-    return port_count * sizeof(Packet);
+    static_assert(lane_size != VARIABLE_LANE_SIZE, "Size must be constexpr");
+    return port_count * lane_size * sizeof(Buffer);
   }
 
   /// Offset of the inbox in memory. This is the same as the outbox if inverted.
@@ -233,9 +240,15 @@ template <bool Invert, typename Packet> struct Process {
     return Invert ? 0 : mailbox_bytes(port_count);
   }
 
+  /// Offset of the array of packet headers after the inbox and outbox.
+  LIBC_INLINE static constexpr uint64_t header_offset(uint32_t port_count) {
+    return align_up(2 * mailbox_bytes(port_count), alignof(Header));
+  }
+
   /// Offset of the buffer containing the packets after the inbox and outbox.
   LIBC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
-    return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
+    return align_up(header_offset(port_count) + port_count * sizeof(Header),
+                    alignof(Buffer));
   }
 
   /// Conditionally set the n-th bit in the atomic bitfield.
@@ -264,33 +277,35 @@ template <bool Invert, typename Packet> struct Process {
 /// Invokes a function accross every active buffer across the total lane size.
 template <uint32_t lane_size>
 static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *)> fn,
-                                   Packet<lane_size> &packet) {
+                                   Buffer *slot, uint64_t mask) {
   if constexpr (is_process_gpu()) {
-    fn(&packet.payload.slot[gpu::get_lane_id()]);
+    fn(&slot[gpu::get_lane_id()]);
   } else {
-    for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
-      if (packet.header.mask & 1ul << i)
-        fn(&packet.payload.slot[i]);
+    auto sz = lane_size == VARIABLE_LANE_SIZE ? gpu::get_lane_size() : lane_size;
+    for (uint32_t i = 0; i < sz; i += gpu::get_lane_size())
+      if (mask & 1ul << i)
+        fn(&slot[i]);
   }
 }
 
 /// Alternate version that also provides the index of the current lane.
 template <uint32_t lane_size>
 static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *, uint32_t)> fn,
-                                   Packet<lane_size> &packet) {
+                                   Buffer *slot, uint64_t mask) {
   if constexpr (is_process_gpu()) {
-    fn(&packet.payload.slot[gpu::get_lane_id()], gpu::get_lane_id());
+    fn(&slot[gpu::get_lane_id()], gpu::get_lane_id());
   } else {
-    for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size())
-      if (packet.header.mask & 1ul << i)
-        fn(&packet.payload.slot[i], i);
+    auto sz = lane_size == VARIABLE_LANE_SIZE ? gpu::get_lane_size() : lane_size;
+    for (uint32_t i = 0; i < sz; i += gpu::get_lane_size())
+      if (mask & 1ul << i)
+        fn(&slot[i], i);
   }
 }
 
 /// The port provides the interface to communicate between the multiple
 /// processes. A port is conceptually an index into the memory provided by the
 /// underlying process that is guarded by a lock bit.
-template <bool T, typename S> struct Port {
+template <bool T, uint32_t S> struct Port {
   LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint32_t index,
                    uint32_t out)
       : process(process), lane_mask(lane_mask), index(index), out(out),
@@ -303,7 +318,7 @@ template <bool T, typename S> struct Port {
   LIBC_INLINE Port(Port &&) = default;
   LIBC_INLINE Port &operator=(Port &&) = default;
 
-  friend struct Client;
+  template <uint32_t U> friend struct Client;
   template <uint32_t U> friend struct Server;
   friend class cpp::optional<Port<T, S>>;
 
@@ -319,7 +334,7 @@ template <bool T, typename S> struct Port {
   LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
 
   LIBC_INLINE uint16_t get_opcode() const {
-    return process.packet[index].header.opcode;
+    return process.header[index].opcode;
   }
 
   LIBC_INLINE uint16_t get_index() const { return index; }
@@ -342,7 +357,7 @@ template <bool T, typename S> struct Port {
 };
 
 /// The RPC client used to make requests to the server.
-struct Client {
+template <uint32_t lane_size = VARIABLE_LANE_SIZE> struct Client {
   LIBC_INLINE Client() = default;
   LIBC_INLINE Client(const Client &) = delete;
   LIBC_INLINE Client &operator=(const Client &) = delete;
@@ -351,15 +366,14 @@ struct Client {
   LIBC_INLINE Client(uint32_t port_count, void *buffer)
       : process(port_count, buffer) {}
 
-  using Port = rpc::Port<false, Packet<gpu::LANE_SIZE>>;
+  using Port = rpc::Port<false, lane_size>;
   template <uint16_t opcode> LIBC_INLINE Port open();
 
 private:
-  Process<false, Packet<gpu::LANE_SIZE>> process;
+  Process<false, lane_size> process;
 };
-static_assert(cpp::is_trivially_copyable<Client>::value &&
-                  sizeof(Process<false, Packet<1>>) ==
-                      sizeof(Process<false, Packet<32>>),
+static_assert(cpp::is_trivially_copyable<Client<>>::value &&
+                  sizeof(Client<1>) == sizeof(Client<32>),
               "The client is not trivially copyable from the server");
 
 /// The RPC server used to respond to the client.
@@ -372,20 +386,20 @@ template <uint32_t lane_size> struct Server {
   LIBC_INLINE Server(uint32_t port_count, void *buffer)
       : process(port_count, buffer) {}
 
-  using Port = rpc::Port<true, Packet<lane_size>>;
+  using Port = rpc::Port<true, lane_size>;
   LIBC_INLINE cpp::optional<Port> try_open(uint32_t start = 0);
   LIBC_INLINE Port open();
 
   LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
-    return Process<true, Packet<lane_size>>::allocation_size(port_count);
+    return Process<true, lane_size>::allocation_size(port_count);
   }
 
 private:
-  Process<true, Packet<lane_size>> process;
+  Process<true, lane_size> process;
 };
 
 /// Applies \p fill to the shared buffer and initiates a send operation.
-template <bool T, typename S>
+template <bool T, uint32_t S>
 template <typename F>
 LIBC_INLINE void Port<T, S>::send(F fill) {
   uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
@@ -394,14 +408,14 @@ LIBC_INLINE void Port<T, S>::send(F fill) {
   process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p fill function to initialize the buffer and release the memory.
-  invoke_rpc(fill, process.packet[index]);
+  invoke_rpc<S>(fill, process.get_packet(index), process.header[index].mask);
   out = process.invert_outbox(index, out);
   owns_buffer = false;
   receive = false;
 }
 
 /// Applies \p use to the shared buffer and acknowledges the send.
-template <bool T, typename S>
+template <bool T, uint32_t S>
 template <typename U>
 LIBC_INLINE void Port<T, S>::recv(U use) {
   // We only exchange ownership of the buffer during a receive if we are waiting
@@ -417,13 +431,13 @@ LIBC_INLINE void Port<T, S>::recv(U use) {
   process.wait_for_ownership(lane_mask, index, out, in);
 
   // Apply the \p use function to read the memory out of the buffer.
-  invoke_rpc(use, process.packet[index]);
+  invoke_rpc<S>(use, process.get_packet(index), process.header[index].mask);
   receive = true;
   owns_buffer = true;
 }
 
 /// Combines a send and receive into a single function.
-template <bool T, typename S>
+template <bool T, uint32_t S>
 template <typename F, typename U>
 LIBC_INLINE void Port<T, S>::send_and_recv(F fill, U use) {
   send(fill);
@@ -433,7 +447,7 @@ LIBC_INLINE void Port<T, S>::send_and_recv(F fill, U use) {
 /// Combines a receive and send operation into a single function. The \p work
 /// function modifies the buffer in-place and the send is only used to initiate
 /// the copy back.
-template <bool T, typename S>
+template <bool T, uint32_t S>
 template <typename W>
 LIBC_INLINE void Port<T, S>::recv_and_send(W work) {
   recv(work);
@@ -442,7 +456,7 @@ LIBC_INLINE void Port<T, S>::recv_and_send(W work) {
 
 /// Helper routine to simplify the interface when sending from the GPU using
 /// thread private pointers to the underlying value.
-template <bool T, typename S>
+template <bool T, uint32_t S>
 LIBC_INLINE void Port<T, S>::send_n(const void *src, uint64_t size) {
   const void **src_ptr = &src;
   uint64_t *size_ptr = &size;
@@ -451,7 +465,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *src, uint64_t size) {
 
 /// Sends an arbitrarily sized data buffer \p src across the shared channel in
 /// multiples of the packet length.
-template <bool T, typename S>
+template <bool T, uint32_t S>
 LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
   uint64_t num_sends = 0;
   send([&](Buffer *buffer, uint32_t id) {
@@ -465,7 +479,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
     rpc_memcpy(&buffer->data[1], lane_value(src, id), len);
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
-  uint64_t mask = process.packet[index].header.mask;
+  uint64_t mask = process.header[index].mask;
   while (gpu::ballot(mask, idx < num_sends)) {
     send([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
@@ -481,7 +495,7 @@ LIBC_INLINE void Port<T, S>::send_n(const void *const *src, uint64_t *size) {
 /// Receives an arbitrarily sized data buffer across the shared channel in
 /// multiples of the packet length. The \p alloc function is called with the
 /// size of the data so that we can initialize the size of the \p dst buffer.
-template <bool T, typename S>
+template <bool T, uint32_t S>
 template <typename A>
 LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
   uint64_t num_recvs = 0;
@@ -498,7 +512,7 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
     rpc_memcpy(lane_value(dst, id), &buffer->data[1], len);
   });
   uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
-  uint64_t mask = process.packet[index].header.mask;
+  uint64_t mask = process.header[index].mask;
   while (gpu::ballot(mask, idx < num_recvs)) {
     recv([=](Buffer *buffer, uint32_t id) {
       uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
@@ -516,7 +530,10 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 /// is, there are send operations pending that haven't been serviced on this
 /// port. Each port instance uses an associated \p opcode to tell the server
 /// what to do.
-template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
+template <uint32_t lane_size>
+template <uint16_t opcode>
+[[clang::convergent]] LIBC_INLINE typename Client<lane_size>::Port
+Client<lane_size>::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
   for (uint32_t index = gpu::get_cluster_id();; ++index) {
@@ -540,8 +557,8 @@ template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
     }
 
     if (gpu::is_first_lane(lane_mask)) {
-      process.packet[index].header.opcode = opcode;
-      process.packet[index].header.mask = lane_mask;
+      process.header[index].opcode = opcode;
+      process.header[index].mask = lane_mask;
     }
     gpu::sync_lane(lane_mask);
     return Port(process, lane_mask, index, out);
diff --git a/libc/src/__support/RPC/rpc_client.cpp b/libc/src/__support/RPC/rpc_client.cpp
index 8367d54fada63b..3e51a25062c822 100644
--- a/libc/src/__support/RPC/rpc_client.cpp
+++ b/libc/src/__support/RPC/rpc_client.cpp
@@ -13,7 +13,7 @@ namespace LIBC_NAMESPACE {
 namespace rpc {
 
 /// The libc client instance used to communicate with the server.
-Client client;
+Client<> client;
 
 /// Externally visible symbol to signify the usage of an RPC client to
 /// whomever needs to run the server as well as provide a way to initialize
diff --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
index 571d7cce2a8039..8bfdad311317a8 100644
--- a/libc/src/__support/RPC/rpc_client.h
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE {
 namespace rpc {
 
 /// The libc client instance used to communicate with the server.
-extern Client client;
+extern Client<> client;
 
 } // namespace rpc
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/gpu/rpc_host_call.cpp b/libc/src/gpu/rpc_host_call.cpp
index 7b9b9f2adfd536..499e2a811e4dab 100644
--- a/libc/src/gpu/rpc_host_call.cpp
+++ b/libc/src/gpu/rpc_host_call.cpp
@@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE {
 // This calls the associated function pointer on the RPC server with the given
 // arguments. We expect that the pointer here is a valid pointer on the server.
 LLVM_LIBC_FUNCTION(void, rpc_host_call, (void *fn, void *data, size_t size)) {
-  rpc::Client::Port port = rpc::client.open<RPC_HOST_CALL>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_HOST_CALL>();
   port.send_n(data, size);
   port.send([=](rpc::Buffer *buffer) {
     buffer->data[0] = reinterpret_cast<uintptr_t>(fn);
diff --git a/libc/src/stdio/gpu/clearerr.cpp b/libc/src/stdio/gpu/clearerr.cpp
index 2267fdf8115eed..02c1a2f53adb8b 100644
--- a/libc/src/stdio/gpu/clearerr.cpp
+++ b/libc/src/stdio/gpu/clearerr.cpp
@@ -14,7 +14,7 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(void, clearerr, (::FILE * stream)) {
-  rpc::Client::Port port = rpc::client.open<RPC_CLEARERR>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_CLEARERR>();
   port.send_and_recv(
       [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
       [&](rpc::Buffer *) {});
diff --git a/libc/src/stdio/gpu/fclose.cpp b/libc/src/stdio/gpu/fclose.cpp
index bdedbe51ec38ea..49e67e47d54efb 100644
--- a/libc/src/stdio/gpu/fclose.cpp
+++ b/libc/src/stdio/gpu/fclose.cpp
@@ -16,7 +16,7 @@ namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(int, fclose, (::FILE * stream)) {
   uint64_t ret = 0;
   uintptr_t file = reinterpret_cast<uintptr_t>(stream);
-  rpc::Client::Port port = rpc::client.open<RPC_CLOSE_FILE>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_CLOSE_FILE>();
   port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = file; },
                      [&](rpc::Buffer *buffer) { ret = buffer->data[0]; });
   port.close();
diff --git a/libc/src/stdio/gpu/feof.cpp b/libc/src/stdio/gpu/feof.cpp
index ddcef384142c42..16c18d3cedf4c3 100644
--- a/libc/src/stdio/gpu/feof.cpp
+++ b/libc/src/stdio/gpu/feof.cpp
@@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(int, feof, (::FILE * stream)) {
   int ret;
-  rpc::Client::Port port = rpc::client.open<RPC_FEOF>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_FEOF>();
   port.send_and_recv(
       [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
       [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
diff --git a/libc/src/stdio/gpu/ferror.cpp b/libc/src/stdio/gpu/ferror.cpp
index 9ed598fb67541e..54aebda23a1d90 100644
--- a/libc/src/stdio/gpu/ferror.cpp
+++ b/libc/src/stdio/gpu/ferror.cpp
@@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(int, ferror, (::FILE * stream)) {
   int ret;
-  rpc::Client::Port port = rpc::client.open<RPC_FERROR>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_FERROR>();
   port.send_and_recv(
       [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
       [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
diff --git a/libc/src/stdio/gpu/fflush.cpp b/libc/src/stdio/gpu/fflush.cpp
index 68192174e58a8e..a2f27138246716 100644
--- a/libc/src/stdio/gpu/fflush.cpp
+++ b/libc/src/stdio/gpu/fflush.cpp
@@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(int, fflush, (::FILE * stream)) {
   int ret;
-  rpc::Client::Port port = rpc::client.open<RPC_FFLUSH>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_FFLUSH>();
   port.send_and_recv(
       [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
       [&](rpc::Buffer *buffer) { ret = static_cast<int>(buffer->data[0]); });
diff --git a/libc/src/stdio/gpu/fgets.cpp b/libc/src/stdio/gpu/fgets.cpp
index 5ea4bdcdc9e0fe..8d6ebd5bb1609b 100644
--- a/libc/src/stdio/gpu/fgets.cpp
+++ b/libc/src/stdio/gpu/fgets.cpp
@@ -24,7 +24,7 @@ LLVM_LIBC_FUNCTION(char *, fgets,
 
   uint64_t recv_size;
   void *buf = nullptr;
-  rpc::Client::Port port = rpc::client.open<RPC_READ_FGETS>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_READ_FGETS>();
   port.send([=](rpc::Buffer *buffer) {
     buffer->data[0] = count;
     buffer->data[1] = file::from_stream(stream);
diff --git a/libc/src/stdio/gpu/file.h b/libc/src/stdio/gpu/file.h
index 2cab2e6d36a259..edeaecef16bf60 100644
--- a/libc/src/stdio/gpu/file.h
+++ b/libc/src/stdio/gpu/file.h
@@ -50,7 +50,7 @@ LIBC_INLINE ::FILE *to_stream(uintptr_t f) {
 template <uint16_t opcode>
 LIBC_INLINE uint64_t write_impl(::FILE *file, const void *data, size_t size) {
   uint64_t ret = 0;
-  rpc::Client::Port port = rpc::client.open<opcode>();
+  rpc::Client<>::Port port = rpc::client.open<opcode>();
 
   if constexpr (opcode == RPC_WRITE_TO_STREAM) {
     port.send([&](rpc::Buffer *buffer) {
@@ -78,7 +78,7 @@ LIBC_INLINE uint64_t write(::FILE *f, const void *data, size_t size) {
 LIBC_INLINE uint64_t read_from_stream(::FILE *file, void *buf, size_t size) {
   uint64_t ret = 0;
   uint64_t recv_size;
-  rpc::Client::Port port = rpc::client.open<RPC_READ_FROM_STREAM>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_READ_FROM_STREAM>();
   port.send([=](rpc::Buffer *buffer) {
     buffer->data[0] = size;
     buffer->data[1] = from_stream(file);
diff --git a/libc/src/stdio/gpu/fopen.cpp b/libc/src/stdio/gpu/fopen.cpp
index 41d2c89473168e..5234a3ac534d3d 100644
--- a/libc/src/stdio/gpu/fopen.cpp
+++ b/libc/src/stdio/gpu/fopen.cpp
@@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(::FILE *, fopen,
                    (const char *__restrict path, const char *__restrict mode)) {
   uintptr_t file;
-  rpc::Client::Port port = rpc::client.open<RPC_OPEN_FILE>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_OPEN_FILE>();
   port.send_n(path, internal::string_length(path) + 1);
   port.send_and_recv(
       [=](rpc::Buffer *buffer) {
diff --git a/libc/src/stdio/gpu/fseek.cpp b/libc/src/stdio/gpu/fseek.cpp
index 3e93ddb46bf975..e67e7acc72e224 100644
--- a/libc/src/stdio/gpu/fseek.cpp
+++ b/libc/src/stdio/gpu/fseek.cpp
@@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(int, fseek, (::FILE * stream, long offset, int whence)) {
   int ret;
-  rpc::Client::Port port = rpc::client.open<RPC_FSEEK>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_FSEEK>();
   port.send_and_recv(
       [=](rpc::Buffer *buffer) {
         buffer->data[0] = file::from_stream(stream);
diff --git a/libc/src/stdio/gpu/ftell.cpp b/libc/src/stdio/gpu/ftell.cpp
index aea3e8b229fce6..e39c95211915a8 100644
--- a/libc/src/stdio/gpu/ftell.cpp
+++ b/libc/src/stdio/gpu/ftell.cpp
@@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(long, ftell, (::FILE * stream)) {
   long ret;
-  rpc::Client::Port port = rpc::client.open<RPC_FSEEK>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_FSEEK>();
   port.send_and_recv(
       [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); },
       [&](rpc::Buffer *buffer) { ret = static_cast<long>(buffer->data[0]); });
diff --git a/libc/src/stdio/gpu/ungetc.cpp b/libc/src/stdio/gpu/ungetc.cpp
index 373164a0c53a32..ba289257a01742 100644
--- a/libc/src/stdio/gpu/ungetc.cpp
+++ b/libc/src/stdio/gpu/ungetc.cpp
@@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(int, ungetc, (int c, ::FILE *stream)) {
   int ret;
-  rpc::Client::Port port = rpc::client.open<RPC_UNGETC>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_UNGETC>();
   port.send_and_recv(
       [=](rpc::Buffer *buffer) {
         buffer->data[0] = c;
diff --git a/libc/src/stdlib/gpu/abort.cpp b/libc/src/stdlib/gpu/abort.cpp
index f3b052be6d24db..5a1a75cd4fb5f0 100644
--- a/libc/src/stdlib/gpu/abort.cpp
+++ b/libc/src/stdlib/gpu/abort.cpp
@@ -15,7 +15,7 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(void, abort, ()) {
   // We want to first make sure the server is listening before we abort.
-  rpc::Client::Port port = rpc::client.open<RPC_ABORT>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_ABORT>();
   port.send_and_recv([](rpc::Buffer *) {}, [](rpc::Buffer *) {});
   port.send([&](rpc::Buffer *) {});
   port.close();
diff --git a/libc/src/stdlib/gpu/free.cpp b/libc/src/stdlib/gpu/free.cpp
index 3a41e5febad0bb..2527eea8a83837 100644
--- a/libc/src/stdlib/gpu/free.cpp
+++ b/libc/src/stdlib/gpu/free.cpp
@@ -13,7 +13,7 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(void, free, (void *ptr)) {
-  rpc::Client::Port port = rpc::client.open<RPC_FREE>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_FREE>();
   port.send([=](rpc::Buffer *buffer) {
     buffer->data[0] = reinterpret_cast<uintptr_t>(ptr);
   });
diff --git a/libc/src/stdlib/gpu/malloc.cpp b/libc/src/stdlib/gpu/malloc.cpp
index a2196907830513..ab0187acfbf115 100644
--- a/libc/src/stdlib/gpu/malloc.cpp
+++ b/libc/src/stdlib/gpu/malloc.cpp
@@ -14,7 +14,7 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) {
   void *ptr = nullptr;
-  rpc::Client::Port port = rpc::client.open<RPC_MALLOC>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_MALLOC>();
   port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = size; },
                      [&](rpc::Buffer *buffer) {
                        ptr = reinterpret_cast<void *>(buffer->data[0]);
diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
index 674e2cc1ed7499..92b11ddf151a49 100644
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -17,7 +17,7 @@ using namespace LIBC_NAMESPACE;
 // as long as they are mirrored.
 static void test_interface(bool end_with_send) {
   uint64_t cnt = 0;
-  rpc::Client::Port port = rpc::client.open<RPC_TEST_INTERFACE>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_TEST_INTERFACE>();
   port.send([&](rpc::Buffer *buffer) { buffer->data[0] = end_with_send; });
   port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
   port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
index 09a4ae67256e3a..91a242a57c2d3d 100644
--- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
@@ -34,7 +34,7 @@ static void test_stream() {
 
   inline_memcpy(send_ptr, str, send_size);
   ASSERT_TRUE(inline_memcmp(send_ptr, str, send_size) == 0 && "Data mismatch");
-  rpc::Client::Port port = rpc::client.open<RPC_TEST_STREAM>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_TEST_STREAM>();
   port.send_n(send_ptr, send_size);
   port.recv_n(&recv_ptr, &recv_size,
               [](uint64_t size) { return malloc(size); });
@@ -77,7 +77,7 @@ static void test_divergent() {
   inline_memcpy(buffer, &data[offset], offset);
   ASSERT_TRUE(inline_memcmp(buffer, &data[offset], offset) == 0 &&
               "Data mismatch");
-  rpc::Client::Port port = rpc::client.open<RPC_TEST_STREAM>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_TEST_STREAM>();
   port.send_n(buffer, offset);
   inline_memset(buffer, offset, 0);
   port.recv_n(&recv_ptr, &recv_size, [&](uint64_t) { return buffer; });
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index 4032d890c53ec8..fb5a5d29d54b1e 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -18,7 +18,7 @@ static void test_add_simple() {
       10 + 10 * gpu::get_thread_id() + 10 * gpu::get_block_id();
   uint64_t cnt = 0;
   for (uint32_t i = 0; i < num_additions; ++i) {
-    rpc::Client::Port port = rpc::client.open<RPC_TEST_INCREMENT>();
+    rpc::Client<>::Port port = rpc::client.open<RPC_TEST_INCREMENT>();
     port.send_and_recv(
         [=](rpc::Buffer *buffer) {
           reinterpret_cast<uint64_t *>(buffer->data)[0] = cnt;
@@ -33,7 +33,7 @@ static void test_add_simple() {
 
 // Test to ensure that the RPC mechanism doesn't hang on divergence.
 static void test_noop(uint8_t data) {
-  rpc::Client::Port port = rpc::client.open<RPC_NOOP>();
+  rpc::Client<>::Port port = rpc::client.open<RPC_NOOP>();
   port.send([=](rpc::Buffer *buffer) { buffer->data[0] = data; });
   port.close();
 }
diff --git a/libc/test/src/__support/RPC/rpc_smoke_test.cpp b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
index 54821e21f9ccf7..ea40bea0e4c695 100644
--- a/libc/test/src/__support/RPC/rpc_smoke_test.cpp
+++ b/libc/test/src/__support/RPC/rpc_smoke_test.cpp
@@ -13,12 +13,8 @@
 namespace {
 enum { lane_size = 8, port_count = 4 };
 
-struct Packet {
-  uint64_t unused;
-};
-
-using ProcAType = LIBC_NAMESPACE::rpc::Process<false, Packet>;
-using ProcBType = LIBC_NAMESPACE::rpc::Process<true, Packet>;
+using ProcAType = LIBC_NAMESPACE::rpc::Process<false, 1>;
+using ProcBType = LIBC_NAMESPACE::rpc::Process<true, 1>;
 
 static_assert(ProcAType::inbox_offset(port_count) ==
               ProcBType::outbox_offset(port_count));
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index a2e5d0fd5a833f..5567a2f7b5f903 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -249,7 +249,7 @@ struct Device {
       : buffer(buffer), server(std::move(server)), client(num_ports, buffer) {}
   void *buffer;
   Server server;
-  rpc::Client client;
+  rpc::Client<> client;
   std::unordered_map<rpc_opcode_t, rpc_opcode_callback_ty> callbacks;
   std::unordered_map<rpc_opcode_t, void *> callback_data;
 };
@@ -394,7 +394,7 @@ const void *rpc_get_client_buffer(uint32_t device_id) {
   return &state->devices[device_id]->client;
 }
 
-uint64_t rpc_get_client_size() { return sizeof(rpc::Client); }
+uint64_t rpc_get_client_size() { return sizeof(rpc::Client<>); }
 
 using ServerPort = std::variant<rpc::Server<1>::Port *, rpc::Server<32>::Port *,
                                 rpc::Server<64>::Port *>;