[libc] [llvm] [libc] Remove RPC server API and use the header directly (PR #117075)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 21 08:13:03 PST 2024
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/117075
>From 3e8b0f8ef848325d79165285aeeb33b2e9614402 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 20 Nov 2024 13:24:04 -0600
Subject: [PATCH 1/2] [libc] Move RPC interface to `libc/shared` to export it
Summary:
Previous patches have made the `rpc.h` header independent of the `libc`
internals. This allows us to include it directly rather than providing
an indirect C API. This patch only does the work to move the header. A
future patch will pull out the `rpc_server` interface and simply replace
it with a single function that handles the opcodes.
---
libc/{src/__support/RPC => shared}/rpc.h | 162 +++++++++---------
libc/{src/__support/RPC => shared}/rpc_util.h | 109 ++++++------
libc/src/__support/RPC/CMakeLists.txt | 15 --
libc/src/__support/RPC/rpc_client.cpp | 2 +-
libc/src/__support/RPC/rpc_client.h | 8 +-
.../startup/gpu/rpc_interface_test.cpp | 50 ++++--
.../startup/gpu/rpc_stream_test.cpp | 6 +-
.../test/integration/startup/gpu/rpc_test.cpp | 14 +-
libc/utils/gpu/server/rpc_server.cpp | 52 ++++--
9 files changed, 222 insertions(+), 196 deletions(-)
rename libc/{src/__support/RPC => shared}/rpc.h (80%)
rename libc/{src/__support/RPC => shared}/rpc_util.h (61%)
diff --git a/libc/src/__support/RPC/rpc.h b/libc/shared/rpc.h
similarity index 80%
rename from libc/src/__support/RPC/rpc.h
rename to libc/shared/rpc.h
index 30dd2c1a8125d74..489a8cebfb807c4 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/shared/rpc.h
@@ -15,16 +15,17 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
-#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_H
+#ifndef LLVM_LIBC_SHARED_RPC_H
+#define LLVM_LIBC_SHARED_RPC_H
#include "rpc_util.h"
-#include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
#include <stdint.h>
-namespace LIBC_NAMESPACE_DECL {
+#ifndef RPC_INLINE
+#define RPC_INLINE inline
+#endif
+
namespace rpc {
/// Use scoped atomic variants if they are available for the target.
@@ -70,12 +71,12 @@ constexpr static uint64_t MAX_PORT_COUNT = 4096;
/// - The server will always start with a 'recv' operation.
/// - Every 'send' or 'recv' call is mirrored by the other process.
template <bool Invert> struct Process {
- LIBC_INLINE Process() = default;
- LIBC_INLINE Process(const Process &) = delete;
- LIBC_INLINE Process &operator=(const Process &) = delete;
- LIBC_INLINE Process(Process &&) = default;
- LIBC_INLINE Process &operator=(Process &&) = default;
- LIBC_INLINE ~Process() = default;
+ RPC_INLINE Process() = default;
+ RPC_INLINE Process(const Process &) = delete;
+ RPC_INLINE Process &operator=(const Process &) = delete;
+ RPC_INLINE Process(Process &&) = default;
+ RPC_INLINE Process &operator=(Process &&) = default;
+ RPC_INLINE ~Process() = default;
uint32_t port_count = 0;
uint32_t *inbox = nullptr;
@@ -86,7 +87,7 @@ template <bool Invert> struct Process {
static constexpr uint64_t NUM_BITS_IN_WORD = sizeof(uint32_t) * 8;
uint32_t lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
- LIBC_INLINE Process(uint32_t port_count, void *buffer)
+ RPC_INLINE Process(uint32_t port_count, void *buffer)
: port_count(port_count), inbox(reinterpret_cast<uint32_t *>(
advance(buffer, inbox_offset(port_count)))),
outbox(reinterpret_cast<uint32_t *>(
@@ -105,20 +106,20 @@ template <bool Invert> struct Process {
/// Header header[port_count];
/// Buffer packet[port_count][lane_size];
/// };
- LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count,
- uint32_t lane_size) {
+ RPC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count,
+ uint32_t lane_size) {
return buffer_offset(port_count) + buffer_bytes(port_count, lane_size);
}
/// Retrieve the inbox state from memory shared between processes.
- LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
+ RPC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const {
return rpc::broadcast_value(
lane_mask, __scoped_atomic_load_n(&inbox[index], __ATOMIC_RELAXED,
__MEMORY_SCOPE_SYSTEM));
}
/// Retrieve the outbox state from memory shared between processes.
- LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
+ RPC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const {
return rpc::broadcast_value(
lane_mask, __scoped_atomic_load_n(&outbox[index], __ATOMIC_RELAXED,
__MEMORY_SCOPE_SYSTEM));
@@ -128,7 +129,7 @@ template <bool Invert> struct Process {
/// Equivalent to loading outbox followed by store of the inverted value
/// The outbox is write only by this warp and tracking the value locally is
/// cheaper than calling load_outbox to get the value to store.
- LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
+ RPC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
uint32_t inverted_outbox = !current_outbox;
__scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
__scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED,
@@ -138,8 +139,8 @@ template <bool Invert> struct Process {
// Given the current outbox and inbox values, wait until the inbox changes
// to indicate that this thread owns the buffer element.
- LIBC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
- uint32_t outbox, uint32_t in) {
+ RPC_INLINE void wait_for_ownership(uint64_t lane_mask, uint32_t index,
+ uint32_t outbox, uint32_t in) {
while (buffer_unavailable(in, outbox)) {
sleep_briefly();
in = load_inbox(lane_mask, index);
@@ -150,14 +151,14 @@ template <bool Invert> struct Process {
/// The packet is a linearly allocated array of buffers used to communicate
/// with the other process. This function returns the appropriate slot in this
/// array such that the process can operate on an entire warp or wavefront.
- LIBC_INLINE Buffer *get_packet(uint32_t index, uint32_t lane_size) {
+ RPC_INLINE Buffer *get_packet(uint32_t index, uint32_t lane_size) {
return &packet[index * lane_size];
}
/// Determines if this process needs to wait for ownership of the buffer. We
/// invert the condition on one of the processes to indicate that if one
/// process owns the buffer then the other does not.
- LIBC_INLINE static bool buffer_unavailable(uint32_t in, uint32_t out) {
+ RPC_INLINE static bool buffer_unavailable(uint32_t in, uint32_t out) {
bool cond = in != out;
return Invert ? !cond : cond;
}
@@ -166,7 +167,7 @@ template <bool Invert> struct Process {
/// lane_mask is a bitmap of the threads in the warp that would hold the
/// single lock on success, e.g. the result of rpc::get_lane_mask()
/// The lock is held when the n-th bit of the lock bitfield is set.
- LIBC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
+ RPC_INLINE bool try_lock(uint64_t lane_mask, uint32_t index) {
// On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
// On volta, need to handle differences between the threads running and
// the threads that were detected in the previous call to get_lane_mask()
@@ -206,7 +207,7 @@ template <bool Invert> struct Process {
/// Unlock the lock at index. We need a lane sync to keep this function
/// convergent, otherwise the compiler will sink the store and deadlock.
- LIBC_INLINE void unlock(uint64_t lane_mask, uint32_t index) {
+ RPC_INLINE void unlock(uint64_t lane_mask, uint32_t index) {
// Do not move any writes past the unlock.
__scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_DEVICE);
@@ -219,40 +220,40 @@ template <bool Invert> struct Process {
}
/// Number of bytes to allocate for an inbox or outbox.
- LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
+ RPC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
return port_count * sizeof(uint32_t);
}
/// Number of bytes to allocate for the buffer containing the packets.
- LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count,
- uint32_t lane_size) {
+ RPC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count,
+ uint32_t lane_size) {
return port_count * lane_size * sizeof(Buffer);
}
/// Offset of the inbox in memory. This is the same as the outbox if inverted.
- LIBC_INLINE static constexpr uint64_t inbox_offset(uint32_t port_count) {
+ RPC_INLINE static constexpr uint64_t inbox_offset(uint32_t port_count) {
return Invert ? mailbox_bytes(port_count) : 0;
}
/// Offset of the outbox in memory. This is the same as the inbox if inverted.
- LIBC_INLINE static constexpr uint64_t outbox_offset(uint32_t port_count) {
+ RPC_INLINE static constexpr uint64_t outbox_offset(uint32_t port_count) {
return Invert ? 0 : mailbox_bytes(port_count);
}
/// Offset of the array of headers, which follows the inbox and outbox.
- LIBC_INLINE static constexpr uint64_t header_offset(uint32_t port_count) {
+ RPC_INLINE static constexpr uint64_t header_offset(uint32_t port_count) {
return align_up(2 * mailbox_bytes(port_count), alignof(Header));
}
/// Offset of the buffer containing the packets after the inbox and outbox.
- LIBC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
+ RPC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
return align_up(header_offset(port_count) + port_count * sizeof(Header),
alignof(Buffer));
}
/// Conditionally set the n-th bit in the atomic bitfield.
- LIBC_INLINE static constexpr uint32_t set_nth(uint32_t *bits, uint32_t index,
- bool cond) {
+ RPC_INLINE static constexpr uint32_t set_nth(uint32_t *bits, uint32_t index,
+ bool cond) {
uint32_t slot = index / NUM_BITS_IN_WORD;
uint32_t bit = index % NUM_BITS_IN_WORD;
return __scoped_atomic_fetch_or(&bits[slot],
@@ -262,8 +263,8 @@ template <bool Invert> struct Process {
}
/// Conditionally clear the n-th bit in the atomic bitfield.
- LIBC_INLINE static constexpr uint32_t clear_nth(uint32_t *bits,
- uint32_t index, bool cond) {
+ RPC_INLINE static constexpr uint32_t clear_nth(uint32_t *bits, uint32_t index,
+ bool cond) {
uint32_t slot = index / NUM_BITS_IN_WORD;
uint32_t bit = index % NUM_BITS_IN_WORD;
return __scoped_atomic_fetch_and(&bits[slot],
@@ -275,8 +276,8 @@ template <bool Invert> struct Process {
/// Invokes a function on every active buffer across the total lane size.
template <typename F>
-LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
- uint64_t lane_mask, Buffer *slot) {
+RPC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
+ uint64_t lane_mask, Buffer *slot) {
if constexpr (is_process_gpu()) {
fn(&slot[rpc::get_lane_id()], rpc::get_lane_id());
} else {
@@ -290,40 +291,40 @@ LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size,
/// processes. A port is conceptually an index into the memory provided by the
/// underlying process that is guarded by a lock bit.
template <bool T> struct Port {
- LIBC_INLINE Port(Process<T> &process, uint64_t lane_mask, uint32_t lane_size,
- uint32_t index, uint32_t out)
+ RPC_INLINE Port(Process<T> &process, uint64_t lane_mask, uint32_t lane_size,
+ uint32_t index, uint32_t out)
: process(process), lane_mask(lane_mask), lane_size(lane_size),
index(index), out(out), receive(false), owns_buffer(true) {}
- LIBC_INLINE ~Port() = default;
+ RPC_INLINE ~Port() = default;
private:
- LIBC_INLINE Port(const Port &) = delete;
- LIBC_INLINE Port &operator=(const Port &) = delete;
- LIBC_INLINE Port(Port &&) = default;
- LIBC_INLINE Port &operator=(Port &&) = default;
+ RPC_INLINE Port(const Port &) = delete;
+ RPC_INLINE Port &operator=(const Port &) = delete;
+ RPC_INLINE Port(Port &&) = default;
+ RPC_INLINE Port &operator=(Port &&) = default;
friend struct Client;
friend struct Server;
friend class rpc::optional<Port<T>>;
public:
- template <typename U> LIBC_INLINE void recv(U use);
- template <typename F> LIBC_INLINE void send(F fill);
+ template <typename U> RPC_INLINE void recv(U use);
+ template <typename F> RPC_INLINE void send(F fill);
template <typename F, typename U>
- LIBC_INLINE void send_and_recv(F fill, U use);
- template <typename W> LIBC_INLINE void recv_and_send(W work);
- LIBC_INLINE void send_n(const void *const *src, uint64_t *size);
- LIBC_INLINE void send_n(const void *src, uint64_t size);
+ RPC_INLINE void send_and_recv(F fill, U use);
+ template <typename W> RPC_INLINE void recv_and_send(W work);
+ RPC_INLINE void send_n(const void *const *src, uint64_t *size);
+ RPC_INLINE void send_n(const void *src, uint64_t size);
template <typename A>
- LIBC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
+ RPC_INLINE void recv_n(void **dst, uint64_t *size, A &&alloc);
- LIBC_INLINE uint32_t get_opcode() const {
+ RPC_INLINE uint32_t get_opcode() const {
return process.header[index].opcode;
}
- LIBC_INLINE uint32_t get_index() const { return index; }
+ RPC_INLINE uint32_t get_index() const { return index; }
- LIBC_INLINE void close() {
+ RPC_INLINE void close() {
// Wait for all lanes to finish using the port.
rpc::sync_lane(lane_mask);
@@ -346,16 +347,16 @@ template <bool T> struct Port {
/// The RPC client used to make requests to the server.
struct Client {
- LIBC_INLINE Client() = default;
- LIBC_INLINE Client(const Client &) = delete;
- LIBC_INLINE Client &operator=(const Client &) = delete;
- LIBC_INLINE ~Client() = default;
+ RPC_INLINE Client() = default;
+ RPC_INLINE Client(const Client &) = delete;
+ RPC_INLINE Client &operator=(const Client &) = delete;
+ RPC_INLINE ~Client() = default;
- LIBC_INLINE Client(uint32_t port_count, void *buffer)
+ RPC_INLINE Client(uint32_t port_count, void *buffer)
: process(port_count, buffer) {}
using Port = rpc::Port<false>;
- template <uint32_t opcode> LIBC_INLINE Port open();
+ template <uint32_t opcode> RPC_INLINE Port open();
private:
Process<false> process;
@@ -363,21 +364,21 @@ struct Client {
/// The RPC server used to respond to the client.
struct Server {
- LIBC_INLINE Server() = default;
- LIBC_INLINE Server(const Server &) = delete;
- LIBC_INLINE Server &operator=(const Server &) = delete;
- LIBC_INLINE ~Server() = default;
+ RPC_INLINE Server() = default;
+ RPC_INLINE Server(const Server &) = delete;
+ RPC_INLINE Server &operator=(const Server &) = delete;
+ RPC_INLINE ~Server() = default;
- LIBC_INLINE Server(uint32_t port_count, void *buffer)
+ RPC_INLINE Server(uint32_t port_count, void *buffer)
: process(port_count, buffer) {}
using Port = rpc::Port<true>;
- LIBC_INLINE rpc::optional<Port> try_open(uint32_t lane_size,
- uint32_t start = 0);
- LIBC_INLINE Port open(uint32_t lane_size);
+ RPC_INLINE rpc::optional<Port> try_open(uint32_t lane_size,
+ uint32_t start = 0);
+ RPC_INLINE Port open(uint32_t lane_size);
- LIBC_INLINE static uint64_t allocation_size(uint32_t lane_size,
- uint32_t port_count) {
+ RPC_INLINE static uint64_t allocation_size(uint32_t lane_size,
+ uint32_t port_count) {
return Process<true>::allocation_size(port_count, lane_size);
}
@@ -386,7 +387,7 @@ struct Server {
};
/// Applies \p fill to the shared buffer and initiates a send operation.
-template <bool T> template <typename F> LIBC_INLINE void Port<T>::send(F fill) {
+template <bool T> template <typename F> RPC_INLINE void Port<T>::send(F fill) {
uint32_t in = owns_buffer ? out ^ T : process.load_inbox(lane_mask, index);
// We need to wait until we own the buffer before sending.
@@ -401,7 +402,7 @@ template <bool T> template <typename F> LIBC_INLINE void Port<T>::send(F fill) {
}
/// Applies \p use to the shared buffer and acknowledges the send.
-template <bool T> template <typename U> LIBC_INLINE void Port<T>::recv(U use) {
+template <bool T> template <typename U> RPC_INLINE void Port<T>::recv(U use) {
// We only exchange ownership of the buffer during a receive if we are waiting
// for a previous receive to finish.
if (receive) {
@@ -424,7 +425,7 @@ template <bool T> template <typename U> LIBC_INLINE void Port<T>::recv(U use) {
/// Combines a send and receive into a single function.
template <bool T>
template <typename F, typename U>
-LIBC_INLINE void Port<T>::send_and_recv(F fill, U use) {
+RPC_INLINE void Port<T>::send_and_recv(F fill, U use) {
send(fill);
recv(use);
}
@@ -434,7 +435,7 @@ LIBC_INLINE void Port<T>::send_and_recv(F fill, U use) {
/// the copy back.
template <bool T>
template <typename W>
-LIBC_INLINE void Port<T>::recv_and_send(W work) {
+RPC_INLINE void Port<T>::recv_and_send(W work) {
recv(work);
send([](Buffer *, uint32_t) { /* no-op */ });
}
@@ -442,7 +443,7 @@ LIBC_INLINE void Port<T>::recv_and_send(W work) {
/// Helper routine to simplify the interface when sending from the GPU using
/// thread private pointers to the underlying value.
template <bool T>
-LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
+RPC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
const void **src_ptr = &src;
uint64_t *size_ptr = &size;
send_n(src_ptr, size_ptr);
@@ -451,7 +452,7 @@ LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
/// Sends an arbitrarily sized data buffer \p src across the shared channel in
/// multiples of the packet length.
template <bool T>
-LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
+RPC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
uint64_t num_sends = 0;
send([&](Buffer *buffer, uint32_t id) {
reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
@@ -482,7 +483,7 @@ LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
/// size of the data so that we can initialize the size of the \p dst buffer.
template <bool T>
template <typename A>
-LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
+RPC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
uint64_t num_recvs = 0;
recv([&](Buffer *buffer, uint32_t id) {
lane_value(size, id) = reinterpret_cast<uint64_t *>(buffer->data)[0];
@@ -516,7 +517,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
/// port. Each port instance uses an associated \p opcode to tell the server
/// what to do. The Client interface provides the appropriate lane size to the
/// port using the platform's returned value.
-template <uint32_t opcode> LIBC_INLINE Client::Port Client::open() {
+template <uint32_t opcode> RPC_INLINE Client::Port Client::open() {
// Repeatedly perform a naive linear scan for a port that can be opened to
// send data.
for (uint32_t index = 0;; ++index) {
@@ -550,7 +551,7 @@ template <uint32_t opcode> LIBC_INLINE Client::Port Client::open() {
/// Attempts to open a port to use as the server. The server can only open a
/// port if it has a pending receive operation
-LIBC_INLINE rpc::optional<typename Server::Port>
+RPC_INLINE rpc::optional<typename Server::Port>
Server::try_open(uint32_t lane_size, uint32_t start) {
// Perform a naive linear scan for a port that has a pending request.
for (uint32_t index = start; index < process.port_count; ++index) {
@@ -580,7 +581,7 @@ Server::try_open(uint32_t lane_size, uint32_t start) {
return rpc::nullopt;
}
-LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
+RPC_INLINE Server::Port Server::open(uint32_t lane_size) {
for (;;) {
if (rpc::optional<Server::Port> p = try_open(lane_size))
return rpc::move(p.value());
@@ -599,6 +600,5 @@ LIBC_INLINE Server::Port Server::open(uint32_t lane_size) {
#endif
} // namespace rpc
-} // namespace LIBC_NAMESPACE_DECL
-#endif
+#endif // LLVM_LIBC_SHARED_RPC_H
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/shared/rpc_util.h
similarity index 61%
rename from libc/src/__support/RPC/rpc_util.h
rename to libc/shared/rpc_util.h
index 7067dfc974eb31f..502014d839ae949 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/shared/rpc_util.h
@@ -6,11 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
-#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
-
-#include "src/__support/macros/attributes.h"
-#include "src/__support/macros/config.h"
+#ifndef LLVM_LIBC_SHARED_RPC_UTIL_H
+#define LLVM_LIBC_SHARED_RPC_UTIL_H
#include <stddef.h>
#include <stdint.h>
@@ -20,7 +17,10 @@
#define RPC_TARGET_IS_GPU
#endif
-namespace LIBC_NAMESPACE_DECL {
+#ifndef RPC_INLINE
+#define RPC_INLINE inline
+#endif
+
namespace rpc {
template <typename T> struct type_identity {
@@ -40,26 +40,26 @@ template <class T> struct is_const<const T> : type_constant<bool, true> {};
/// Freestanding implementation of std::move.
template <class T>
-LIBC_INLINE constexpr typename remove_reference<T>::type &&move(T &&t) {
+RPC_INLINE constexpr typename remove_reference<T>::type &&move(T &&t) {
return static_cast<typename remove_reference<T>::type &&>(t);
}
/// Freestanding implementation of std::forward.
template <typename T>
-LIBC_INLINE constexpr T &&forward(typename remove_reference<T>::type &value) {
+RPC_INLINE constexpr T &&forward(typename remove_reference<T>::type &value) {
return static_cast<T &&>(value);
}
template <typename T>
-LIBC_INLINE constexpr T &&forward(typename remove_reference<T>::type &&value) {
+RPC_INLINE constexpr T &&forward(typename remove_reference<T>::type &&value) {
return static_cast<T &&>(value);
}
struct in_place_t {
- LIBC_INLINE explicit in_place_t() = default;
+ RPC_INLINE explicit in_place_t() = default;
};
struct nullopt_t {
- LIBC_INLINE constexpr explicit nullopt_t() = default;
+ RPC_INLINE constexpr explicit nullopt_t() = default;
};
constexpr inline in_place_t in_place{};
@@ -75,15 +75,15 @@ template <typename T> class optional {
bool in_use = false;
- LIBC_INLINE ~OptionalStorage() { reset(); }
+ RPC_INLINE ~OptionalStorage() { reset(); }
- LIBC_INLINE constexpr OptionalStorage() : empty() {}
+ RPC_INLINE constexpr OptionalStorage() : empty() {}
template <typename... Args>
- LIBC_INLINE constexpr explicit OptionalStorage(in_place_t, Args &&...args)
+ RPC_INLINE constexpr explicit OptionalStorage(in_place_t, Args &&...args)
: stored_value(forward<Args>(args)...) {}
- LIBC_INLINE constexpr void reset() {
+ RPC_INLINE constexpr void reset() {
if (in_use)
stored_value.~U();
in_use = false;
@@ -93,60 +93,54 @@ template <typename T> class optional {
OptionalStorage<T> storage;
public:
- LIBC_INLINE constexpr optional() = default;
- LIBC_INLINE constexpr optional(nullopt_t) {}
+ RPC_INLINE constexpr optional() = default;
+ RPC_INLINE constexpr optional(nullopt_t) {}
- LIBC_INLINE constexpr optional(const T &t) : storage(in_place, t) {
+ RPC_INLINE constexpr optional(const T &t) : storage(in_place, t) {
storage.in_use = true;
}
- LIBC_INLINE constexpr optional(const optional &) = default;
+ RPC_INLINE constexpr optional(const optional &) = default;
- LIBC_INLINE constexpr optional(T &&t) : storage(in_place, move(t)) {
+ RPC_INLINE constexpr optional(T &&t) : storage(in_place, move(t)) {
storage.in_use = true;
}
- LIBC_INLINE constexpr optional(optional &&O) = default;
+ RPC_INLINE constexpr optional(optional &&O) = default;
- LIBC_INLINE constexpr optional &operator=(T &&t) {
+ RPC_INLINE constexpr optional &operator=(T &&t) {
storage = move(t);
return *this;
}
- LIBC_INLINE constexpr optional &operator=(optional &&) = default;
+ RPC_INLINE constexpr optional &operator=(optional &&) = default;
- LIBC_INLINE constexpr optional &operator=(const T &t) {
+ RPC_INLINE constexpr optional &operator=(const T &t) {
storage = t;
return *this;
}
- LIBC_INLINE constexpr optional &operator=(const optional &) = default;
+ RPC_INLINE constexpr optional &operator=(const optional &) = default;
- LIBC_INLINE constexpr void reset() { storage.reset(); }
+ RPC_INLINE constexpr void reset() { storage.reset(); }
- LIBC_INLINE constexpr const T &value() const & {
- return storage.stored_value;
- }
+ RPC_INLINE constexpr const T &value() const & { return storage.stored_value; }
- LIBC_INLINE constexpr T &value() & { return storage.stored_value; }
+ RPC_INLINE constexpr T &value() & { return storage.stored_value; }
- LIBC_INLINE constexpr explicit operator bool() const {
- return storage.in_use;
- }
- LIBC_INLINE constexpr bool has_value() const { return storage.in_use; }
- LIBC_INLINE constexpr const T *operator->() const {
+ RPC_INLINE constexpr explicit operator bool() const { return storage.in_use; }
+ RPC_INLINE constexpr bool has_value() const { return storage.in_use; }
+ RPC_INLINE constexpr const T *operator->() const {
return &storage.stored_value;
}
- LIBC_INLINE constexpr T *operator->() { return &storage.stored_value; }
- LIBC_INLINE constexpr const T &operator*() const & {
+ RPC_INLINE constexpr T *operator->() { return &storage.stored_value; }
+ RPC_INLINE constexpr const T &operator*() const & {
return storage.stored_value;
}
- LIBC_INLINE constexpr T &operator*() & { return storage.stored_value; }
+ RPC_INLINE constexpr T &operator*() & { return storage.stored_value; }
- LIBC_INLINE constexpr T &&value() && { return move(storage.stored_value); }
- LIBC_INLINE constexpr T &&operator*() && {
- return move(storage.stored_value);
- }
+ RPC_INLINE constexpr T &&value() && { return move(storage.stored_value); }
+ RPC_INLINE constexpr T &&operator*() && { return move(storage.stored_value); }
};
/// Suspend the thread briefly to assist the thread scheduler during busy loops.
-LIBC_INLINE void sleep_briefly() {
+RPC_INLINE void sleep_briefly() {
#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
if (__nvvm_reflect("__CUDA_ARCH") >= 700)
asm("nanosleep.u32 64;" ::: "memory");
@@ -164,7 +158,7 @@ LIBC_INLINE void sleep_briefly() {
}
/// Conditional to indicate if this process is running on the GPU.
-LIBC_INLINE constexpr bool is_process_gpu() {
+RPC_INLINE constexpr bool is_process_gpu() {
#ifdef RPC_TARGET_IS_GPU
return true;
#else
@@ -173,14 +167,14 @@ LIBC_INLINE constexpr bool is_process_gpu() {
}
/// Wait for all lanes in the group to complete.
-LIBC_INLINE void sync_lane(uint64_t lane_mask) {
+RPC_INLINE void sync_lane(uint64_t lane_mask) {
#ifdef RPC_TARGET_IS_GPU
return __gpu_sync_lane(lane_mask);
#endif
}
/// Copies the value from the first active thread to the rest.
-LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
+RPC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
#ifdef RPC_TARGET_IS_GPU
return __gpu_read_first_lane_u32(lane_mask, x);
#else
@@ -189,7 +183,7 @@ LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask, uint32_t x) {
}
/// Returns the number of lanes that participate in the RPC interface.
-LIBC_INLINE uint32_t get_num_lanes() {
+RPC_INLINE uint32_t get_num_lanes() {
#ifdef RPC_TARGET_IS_GPU
return __gpu_num_lanes();
#else
@@ -198,7 +192,7 @@ LIBC_INLINE uint32_t get_num_lanes() {
}
/// Returns a bitmask of the threads in the current warp or wavefront executing together.
-LIBC_INLINE uint64_t get_lane_mask() {
+RPC_INLINE uint64_t get_lane_mask() {
#ifdef RPC_TARGET_IS_GPU
return __gpu_lane_mask();
#else
@@ -207,7 +201,7 @@ LIBC_INLINE uint64_t get_lane_mask() {
}
/// Returns the id of the thread inside of an AMD wavefront executing together.
-LIBC_INLINE uint32_t get_lane_id() {
+RPC_INLINE uint32_t get_lane_id() {
#ifdef RPC_TARGET_IS_GPU
return __gpu_lane_id();
#else
@@ -216,7 +210,7 @@ LIBC_INLINE uint32_t get_lane_id() {
}
/// Conditional that is only true for a single thread in a lane.
-LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
+RPC_INLINE bool is_first_lane(uint64_t lane_mask) {
#ifdef RPC_TARGET_IS_GPU
return __gpu_is_first_in_lane(lane_mask);
#else
@@ -225,7 +219,7 @@ LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
}
/// Returns a bitmask of threads in the current lane for which \p x is true.
-LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
+RPC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
#ifdef RPC_TARGET_IS_GPU
return __gpu_ballot(lane_mask, x);
#else
@@ -235,7 +229,7 @@ LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
/// Return \p val aligned "upwards" according to \p align.
template <typename V, typename A>
-LIBC_INLINE constexpr V align_up(V val, A align) {
+RPC_INLINE constexpr V align_up(V val, A align) {
return ((val + V(align) - 1) / V(align)) * V(align);
}
@@ -243,14 +237,14 @@ LIBC_INLINE constexpr V align_up(V val, A align) {
/// model. On the GPU stack variables are always private to a lane so we can
/// simply use the variable passed in. On the CPU we need to allocate enough
/// space for the whole lane and index into it.
-template <typename V> LIBC_INLINE V &lane_value(V *val, uint32_t id) {
+template <typename V> RPC_INLINE V &lane_value(V *val, uint32_t id) {
if constexpr (is_process_gpu())
return *val;
return val[id];
}
/// Advance the \p ptr by \p bytes.
-template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
+template <typename T, typename U> RPC_INLINE T *advance(T *ptr, U bytes) {
if constexpr (is_const<T>::value)
return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) +
bytes);
@@ -259,15 +253,14 @@ template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
}
/// Wrapper around the optimal memory copy implementation for the target.
-LIBC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
+RPC_INLINE void rpc_memcpy(void *dst, const void *src, size_t count) {
__builtin_memcpy(dst, src, count);
}
-template <class T> LIBC_INLINE constexpr const T &max(const T &a, const T &b) {
+template <class T> RPC_INLINE constexpr const T &max(const T &a, const T &b) {
return (a < b) ? b : a;
}
} // namespace rpc
-} // namespace LIBC_NAMESPACE_DECL
-#endif // LLVM_LIBC_SRC___SUPPORT_RPC_RPC_UTIL_H
+#endif // LLVM_LIBC_SHARED_RPC_UTIL_H
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
index 183fc6f8683e067..0a7141fb60bf039 100644
--- a/libc/src/__support/RPC/CMakeLists.txt
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -2,20 +2,6 @@ if(NOT LIBC_TARGET_OS_IS_GPU)
return()
endif()
-add_header_library(
- rpc
- HDRS
- rpc.h
- rpc_util.h
- DEPENDS
- libc.src.__support.common
- libc.src.__support.CPP.algorithm
- libc.src.__support.CPP.atomic
- libc.src.__support.CPP.functional
- libc.src.__support.CPP.optional
- libc.src.__support.GPU.utils
-)
-
add_object_library(
rpc_client
SRCS
@@ -25,5 +11,4 @@ add_object_library(
DEPENDS
libc.include.gpu_rpc
libc.src.__support.GPU.utils
- .rpc
)
diff --git a/libc/src/__support/RPC/rpc_client.cpp b/libc/src/__support/RPC/rpc_client.cpp
index 232b20d008d1d59..c26cf9ca2ddbe6f 100644
--- a/libc/src/__support/RPC/rpc_client.cpp
+++ b/libc/src/__support/RPC/rpc_client.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "rpc_client.h"
-#include "rpc.h"
+
#include "src/__support/macros/config.h"
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
index 7bd6d0b5e00b478..8923e62e0e22a0b 100644
--- a/libc/src/__support/RPC/rpc_client.h
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -9,7 +9,7 @@
#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_RPC_CLIENT_H
#define LLVM_LIBC_SRC___SUPPORT_RPC_RPC_CLIENT_H
-#include "rpc.h"
+#include "shared/rpc.h"
#include "include/llvm-libc-types/rpc_opcodes_t.h"
#include "src/__support/CPP/type_traits.h"
@@ -18,6 +18,12 @@
namespace LIBC_NAMESPACE_DECL {
namespace rpc {
+using ::rpc::Buffer;
+using ::rpc::Client;
+using ::rpc::Port;
+using ::rpc::Process;
+using ::rpc::Server;
+
static_assert(cpp::is_trivially_copyable<Client>::value &&
sizeof(Process<true>) == sizeof(Process<false>),
"The client is not trivially copyable from the server");
diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
index 2dafa911783ffc4..b05ffb92699bf79 100644
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -17,27 +17,43 @@ using namespace LIBC_NAMESPACE;
// as long as they are mirrored.
static void test_interface(bool end_with_send) {
uint64_t cnt = 0;
- rpc::Client::Port port = rpc::client.open<RPC_TEST_INTERFACE>();
- port.send(
- [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = end_with_send; });
- port.send(
- [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
- port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
- port.send(
- [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
- port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
- port.send(
- [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
- port.send(
- [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
- port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
- port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+ LIBC_NAMESPACE::rpc::Client::Port port =
+ LIBC_NAMESPACE::rpc::client.open<RPC_TEST_INTERFACE>();
+ port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = end_with_send;
+ });
+ port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+ port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ cnt = buffer->data[0];
+ });
+ port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+ port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ cnt = buffer->data[0];
+ });
+ port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+ port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+ port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ cnt = buffer->data[0];
+ });
+ port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ cnt = buffer->data[0];
+ });
if (end_with_send)
- port.send([&](rpc::Buffer *buffer, uint32_t) {
+ port.send([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
buffer->data[0] = cnt = cnt + 1;
});
else
- port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+ port.recv([&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ cnt = buffer->data[0];
+ });
port.close();
ASSERT_TRUE(cnt == 9 && "Invalid number of increments");
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
index 09a4ae67256e3a6..208130bcfd9a96e 100644
--- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
@@ -34,7 +34,8 @@ static void test_stream() {
inline_memcpy(send_ptr, str, send_size);
ASSERT_TRUE(inline_memcmp(send_ptr, str, send_size) == 0 && "Data mismatch");
- rpc::Client::Port port = rpc::client.open<RPC_TEST_STREAM>();
+ LIBC_NAMESPACE::rpc::Client::Port port =
+ LIBC_NAMESPACE::rpc::client.open<RPC_TEST_STREAM>();
port.send_n(send_ptr, send_size);
port.recv_n(&recv_ptr, &recv_size,
[](uint64_t size) { return malloc(size); });
@@ -77,7 +78,8 @@ static void test_divergent() {
inline_memcpy(buffer, &data[offset], offset);
ASSERT_TRUE(inline_memcmp(buffer, &data[offset], offset) == 0 &&
"Data mismatch");
- rpc::Client::Port port = rpc::client.open<RPC_TEST_STREAM>();
+ LIBC_NAMESPACE::rpc::Client::Port port =
+ LIBC_NAMESPACE::rpc::client.open<RPC_TEST_STREAM>();
port.send_n(buffer, offset);
inline_memset(buffer, offset, 0);
port.recv_n(&recv_ptr, &recv_size, [&](uint64_t) { return buffer; });
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index bec8171180a0550..3deb72b9f85dab7 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -18,12 +18,13 @@ static void test_add_simple() {
10 + 10 * gpu::get_thread_id() + 10 * gpu::get_block_id();
uint64_t cnt = 0;
for (uint32_t i = 0; i < num_additions; ++i) {
- rpc::Client::Port port = rpc::client.open<RPC_TEST_INCREMENT>();
+ LIBC_NAMESPACE::rpc::Client::Port port =
+ LIBC_NAMESPACE::rpc::client.open<RPC_TEST_INCREMENT>();
port.send_and_recv(
- [=](rpc::Buffer *buffer, uint32_t) {
+ [=](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
reinterpret_cast<uint64_t *>(buffer->data)[0] = cnt;
},
- [&](rpc::Buffer *buffer, uint32_t) {
+ [&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
cnt = reinterpret_cast<uint64_t *>(buffer->data)[0];
});
port.close();
@@ -33,8 +34,11 @@ static void test_add_simple() {
// Test to ensure that the RPC mechanism doesn't hang on divergence.
static void test_noop(uint8_t data) {
- rpc::Client::Port port = rpc::client.open<RPC_NOOP>();
- port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = data; });
+ LIBC_NAMESPACE::rpc::Client::Port port =
+ LIBC_NAMESPACE::rpc::client.open<RPC_NOOP>();
+ port.send([=](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = data;
+ });
port.close();
}
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index 972601aaf1d5e0f..d877cbc25a13d01 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -14,15 +14,16 @@
// Make sure these are included first so they don't conflict with the system.
#include <limits.h>
+#include "shared/rpc.h"
+
#include "llvmlibc_rpc_server.h"
-#include "src/__support/RPC/rpc.h"
+#include "include/llvm-libc-types/rpc_opcodes_t.h"
#include "src/__support/arg_list.h"
#include "src/stdio/printf_core/converter.h"
#include "src/stdio/printf_core/parser.h"
#include "src/stdio/printf_core/writer.h"
-#include "src/stdio/gpu/file.h"
#include <algorithm>
#include <atomic>
#include <cstdio>
@@ -53,6 +54,26 @@ struct TempStorage {
};
} // namespace
+enum Stream {
+ File = 0,
+ Stdin = 1,
+ Stdout = 2,
+ Stderr = 3,
+};
+
+// Get the associated stream out of an encoded number.
+LIBC_INLINE ::FILE *to_stream(uintptr_t f) {
+ ::FILE *stream = reinterpret_cast<FILE *>(f & ~0x3ull);
+ Stream type = static_cast<Stream>(f & 0x3ull);
+ if (type == Stdin)
+ return stdin;
+ if (type == Stdout)
+ return stdout;
+ if (type == Stderr)
+ return stderr;
+ return stream;
+}
+
template <bool packed, uint32_t lane_size>
static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
FILE *files[lane_size] = {nullptr};
@@ -260,7 +281,7 @@ rpc_status_t handle_server_impl(
port->recv([&](rpc::Buffer *buffer, uint32_t id) {
data[id] = temp_storage.alloc(buffer->data[0]);
sizes[id] =
- fread(data[id], 1, buffer->data[0], file::to_stream(buffer->data[1]));
+ fread(data[id], 1, buffer->data[0], to_stream(buffer->data[1]));
});
port->send_n(data, sizes);
port->send([&](rpc::Buffer *buffer, uint32_t id) {
@@ -273,9 +294,8 @@ rpc_status_t handle_server_impl(
void *data[lane_size] = {nullptr};
port->recv([&](rpc::Buffer *buffer, uint32_t id) {
data[id] = temp_storage.alloc(buffer->data[0]);
- const char *str =
- fgets(reinterpret_cast<char *>(data[id]), buffer->data[0],
- file::to_stream(buffer->data[1]));
+ const char *str = fgets(reinterpret_cast<char *>(data[id]),
+ buffer->data[0], to_stream(buffer->data[1]));
sizes[id] = !str ? 0 : std::strlen(str) + 1;
});
port->send_n(data, sizes);
@@ -335,46 +355,46 @@ rpc_status_t handle_server_impl(
}
case RPC_FEOF: {
port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
- buffer->data[0] = feof(file::to_stream(buffer->data[0]));
+ buffer->data[0] = feof(to_stream(buffer->data[0]));
});
break;
}
case RPC_FERROR: {
port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
- buffer->data[0] = ferror(file::to_stream(buffer->data[0]));
+ buffer->data[0] = ferror(to_stream(buffer->data[0]));
});
break;
}
case RPC_CLEARERR: {
port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
- clearerr(file::to_stream(buffer->data[0]));
+ clearerr(to_stream(buffer->data[0]));
});
break;
}
case RPC_FSEEK: {
port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
- buffer->data[0] = fseek(file::to_stream(buffer->data[0]),
- static_cast<long>(buffer->data[1]),
- static_cast<int>(buffer->data[2]));
+ buffer->data[0] =
+ fseek(to_stream(buffer->data[0]), static_cast<long>(buffer->data[1]),
+ static_cast<int>(buffer->data[2]));
});
break;
}
case RPC_FTELL: {
port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
- buffer->data[0] = ftell(file::to_stream(buffer->data[0]));
+ buffer->data[0] = ftell(to_stream(buffer->data[0]));
});
break;
}
case RPC_FFLUSH: {
port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
- buffer->data[0] = fflush(file::to_stream(buffer->data[0]));
+ buffer->data[0] = fflush(to_stream(buffer->data[0]));
});
break;
}
case RPC_UNGETC: {
port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
- buffer->data[0] = ungetc(static_cast<int>(buffer->data[0]),
- file::to_stream(buffer->data[1]));
+ buffer->data[0] =
+ ungetc(static_cast<int>(buffer->data[0]), to_stream(buffer->data[1]));
});
break;
}
>From 1144109437f9575519847005c68ad67acc15e5fd Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 20 Nov 2024 16:45:37 -0600
Subject: [PATCH 2/2] [libc] Remove RPC server API and use the header directly
Summary:
This patch removes much of the `llvmlibc_rpc_server` interface. This
pretty much deletes all of this code and just replaces it with including
`rpc.h` directly. We still maintain the file to let `libc` handle the
opcodes, since those depend on the `printf` implementation.
This will need to be cleaned up more, but I don't want to put too much
into a single patch.
---
libc/shared/rpc.h | 7 +
libc/utils/gpu/loader/Loader.h | 204 +++++-----
.../utils/gpu/loader/amdgpu/amdhsa-loader.cpp | 123 +++----
libc/utils/gpu/loader/nvptx/nvptx-loader.cpp | 96 ++---
libc/utils/gpu/server/llvmlibc_rpc_server.h | 94 +----
libc/utils/gpu/server/rpc_server.cpp | 348 +++++-------------
offload/plugins-nextgen/common/CMakeLists.txt | 1 +
offload/plugins-nextgen/common/include/RPC.h | 2 +-
offload/plugins-nextgen/common/src/RPC.cpp | 126 +++----
9 files changed, 339 insertions(+), 662 deletions(-)
diff --git a/libc/shared/rpc.h b/libc/shared/rpc.h
index 489a8cebfb807c4..c5e4277286c3999 100644
--- a/libc/shared/rpc.h
+++ b/libc/shared/rpc.h
@@ -42,6 +42,13 @@ namespace rpc {
#define __scoped_atomic_thread_fence(ord, scp) __atomic_thread_fence(ord)
#endif
+/// Generic codes that can be used when implementing the server.
+enum Status {
+ SUCCESS = 0x0,
+ ERROR = 0x1000,
+ UNHANDLED_OPCODE = 0x1001,
+};
+
/// A fixed size channel used to communicate between the RPC client and server.
struct Buffer {
uint64_t data[8];
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 8be8c0d5f85532f..fd5105b34709e6c 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -13,6 +13,7 @@
#include "include/llvm-libc-types/rpc_opcodes_t.h"
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "shared/rpc.h"
#include <cstddef>
#include <cstdint>
@@ -103,129 +104,90 @@ inline void handle_error_impl(const char *file, int32_t line, const char *msg) {
fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, msg);
exit(EXIT_FAILURE);
}
-
-inline void handle_error_impl(const char *file, int32_t line,
- rpc_status_t err) {
- fprintf(stderr, "%s:%d:0: Error: %d\n", file, line, err);
- exit(EXIT_FAILURE);
-}
#define handle_error(X) handle_error_impl(__FILE__, __LINE__, X)
-template <uint32_t lane_size>
-inline void register_rpc_callbacks(rpc_device_t device) {
- static_assert(lane_size == 32 || lane_size == 64, "Invalid Lane size");
- // Register the ping test for the `libc` tests.
- rpc_register_callback(
- device, static_cast<rpc_opcode_t>(RPC_TEST_INCREMENT),
- [](rpc_port_t port, void *data) {
- rpc_recv_and_send(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
- },
- data);
- },
- nullptr);
-
- // Register the interface test callbacks.
- rpc_register_callback(
- device, static_cast<rpc_opcode_t>(RPC_TEST_INTERFACE),
- [](rpc_port_t port, void *data) {
- uint64_t cnt = 0;
- bool end_with_recv;
- rpc_recv(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- *reinterpret_cast<bool *>(data) = buffer->data[0];
- },
- &end_with_recv);
- rpc_recv(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
- },
- &cnt);
- rpc_send(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
- buffer->data[0] = cnt = cnt + 1;
- },
- &cnt);
- rpc_recv(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
- },
- &cnt);
- rpc_send(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
- buffer->data[0] = cnt = cnt + 1;
- },
- &cnt);
- rpc_recv(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
- },
- &cnt);
- rpc_recv(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
- },
- &cnt);
- rpc_send(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
- buffer->data[0] = cnt = cnt + 1;
- },
- &cnt);
- rpc_send(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
- buffer->data[0] = cnt = cnt + 1;
- },
- &cnt);
- if (end_with_recv)
- rpc_recv(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
- },
- &cnt);
- else
- rpc_send(
- port,
- [](rpc_buffer_t *buffer, void *data) {
- uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
- buffer->data[0] = cnt = cnt + 1;
- },
- &cnt);
- },
- nullptr);
-
- // Register the stream test handler.
- rpc_register_callback(
- device, static_cast<rpc_opcode_t>(RPC_TEST_STREAM),
- [](rpc_port_t port, void *data) {
- uint64_t sizes[lane_size] = {0};
- void *dst[lane_size] = {nullptr};
- rpc_recv_n(
- port, dst, sizes,
- [](uint64_t size, void *) -> void * { return new char[size]; },
- nullptr);
- rpc_send_n(port, dst, sizes);
- for (uint64_t i = 0; i < lane_size; ++i) {
- if (dst[i])
- delete[] reinterpret_cast<uint8_t *>(dst[i]);
- }
- },
- nullptr);
+template <uint32_t num_lanes, typename Alloc, typename Free>
+inline uint32_t handle_server(rpc::Server &server, uint32_t index,
+ Alloc &&alloc, Free &&free) {
+ auto port = server.try_open(num_lanes, index);
+ if (!port)
+ return 0;
+ index = port->get_index() + 1;
+
+ switch (port->get_opcode()) {
+ case RPC_TEST_INCREMENT: {
+ port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+ reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
+ });
+ break;
+ }
+ case RPC_TEST_INTERFACE: {
+ bool end_with_recv;
+ uint64_t cnt;
+ port->recv([&](rpc::Buffer *buffer, uint32_t) {
+ end_with_recv = buffer->data[0];
+ });
+ port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+ port->send([&](rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+ port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+ port->send([&](rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+ port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+ port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+ port->send([&](rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+ port->send([&](rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+ if (end_with_recv)
+ port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+ else
+ port->send([&](rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = cnt = cnt + 1;
+ });
+
+ break;
+ }
+ case RPC_TEST_STREAM: {
+ uint64_t sizes[num_lanes] = {0};
+ void *dst[num_lanes] = {nullptr};
+ port->recv_n(dst, sizes,
+ [](uint64_t size) -> void * { return new char[size]; });
+ port->send_n(dst, sizes);
+ for (uint64_t i = 0; i < num_lanes; ++i) {
+ if (dst[i])
+ delete[] reinterpret_cast<uint8_t *>(dst[i]);
+ }
+ break;
+ }
+ case RPC_MALLOC: {
+ port->recv_and_send([&](rpc::Buffer *buffer, uint32_t) {
+ buffer->data[0] = reinterpret_cast<uintptr_t>(alloc(buffer->data[0]));
+ });
+ break;
+ }
+ case RPC_FREE: {
+ port->recv([&](rpc::Buffer *buffer, uint32_t) {
+ free(reinterpret_cast<void *>(buffer->data[0]));
+ });
+ break;
+ }
+ default:
+ break;
+ }
+
+ // Handle all of the `libc` specific opcodes.
+ int status = libc_handle_rpc_port(&*port, num_lanes);
+ if (status != rpc::SUCCESS)
+ handle_error("Error handling RPC server");
+
+ port->close();
+
+ return index;
}
#endif
diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
index cb81a866622f937..4849022e89d3306 100644
--- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp
@@ -160,7 +160,7 @@ template <typename args_t>
hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_amd_memory_pool_t kernargs_pool,
hsa_amd_memory_pool_t coarsegrained_pool,
- hsa_queue_t *queue, rpc_device_t device,
+ hsa_queue_t *queue, rpc::Server &server,
const LaunchParameters &params,
const char *kernel_name, args_t kernel_args,
bool print_resource_usage) {
@@ -170,37 +170,10 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
executable, kernel_name, &dev_agent, &symbol))
return err;
- // Register RPC callbacks for the malloc and free functions on HSA.
- auto tuple = std::make_tuple(dev_agent, coarsegrained_pool);
- rpc_register_callback(
- device, RPC_MALLOC,
- [](rpc_port_t port, void *data) {
- auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
- auto &[dev_agent, pool] = *static_cast<decltype(tuple) *>(data);
- uint64_t size = buffer->data[0];
- void *dev_ptr = nullptr;
- if (hsa_status_t err =
- hsa_amd_memory_pool_allocate(pool, size,
- /*flags=*/0, &dev_ptr))
- dev_ptr = nullptr;
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
- buffer->data[0] = reinterpret_cast<uintptr_t>(dev_ptr);
- };
- rpc_recv_and_send(port, malloc_handler, data);
- },
- &tuple);
- rpc_register_callback(
- device, RPC_FREE,
- [](rpc_port_t port, void *data) {
- auto free_handler = [](rpc_buffer_t *buffer, void *) {
- if (hsa_status_t err = hsa_amd_memory_pool_free(
- reinterpret_cast<void *>(buffer->data[0])))
- handle_error(err);
- };
- rpc_recv_and_send(port, free_handler, data);
- },
- nullptr);
-
+ uint32_t wavefront_size = 0;
+ if (hsa_status_t err = hsa_agent_get_info(
+ dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
+ handle_error(err);
// Retrieve different properties of the kernel symbol used for launch.
uint64_t kernel;
uint32_t args_size;
@@ -292,14 +265,38 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
std::atomic<bool> finished = false;
- std::thread server(
- [](std::atomic<bool> *finished, rpc_device_t device) {
- while (!*finished) {
- if (rpc_status_t err = rpc_handle_server(device))
+ std::thread server_thread(
+ [](std::atomic<bool> *finished, rpc::Server *server,
+ uint32_t wavefront_size, hsa_agent_t dev_agent,
+ hsa_amd_memory_pool_t coarsegrained_pool) {
+ // Register RPC callbacks for the malloc and free functions on HSA.
+ auto malloc_handler = [&](size_t size) -> void * {
+ void *dev_ptr = nullptr;
+ if (hsa_status_t err =
+ hsa_amd_memory_pool_allocate(coarsegrained_pool, size,
+ /*flags=*/0, &dev_ptr))
+ dev_ptr = nullptr;
+ hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
+ return dev_ptr;
+ };
+
+ auto free_handler = [](void *ptr) -> void {
+ if (hsa_status_t err =
+ hsa_amd_memory_pool_free(reinterpret_cast<void *>(ptr)))
handle_error(err);
+ };
+
+ uint32_t index = 0;
+ while (!*finished) {
+ if (wavefront_size == 32)
+ index =
+ handle_server<32>(*server, index, malloc_handler, free_handler);
+ else
+ index =
+ handle_server<64>(*server, index, malloc_handler, free_handler);
}
},
- &finished, device);
+ &finished, &server, wavefront_size, dev_agent, coarsegrained_pool);
// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
@@ -309,8 +306,8 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
;
finished = true;
- if (server.joinable())
- server.join();
+ if (server_thread.joinable())
+ server_thread.join();
// Destroy the resources acquired to launch the kernel and return.
if (hsa_status_t err = hsa_amd_memory_pool_free(args))
@@ -452,34 +449,22 @@ int load(int argc, const char **argv, const char **envp, void *image,
handle_error(err);
// Set up the RPC server.
- auto tuple = std::make_tuple(dev_agent, finegrained_pool);
- auto rpc_alloc = [](uint64_t size, void *data) {
- auto &[dev_agent, finegrained_pool] = *static_cast<decltype(tuple) *>(data);
- void *dev_ptr = nullptr;
- if (hsa_status_t err = hsa_amd_memory_pool_allocate(finegrained_pool, size,
- /*flags=*/0, &dev_ptr))
- handle_error(err);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, dev_ptr);
- return dev_ptr;
- };
- rpc_device_t device;
- if (rpc_status_t err = rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
- wavefront_size, rpc_alloc, &tuple))
+ void *rpc_buffer;
+ if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+ finegrained_pool,
+ rpc::Server::allocation_size(wavefront_size, rpc::MAX_PORT_COUNT),
+ /*flags=*/0, &rpc_buffer))
handle_error(err);
+ hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_buffer);
- // Register callbacks for the RPC unit tests.
- if (wavefront_size == 32)
- register_rpc_callbacks<32>(device);
- else if (wavefront_size == 64)
- register_rpc_callbacks<64>(device);
- else
- handle_error("Invalid wavefront size");
+ rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+ rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
// Initialize the RPC client on the device by copying the local data to the
// device's internal pointer.
hsa_executable_symbol_t rpc_client_sym;
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
- executable, rpc_client_symbol_name, &dev_agent, &rpc_client_sym))
+ executable, "__llvm_libc_rpc_client", &dev_agent, &rpc_client_sym))
handle_error(err);
void *rpc_client_host;
@@ -502,19 +487,17 @@ int load(int argc, const char **argv, const char **envp, void *image,
void *rpc_client_buffer;
if (hsa_status_t err =
- hsa_amd_memory_lock(const_cast<void *>(rpc_get_client_buffer(device)),
- rpc_get_client_size(),
+ hsa_amd_memory_lock(&client, sizeof(rpc::Client),
/*agents=*/nullptr, 0, &rpc_client_buffer))
handle_error(err);
// Copy the RPC client buffer to the address pointed to by the symbol.
if (hsa_status_t err =
hsa_memcpy(*reinterpret_cast<void **>(rpc_client_host), dev_agent,
- rpc_client_buffer, host_agent, rpc_get_client_size()))
+ rpc_client_buffer, host_agent, sizeof(rpc::Client)))
handle_error(err);
- if (hsa_status_t err = hsa_amd_memory_unlock(
- const_cast<void *>(rpc_get_client_buffer(device))))
+ if (hsa_status_t err = hsa_amd_memory_unlock(&client))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_client_host))
handle_error(err);
@@ -566,7 +549,7 @@ int load(int argc, const char **argv, const char **envp, void *image,
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp};
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
- coarsegrained_pool, queue, device,
+ coarsegrained_pool, queue, server,
single_threaded_params, "_begin.kd",
init_args, print_resource_usage))
handle_error(err);
@@ -574,7 +557,7 @@ int load(int argc, const char **argv, const char **envp, void *image,
start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
if (hsa_status_t err = launch_kernel(
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
- device, params, "_start.kd", args, print_resource_usage))
+ server, params, "_start.kd", args, print_resource_usage))
handle_error(err);
void *host_ret;
@@ -593,14 +576,12 @@ int load(int argc, const char **argv, const char **envp, void *image,
end_args_t fini_args = {ret};
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
- coarsegrained_pool, queue, device,
+ coarsegrained_pool, queue, server,
single_threaded_params, "_end.kd",
fini_args, print_resource_usage))
handle_error(err);
- if (rpc_status_t err = rpc_server_shutdown(
- device, [](void *ptr, void *) { hsa_amd_memory_pool_free(ptr); },
- nullptr))
+ if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))
handle_error(err);
// Free the memory allocated for the device.
diff --git a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
index 58e5e5f04d0a709..0ba217451feaea9 100644
--- a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp
@@ -167,10 +167,9 @@ void print_kernel_resources(CUmodule binary, const char *kernel_name) {
}
template <typename args_t>
-CUresult launch_kernel(CUmodule binary, CUstream stream,
- rpc_device_t rpc_device, const LaunchParameters &params,
- const char *kernel_name, args_t kernel_args,
- bool print_resource_usage) {
+CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server,
+ const LaunchParameters &params, const char *kernel_name,
+ args_t kernel_args, bool print_resource_usage) {
// look up the '_start' kernel in the loaded module.
CUfunction function;
if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
@@ -181,23 +180,21 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END};
+ if (print_resource_usage)
+ print_kernel_resources(binary, kernel_name);
- // Initialize a non-blocking CUDA stream to allocate memory if needed. This
- // needs to be done on a separate stream or else it will deadlock with the
- // executing kernel.
+ // Initialize a non-blocking CUDA stream to allocate memory if needed.
+ // This needs to be done on a separate stream or else it will deadlock
+ // with the executing kernel.
CUstream memory_stream;
if (CUresult err = cuStreamCreate(&memory_stream, CU_STREAM_NON_BLOCKING))
handle_error(err);
- // Register RPC callbacks for the malloc and free functions on HSA.
- register_rpc_callbacks<32>(rpc_device);
-
- rpc_register_callback(
- rpc_device, RPC_MALLOC,
- [](rpc_port_t port, void *data) {
- auto malloc_handler = [](rpc_buffer_t *buffer, void *data) -> void {
- CUstream memory_stream = *static_cast<CUstream *>(data);
- uint64_t size = buffer->data[0];
+ std::atomic<bool> finished = false;
+ std::thread server_thread(
+ [](std::atomic<bool> *finished, rpc::Server *server,
+ CUstream memory_stream) {
+ auto malloc_handler = [&](size_t size) -> void * {
CUdeviceptr dev_ptr;
if (CUresult err = cuMemAllocAsync(&dev_ptr, size, memory_stream))
dev_ptr = 0UL;
@@ -205,36 +202,22 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
// Wait until the memory allocation is complete.
while (cuStreamQuery(memory_stream) == CUDA_ERROR_NOT_READY)
;
- buffer->data[0] = static_cast<uintptr_t>(dev_ptr);
+ return reinterpret_cast<void *>(dev_ptr);
};
- rpc_recv_and_send(port, malloc_handler, data);
- },
- &memory_stream);
- rpc_register_callback(
- rpc_device, RPC_FREE,
- [](rpc_port_t port, void *data) {
- auto free_handler = [](rpc_buffer_t *buffer, void *data) {
- CUstream memory_stream = *static_cast<CUstream *>(data);
- if (CUresult err = cuMemFreeAsync(
- static_cast<CUdeviceptr>(buffer->data[0]), memory_stream))
+
+ auto free_handler = [&](void *ptr) -> void {
+ if (CUresult err = cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(ptr),
+ memory_stream))
handle_error(err);
};
- rpc_recv_and_send(port, free_handler, data);
- },
- &memory_stream);
- if (print_resource_usage)
- print_kernel_resources(binary, kernel_name);
-
- std::atomic<bool> finished = false;
- std::thread server(
- [](std::atomic<bool> *finished, rpc_device_t device) {
+ uint32_t index = 0;
while (!*finished) {
- if (rpc_status_t err = rpc_handle_server(device))
- handle_error(err);
+ index =
+ handle_server<32>(*server, index, malloc_handler, free_handler);
}
},
- &finished, rpc_device);
+ &finished, &server, memory_stream);
// Call the kernel with the given arguments.
if (CUresult err = cuLaunchKernel(
@@ -247,8 +230,8 @@ CUresult launch_kernel(CUmodule binary, CUstream stream,
handle_error(err);
finished = true;
- if (server.joinable())
- server.join();
+ if (server_thread.joinable())
+ server_thread.join();
return CUDA_SUCCESS;
}
@@ -318,23 +301,20 @@ int load(int argc, const char **argv, const char **envp, void *image,
handle_error(err);
uint32_t warp_size = 32;
- auto rpc_alloc = [](uint64_t size, void *) -> void * {
- void *dev_ptr;
- if (CUresult err = cuMemAllocHost(&dev_ptr, size))
- handle_error(err);
- return dev_ptr;
- };
- rpc_device_t rpc_device;
- if (rpc_status_t err = rpc_server_init(&rpc_device, RPC_MAXIMUM_PORT_COUNT,
- warp_size, rpc_alloc, nullptr))
+ void *rpc_buffer = nullptr;
+ if (CUresult err = cuMemAllocHost(
+ &rpc_buffer,
+ rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
handle_error(err);
+ rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+ rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
// Initialize the RPC client on the device by copying the local data to the
// device's internal pointer.
CUdeviceptr rpc_client_dev = 0;
uint64_t client_ptr_size = sizeof(void *);
if (CUresult err = cuModuleGetGlobal(&rpc_client_dev, &client_ptr_size,
- binary, rpc_client_symbol_name))
+ binary, "__llvm_libc_rpc_client"))
handle_error(err);
CUdeviceptr rpc_client_host = 0;
@@ -342,20 +322,19 @@ int load(int argc, const char **argv, const char **envp, void *image,
cuMemcpyDtoH(&rpc_client_host, rpc_client_dev, sizeof(void *)))
handle_error(err);
if (CUresult err =
- cuMemcpyHtoD(rpc_client_host, rpc_get_client_buffer(rpc_device),
- rpc_get_client_size()))
+ cuMemcpyHtoD(rpc_client_host, &client, sizeof(rpc::Client)))
handle_error(err);
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
begin_args_t init_args = {argc, dev_argv, dev_envp};
if (CUresult err =
- launch_kernel(binary, stream, rpc_device, single_threaded_params,
+ launch_kernel(binary, stream, server, single_threaded_params,
"_begin", init_args, print_resource_usage))
handle_error(err);
start_args_t args = {argc, dev_argv, dev_envp,
reinterpret_cast<void *>(dev_ret)};
- if (CUresult err = launch_kernel(binary, stream, rpc_device, params, "_start",
+ if (CUresult err = launch_kernel(binary, stream, server, params, "_start",
args, print_resource_usage))
handle_error(err);
@@ -369,8 +348,8 @@ int load(int argc, const char **argv, const char **envp, void *image,
end_args_t fini_args = {host_ret};
if (CUresult err =
- launch_kernel(binary, stream, rpc_device, single_threaded_params,
- "_end", fini_args, print_resource_usage))
+ launch_kernel(binary, stream, server, single_threaded_params, "_end",
+ fini_args, print_resource_usage))
handle_error(err);
// Free the memory allocated for the device.
@@ -380,8 +359,7 @@ int load(int argc, const char **argv, const char **envp, void *image,
handle_error(err);
if (CUresult err = cuMemFreeHost(dev_argv))
handle_error(err);
- if (rpc_status_t err = rpc_server_shutdown(
- rpc_device, [](void *ptr, void *) { cuMemFreeHost(ptr); }, nullptr))
+ if (CUresult err = cuMemFreeHost(rpc_buffer))
handle_error(err);
// Destroy the context and the loaded binary.
diff --git a/libc/utils/gpu/server/llvmlibc_rpc_server.h b/libc/utils/gpu/server/llvmlibc_rpc_server.h
index 98df882afa21cf6..b7f173734345c0a 100644
--- a/libc/utils/gpu/server/llvmlibc_rpc_server.h
+++ b/libc/utils/gpu/server/llvmlibc_rpc_server.h
@@ -15,99 +15,7 @@
extern "C" {
#endif
-/// The maximum number of ports that can be opened for any server.
-const uint64_t RPC_MAXIMUM_PORT_COUNT = 4096;
-
-/// The symbol name associated with the client for use with the LLVM C library
-/// implementation.
-const char *const rpc_client_symbol_name = "__llvm_libc_rpc_client";
-
-/// status codes.
-typedef enum {
- RPC_STATUS_SUCCESS = 0x0,
- RPC_STATUS_CONTINUE = 0x1,
- RPC_STATUS_ERROR = 0x1000,
- RPC_STATUS_UNHANDLED_OPCODE = 0x1001,
- RPC_STATUS_INVALID_LANE_SIZE = 0x1002,
-} rpc_status_t;
-
-/// A struct containing an opaque handle to an RPC port. This is what allows the
-/// server to communicate with the client.
-typedef struct rpc_port_s {
- uint64_t handle;
- uint32_t lane_size;
-} rpc_port_t;
-
-/// A fixed-size buffer containing the payload sent from the client.
-typedef struct rpc_buffer_s {
- uint64_t data[8];
-} rpc_buffer_t;
-
-/// An opaque handle to an RPC server that can be attached to a device.
-typedef struct rpc_device_s {
- uintptr_t handle;
-} rpc_device_t;
-
-/// A function used to allocate \p bytes for use by the RPC server and client.
-/// The memory should support asynchronous and atomic access from both the
-/// client and server.
-typedef void *(*rpc_alloc_ty)(uint64_t size, void *data);
-
-/// A function used to free the \p ptr previously allocated.
-typedef void (*rpc_free_ty)(void *ptr, void *data);
-
-/// A callback function provided with a \p port to communicate with the RPC
-/// client. This will be called by the server to handle an opcode.
-typedef void (*rpc_opcode_callback_ty)(rpc_port_t port, void *data);
-
-/// A callback function to use the port to receive or send a \p buffer.
-typedef void (*rpc_port_callback_ty)(rpc_buffer_t *buffer, void *data);
-
-/// Initialize the server for a given device and return it in \p device.
-rpc_status_t rpc_server_init(rpc_device_t *rpc_device, uint64_t num_ports,
- uint32_t lane_size, rpc_alloc_ty alloc,
- void *data);
-
-/// Shut down the server for a given device.
-rpc_status_t rpc_server_shutdown(rpc_device_t rpc_device, rpc_free_ty dealloc,
- void *data);
-
-/// Queries the RPC clients at least once and performs server-side work if there
-/// are any active requests. Runs until all work on the server is completed.
-rpc_status_t rpc_handle_server(rpc_device_t rpc_device);
-
-/// Register a callback to handle an opcode from the RPC client. The associated
-/// data must remain accessible as long as the user intends to handle the server
-/// with this callback.
-rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint32_t opcode,
- rpc_opcode_callback_ty callback, void *data);
-
-/// Obtain a pointer to a local client buffer that can be copied directly to the
-/// other process using the address stored at the rpc client symbol name.
-const void *rpc_get_client_buffer(rpc_device_t device);
-
-/// Returns the size of the client in bytes to be used for a memory copy.
-uint64_t rpc_get_client_size();
-
-/// Use the \p port to send a buffer using the \p callback.
-void rpc_send(rpc_port_t port, rpc_port_callback_ty callback, void *data);
-
-/// Use the \p port to send \p bytes using the \p callback. The input is an
-/// array of at least the configured lane size.
-void rpc_send_n(rpc_port_t port, const void *const *src, uint64_t *size);
-
-/// Use the \p port to recieve a buffer using the \p callback.
-void rpc_recv(rpc_port_t port, rpc_port_callback_ty callback, void *data);
-
-/// Use the \p port to recieve \p bytes using the \p callback. The inputs is an
-/// array of at least the configured lane size. The \p alloc function allocates
-/// memory for the recieved bytes.
-void rpc_recv_n(rpc_port_t port, void **dst, uint64_t *size, rpc_alloc_ty alloc,
- void *data);
-
-/// Use the \p port to receive and send a buffer using the \p callback.
-void rpc_recv_and_send(rpc_port_t port, rpc_port_callback_ty callback,
- void *data);
+int libc_handle_rpc_port(void *port, uint32_t num_lanes);
#ifdef __cplusplus
}
diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp
index d877cbc25a13d01..1fdbb79df7e3e02 100644
--- a/libc/utils/gpu/server/rpc_server.cpp
+++ b/libc/utils/gpu/server/rpc_server.cpp
@@ -37,12 +37,6 @@
using namespace LIBC_NAMESPACE;
using namespace LIBC_NAMESPACE::printf_core;
-static_assert(sizeof(rpc_buffer_t) == sizeof(rpc::Buffer),
- "Buffer size mismatch");
-
-static_assert(RPC_MAXIMUM_PORT_COUNT == rpc::MAX_PORT_COUNT,
- "Incorrect maximum port count");
-
namespace {
struct TempStorage {
char *alloc(size_t size) {
@@ -74,9 +68,9 @@ LIBC_INLINE ::FILE *to_stream(uintptr_t f) {
return stream;
}
-template <bool packed, uint32_t lane_size>
+template <bool packed, uint32_t num_lanes>
static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
- FILE *files[lane_size] = {nullptr};
+ FILE *files[num_lanes] = {nullptr};
// Get the appropriate output stream to use.
if (port.get_opcode() == RPC_PRINTF_TO_STREAM ||
port.get_opcode() == RPC_PRINTF_TO_STREAM_PACKED)
@@ -85,22 +79,22 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
});
else if (port.get_opcode() == RPC_PRINTF_TO_STDOUT ||
port.get_opcode() == RPC_PRINTF_TO_STDOUT_PACKED)
- std::fill(files, files + lane_size, stdout);
+ std::fill(files, files + num_lanes, stdout);
else
- std::fill(files, files + lane_size, stderr);
+ std::fill(files, files + num_lanes, stderr);
- uint64_t format_sizes[lane_size] = {0};
- void *format[lane_size] = {nullptr};
+ uint64_t format_sizes[num_lanes] = {0};
+ void *format[num_lanes] = {nullptr};
- uint64_t args_sizes[lane_size] = {0};
- void *args[lane_size] = {nullptr};
+ uint64_t args_sizes[num_lanes] = {0};
+ void *args[num_lanes] = {nullptr};
// Recieve the format string and arguments from the client.
port.recv_n(format, format_sizes,
[&](uint64_t size) { return temp_storage.alloc(size); });
// Parse the format string to get the expected size of the buffer.
- for (uint32_t lane = 0; lane < lane_size; ++lane) {
+ for (uint32_t lane = 0; lane < num_lanes; ++lane) {
if (!format[lane])
continue;
@@ -125,9 +119,9 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
// Identify any arguments that are actually pointers to strings on the client.
// Additionally we want to determine how much buffer space we need to print.
- std::vector<void *> strs_to_copy[lane_size];
- int buffer_size[lane_size] = {0};
- for (uint32_t lane = 0; lane < lane_size; ++lane) {
+ std::vector<void *> strs_to_copy[num_lanes];
+ int buffer_size[num_lanes] = {0};
+ for (uint32_t lane = 0; lane < num_lanes; ++lane) {
if (!format[lane])
continue;
@@ -159,7 +153,7 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
}
// Recieve any strings from the client and push them into a buffer.
- std::vector<void *> copied_strs[lane_size];
+ std::vector<void *> copied_strs[num_lanes];
while (std::any_of(std::begin(strs_to_copy), std::end(strs_to_copy),
[](const auto &v) { return !v.empty() && v.back(); })) {
port.send([&](rpc::Buffer *buffer, uint32_t id) {
@@ -168,11 +162,11 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
if (!strs_to_copy[id].empty())
strs_to_copy[id].pop_back();
});
- uint64_t str_sizes[lane_size] = {0};
- void *strs[lane_size] = {nullptr};
+ uint64_t str_sizes[num_lanes] = {0};
+ void *strs[num_lanes] = {nullptr};
port.recv_n(strs, str_sizes,
[&](uint64_t size) { return temp_storage.alloc(size); });
- for (uint32_t lane = 0; lane < lane_size; ++lane) {
+ for (uint32_t lane = 0; lane < num_lanes; ++lane) {
if (!strs[lane])
continue;
@@ -182,8 +176,8 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
}
// Perform the final formatting and printing using the LLVM C library printf.
- int results[lane_size] = {0};
- for (uint32_t lane = 0; lane < lane_size; ++lane) {
+ int results[num_lanes] = {0};
+ for (uint32_t lane = 0; lane < num_lanes; ++lane) {
if (!format[lane])
continue;
@@ -233,42 +227,34 @@ static void handle_printf(rpc::Server::Port &port, TempStorage &temp_storage) {
});
}
-template <uint32_t lane_size>
-rpc_status_t handle_server_impl(
- rpc::Server &server,
- const std::unordered_map<uint32_t, rpc_opcode_callback_ty> &callbacks,
- const std::unordered_map<uint32_t, void *> &callback_data,
- uint32_t &index) {
- auto port = server.try_open(lane_size, index);
- if (!port)
- return RPC_STATUS_SUCCESS;
-
+template <uint32_t num_lanes>
+rpc::Status handle_port_impl(rpc::Server::Port &port) {
TempStorage temp_storage;
- switch (port->get_opcode()) {
+ switch (port.get_opcode()) {
case RPC_WRITE_TO_STREAM:
case RPC_WRITE_TO_STDERR:
case RPC_WRITE_TO_STDOUT:
case RPC_WRITE_TO_STDOUT_NEWLINE: {
- uint64_t sizes[lane_size] = {0};
- void *strs[lane_size] = {nullptr};
- FILE *files[lane_size] = {nullptr};
- if (port->get_opcode() == RPC_WRITE_TO_STREAM) {
- port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+ uint64_t sizes[num_lanes] = {0};
+ void *strs[num_lanes] = {nullptr};
+ FILE *files[num_lanes] = {nullptr};
+ if (port.get_opcode() == RPC_WRITE_TO_STREAM) {
+ port.recv([&](rpc::Buffer *buffer, uint32_t id) {
files[id] = reinterpret_cast<FILE *>(buffer->data[0]);
});
- } else if (port->get_opcode() == RPC_WRITE_TO_STDERR) {
- std::fill(files, files + lane_size, stderr);
+ } else if (port.get_opcode() == RPC_WRITE_TO_STDERR) {
+ std::fill(files, files + num_lanes, stderr);
} else {
- std::fill(files, files + lane_size, stdout);
+ std::fill(files, files + num_lanes, stdout);
}
- port->recv_n(strs, sizes,
- [&](uint64_t size) { return temp_storage.alloc(size); });
- port->send([&](rpc::Buffer *buffer, uint32_t id) {
+ port.recv_n(strs, sizes,
+ [&](uint64_t size) { return temp_storage.alloc(size); });
+ port.send([&](rpc::Buffer *buffer, uint32_t id) {
flockfile(files[id]);
buffer->data[0] = fwrite_unlocked(strs[id], 1, sizes[id], files[id]);
- if (port->get_opcode() == RPC_WRITE_TO_STDOUT_NEWLINE &&
+ if (port.get_opcode() == RPC_WRITE_TO_STDOUT_NEWLINE &&
buffer->data[0] == sizes[id])
buffer->data[0] += fwrite_unlocked("\n", 1, 1, files[id]);
funlockfile(files[id]);
@@ -276,37 +262,37 @@ rpc_status_t handle_server_impl(
break;
}
case RPC_READ_FROM_STREAM: {
- uint64_t sizes[lane_size] = {0};
- void *data[lane_size] = {nullptr};
- port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+ uint64_t sizes[num_lanes] = {0};
+ void *data[num_lanes] = {nullptr};
+ port.recv([&](rpc::Buffer *buffer, uint32_t id) {
data[id] = temp_storage.alloc(buffer->data[0]);
sizes[id] =
fread(data[id], 1, buffer->data[0], to_stream(buffer->data[1]));
});
- port->send_n(data, sizes);
- port->send([&](rpc::Buffer *buffer, uint32_t id) {
+ port.send_n(data, sizes);
+ port.send([&](rpc::Buffer *buffer, uint32_t id) {
std::memcpy(buffer->data, &sizes[id], sizeof(uint64_t));
});
break;
}
case RPC_READ_FGETS: {
- uint64_t sizes[lane_size] = {0};
- void *data[lane_size] = {nullptr};
- port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+ uint64_t sizes[num_lanes] = {0};
+ void *data[num_lanes] = {nullptr};
+ port.recv([&](rpc::Buffer *buffer, uint32_t id) {
data[id] = temp_storage.alloc(buffer->data[0]);
const char *str = fgets(reinterpret_cast<char *>(data[id]),
buffer->data[0], to_stream(buffer->data[1]));
sizes[id] = !str ? 0 : std::strlen(str) + 1;
});
- port->send_n(data, sizes);
+ port.send_n(data, sizes);
break;
}
case RPC_OPEN_FILE: {
- uint64_t sizes[lane_size] = {0};
- void *paths[lane_size] = {nullptr};
- port->recv_n(paths, sizes,
- [&](uint64_t size) { return temp_storage.alloc(size); });
- port->recv_and_send([&](rpc::Buffer *buffer, uint32_t id) {
+ uint64_t sizes[num_lanes] = {0};
+ void *paths[num_lanes] = {nullptr};
+ port.recv_n(paths, sizes,
+ [&](uint64_t size) { return temp_storage.alloc(size); });
+ port.recv_and_send([&](rpc::Buffer *buffer, uint32_t id) {
FILE *file = fopen(reinterpret_cast<char *>(paths[id]),
reinterpret_cast<char *>(buffer->data));
buffer->data[0] = reinterpret_cast<uintptr_t>(file);
@@ -314,7 +300,7 @@ rpc_status_t handle_server_impl(
break;
}
case RPC_CLOSE_FILE: {
- port->recv_and_send([&](rpc::Buffer *buffer, uint32_t id) {
+ port.recv_and_send([&](rpc::Buffer *buffer, uint32_t id) {
FILE *file = reinterpret_cast<FILE *>(buffer->data[0]);
buffer->data[0] = fclose(file);
});
@@ -322,8 +308,8 @@ rpc_status_t handle_server_impl(
}
case RPC_EXIT: {
// Send a response to the client to signal that we are ready to exit.
- port->recv_and_send([](rpc::Buffer *, uint32_t) {});
- port->recv([](rpc::Buffer *buffer, uint32_t) {
+ port.recv_and_send([](rpc::Buffer *, uint32_t) {});
+ port.recv([](rpc::Buffer *buffer, uint32_t) {
int status = 0;
std::memcpy(&status, buffer->data, sizeof(int));
exit(status);
@@ -332,47 +318,47 @@ rpc_status_t handle_server_impl(
}
case RPC_ABORT: {
// Send a response to the client to signal that we are ready to abort.
- port->recv_and_send([](rpc::Buffer *, uint32_t) {});
- port->recv([](rpc::Buffer *, uint32_t) {});
+ port.recv_and_send([](rpc::Buffer *, uint32_t) {});
+ port.recv([](rpc::Buffer *, uint32_t) {});
abort();
break;
}
case RPC_HOST_CALL: {
- uint64_t sizes[lane_size] = {0};
- unsigned long long results[lane_size] = {0};
- void *args[lane_size] = {nullptr};
- port->recv_n(args, sizes,
- [&](uint64_t size) { return temp_storage.alloc(size); });
- port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+ uint64_t sizes[num_lanes] = {0};
+ unsigned long long results[num_lanes] = {0};
+ void *args[num_lanes] = {nullptr};
+ port.recv_n(args, sizes,
+ [&](uint64_t size) { return temp_storage.alloc(size); });
+ port.recv([&](rpc::Buffer *buffer, uint32_t id) {
using func_ptr_t = unsigned long long (*)(void *);
auto func = reinterpret_cast<func_ptr_t>(buffer->data[0]);
results[id] = func(args[id]);
});
- port->send([&](rpc::Buffer *buffer, uint32_t id) {
+ port.send([&](rpc::Buffer *buffer, uint32_t id) {
buffer->data[0] = static_cast<uint64_t>(results[id]);
});
break;
}
case RPC_FEOF: {
- port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+ port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
buffer->data[0] = feof(to_stream(buffer->data[0]));
});
break;
}
case RPC_FERROR: {
- port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+ port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
buffer->data[0] = ferror(to_stream(buffer->data[0]));
});
break;
}
case RPC_CLEARERR: {
- port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+ port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
clearerr(to_stream(buffer->data[0]));
});
break;
}
case RPC_FSEEK: {
- port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+ port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
buffer->data[0] =
fseek(to_stream(buffer->data[0]), static_cast<long>(buffer->data[1]),
static_cast<int>(buffer->data[2]));
@@ -380,19 +366,19 @@ rpc_status_t handle_server_impl(
break;
}
case RPC_FTELL: {
- port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+ port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
buffer->data[0] = ftell(to_stream(buffer->data[0]));
});
break;
}
case RPC_FFLUSH: {
- port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+ port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
buffer->data[0] = fflush(to_stream(buffer->data[0]));
});
break;
}
case RPC_UNGETC: {
- port->recv_and_send([](rpc::Buffer *buffer, uint32_t) {
+ port.recv_and_send([](rpc::Buffer *buffer, uint32_t) {
buffer->data[0] =
ungetc(static_cast<int>(buffer->data[0]), to_stream(buffer->data[1]));
});
@@ -401,36 +387,36 @@ rpc_status_t handle_server_impl(
case RPC_PRINTF_TO_STREAM_PACKED:
case RPC_PRINTF_TO_STDOUT_PACKED:
case RPC_PRINTF_TO_STDERR_PACKED: {
- handle_printf<true, lane_size>(*port, temp_storage);
+ handle_printf<true, num_lanes>(port, temp_storage);
break;
}
case RPC_PRINTF_TO_STREAM:
case RPC_PRINTF_TO_STDOUT:
case RPC_PRINTF_TO_STDERR: {
- handle_printf<false, lane_size>(*port, temp_storage);
+ handle_printf<false, num_lanes>(port, temp_storage);
break;
}
case RPC_REMOVE: {
- uint64_t sizes[lane_size] = {0};
- void *args[lane_size] = {nullptr};
- port->recv_n(args, sizes,
- [&](uint64_t size) { return temp_storage.alloc(size); });
- port->send([&](rpc::Buffer *buffer, uint32_t id) {
+ uint64_t sizes[num_lanes] = {0};
+ void *args[num_lanes] = {nullptr};
+ port.recv_n(args, sizes,
+ [&](uint64_t size) { return temp_storage.alloc(size); });
+ port.send([&](rpc::Buffer *buffer, uint32_t id) {
buffer->data[0] = static_cast<uint64_t>(
remove(reinterpret_cast<const char *>(args[id])));
});
break;
}
case RPC_RENAME: {
- uint64_t oldsizes[lane_size] = {0};
- uint64_t newsizes[lane_size] = {0};
- void *oldpath[lane_size] = {nullptr};
- void *newpath[lane_size] = {nullptr};
- port->recv_n(oldpath, oldsizes,
- [&](uint64_t size) { return temp_storage.alloc(size); });
- port->recv_n(newpath, newsizes,
- [&](uint64_t size) { return temp_storage.alloc(size); });
- port->send([&](rpc::Buffer *buffer, uint32_t id) {
+ uint64_t oldsizes[num_lanes] = {0};
+ uint64_t newsizes[num_lanes] = {0};
+ void *oldpath[num_lanes] = {nullptr};
+ void *newpath[num_lanes] = {nullptr};
+ port.recv_n(oldpath, oldsizes,
+ [&](uint64_t size) { return temp_storage.alloc(size); });
+ port.recv_n(newpath, newsizes,
+ [&](uint64_t size) { return temp_storage.alloc(size); });
+ port.send([&](rpc::Buffer *buffer, uint32_t id) {
buffer->data[0] = static_cast<uint64_t>(
rename(reinterpret_cast<const char *>(oldpath[id]),
reinterpret_cast<const char *>(newpath[id])));
@@ -438,168 +424,36 @@ rpc_status_t handle_server_impl(
break;
}
case RPC_SYSTEM: {
- uint64_t sizes[lane_size] = {0};
- void *args[lane_size] = {nullptr};
- port->recv_n(args, sizes,
- [&](uint64_t size) { return temp_storage.alloc(size); });
- port->send([&](rpc::Buffer *buffer, uint32_t id) {
+ uint64_t sizes[num_lanes] = {0};
+ void *args[num_lanes] = {nullptr};
+ port.recv_n(args, sizes,
+ [&](uint64_t size) { return temp_storage.alloc(size); });
+ port.send([&](rpc::Buffer *buffer, uint32_t id) {
buffer->data[0] = static_cast<uint64_t>(
system(reinterpret_cast<const char *>(args[id])));
});
break;
}
case RPC_NOOP: {
- port->recv([](rpc::Buffer *, uint32_t) {});
+ port.recv([](rpc::Buffer *, uint32_t) {});
break;
}
- default: {
- auto handler =
- callbacks.find(static_cast<rpc_opcode_t>(port->get_opcode()));
-
- // We error out on an unhandled opcode.
- if (handler == callbacks.end())
- return RPC_STATUS_UNHANDLED_OPCODE;
-
- // Invoke the registered callback with a reference to the port.
- void *data =
- callback_data.at(static_cast<rpc_opcode_t>(port->get_opcode()));
- rpc_port_t port_ref{reinterpret_cast<uint64_t>(&*port), lane_size};
- (handler->second)(port_ref, data);
+ default:
+ return rpc::UNHANDLED_OPCODE;
}
- }
-
- // Increment the index so we start the scan after this port.
- index = port->get_index() + 1;
- port->close();
- return RPC_STATUS_CONTINUE;
+ return rpc::SUCCESS;
}
-struct Device {
- Device(uint32_t lane_size, uint32_t num_ports, void *buffer)
- : lane_size(lane_size), buffer(buffer), server(num_ports, buffer),
- client(num_ports, buffer) {}
-
- rpc_status_t handle_server(uint32_t &index) {
- switch (lane_size) {
- case 1:
- return handle_server_impl<1>(server, callbacks, callback_data, index);
- case 32:
- return handle_server_impl<32>(server, callbacks, callback_data, index);
- case 64:
- return handle_server_impl<64>(server, callbacks, callback_data, index);
- default:
- return RPC_STATUS_INVALID_LANE_SIZE;
- }
+int libc_handle_rpc_port(void *port, uint32_t num_lanes) {
+ switch (num_lanes) {
+ case 1:
+ return handle_port_impl<1>(*reinterpret_cast<rpc::Server::Port *>(port));
+ case 32:
+ return handle_port_impl<32>(*reinterpret_cast<rpc::Server::Port *>(port));
+ case 64:
+ return handle_port_impl<64>(*reinterpret_cast<rpc::Server::Port *>(port));
+ default:
+ return rpc::ERROR;
}
-
- uint32_t lane_size;
- void *buffer;
- rpc::Server server;
- rpc::Client client;
- std::unordered_map<uint32_t, rpc_opcode_callback_ty> callbacks;
- std::unordered_map<uint32_t, void *> callback_data;
-};
-
-rpc_status_t rpc_server_init(rpc_device_t *rpc_device, uint64_t num_ports,
- uint32_t lane_size, rpc_alloc_ty alloc,
- void *data) {
- if (!rpc_device)
- return RPC_STATUS_ERROR;
- if (lane_size != 1 && lane_size != 32 && lane_size != 64)
- return RPC_STATUS_INVALID_LANE_SIZE;
-
- uint64_t size = rpc::Server::allocation_size(lane_size, num_ports);
- void *buffer = alloc(size, data);
-
- if (!buffer)
- return RPC_STATUS_ERROR;
-
- Device *device = new Device(lane_size, num_ports, buffer);
- if (!device)
- return RPC_STATUS_ERROR;
-
- rpc_device->handle = reinterpret_cast<uintptr_t>(device);
- return RPC_STATUS_SUCCESS;
-}
-
-rpc_status_t rpc_server_shutdown(rpc_device_t rpc_device, rpc_free_ty dealloc,
- void *data) {
- if (!rpc_device.handle)
- return RPC_STATUS_ERROR;
-
- Device *device = reinterpret_cast<Device *>(rpc_device.handle);
- dealloc(device->buffer, data);
- delete device;
-
- return RPC_STATUS_SUCCESS;
-}
-
-rpc_status_t rpc_handle_server(rpc_device_t rpc_device) {
- if (!rpc_device.handle)
- return RPC_STATUS_ERROR;
-
- Device *device = reinterpret_cast<Device *>(rpc_device.handle);
- uint32_t index = 0;
- for (;;) {
- rpc_status_t status = device->handle_server(index);
- if (status != RPC_STATUS_CONTINUE)
- return status;
- }
-}
-
-rpc_status_t rpc_register_callback(rpc_device_t rpc_device, uint32_t opcode,
- rpc_opcode_callback_ty callback,
- void *data) {
- if (!rpc_device.handle)
- return RPC_STATUS_ERROR;
-
- Device *device = reinterpret_cast<Device *>(rpc_device.handle);
-
- device->callbacks[opcode] = callback;
- device->callback_data[opcode] = data;
- return RPC_STATUS_SUCCESS;
-}
-
-const void *rpc_get_client_buffer(rpc_device_t rpc_device) {
- if (!rpc_device.handle)
- return nullptr;
- Device *device = reinterpret_cast<Device *>(rpc_device.handle);
- return &device->client;
-}
-
-uint64_t rpc_get_client_size() { return sizeof(rpc::Client); }
-
-void rpc_send(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
- auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
- port->send([=](rpc::Buffer *buffer, uint32_t) {
- callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
- });
-}
-
-void rpc_send_n(rpc_port_t ref, const void *const *src, uint64_t *size) {
- auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
- port->send_n(src, size);
-}
-
-void rpc_recv(rpc_port_t ref, rpc_port_callback_ty callback, void *data) {
- auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
- port->recv([=](rpc::Buffer *buffer, uint32_t) {
- callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
- });
-}
-
-void rpc_recv_n(rpc_port_t ref, void **dst, uint64_t *size, rpc_alloc_ty alloc,
- void *data) {
- auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
- auto alloc_fn = [=](uint64_t size) { return alloc(size, data); };
- port->recv_n(dst, size, alloc_fn);
-}
-
-void rpc_recv_and_send(rpc_port_t ref, rpc_port_callback_ty callback,
- void *data) {
- auto port = reinterpret_cast<rpc::Server::Port *>(ref.handle);
- port->recv_and_send([=](rpc::Buffer *buffer, uint32_t) {
- callback(reinterpret_cast<rpc_buffer_t *>(buffer), data);
- });
}
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index fde4b2f930349e8..3ed5c02ed4a3bbb 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -34,6 +34,7 @@ elseif(${LIBOMPTARGET_GPU_LIBC_SUPPORT})
# We may need to get the headers directly from the 'libc' source directory.
target_include_directories(PluginCommon PRIVATE
${CMAKE_SOURCE_DIR}/../libc/utils/gpu/server
+ ${CMAKE_SOURCE_DIR}/../libc/
${CMAKE_SOURCE_DIR}/../libc/include)
endif()
endif()
diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index 01bf539bcb3f32d..5b9b7ffd086b57f 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -61,7 +61,7 @@ struct RPCServerTy {
private:
/// Array from this device's identifier to its attached devices.
- llvm::SmallVector<uintptr_t> Handles;
+ llvm::SmallVector<void *> Buffers;
};
} // namespace llvm::omp::target
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index faa2cbd4f02fe15..48ed612cf853b6d 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -12,9 +12,11 @@
#include "PluginInterface.h"
+// TODO: This should be included unconditionally and cleaned up.
#if defined(LIBOMPTARGET_RPC_SUPPORT)
-#include "llvm-libc-types/rpc_opcodes_t.h"
+#include "include/llvm-libc-types/rpc_opcodes_t.h"
#include "llvmlibc_rpc_server.h"
+#include "shared/rpc.h"
#endif
using namespace llvm;
@@ -22,14 +24,14 @@ using namespace omp;
using namespace target;
RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
- : Handles(Plugin.getNumDevices()) {}
+ : Buffers(Plugin.getNumDevices()) {}
llvm::Expected<bool>
RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
plugin::GenericGlobalHandlerTy &Handler,
plugin::DeviceImageTy &Image) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
- return Handler.isSymbolInImage(Device, Image, rpc_client_symbol_name);
+ return Handler.isSymbolInImage(Device, Image, "__llvm_libc_rpc_client");
#else
return false;
#endif
@@ -39,59 +41,18 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
plugin::GenericGlobalHandlerTy &Handler,
plugin::DeviceImageTy &Image) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
- auto Alloc = [](uint64_t Size, void *Data) {
- plugin::GenericDeviceTy &Device =
- *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
- return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST);
- };
uint64_t NumPorts =
- std::min(Device.requestedRPCPortCount(), RPC_MAXIMUM_PORT_COUNT);
- rpc_device_t RPCDevice;
- if (rpc_status_t Err = rpc_server_init(&RPCDevice, NumPorts,
- Device.getWarpSize(), Alloc, &Device))
+ std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
+ void *RPCBuffer = Device.allocate(
+ rpc::Server::allocation_size(Device.getWarpSize(), NumPorts), nullptr,
+ TARGET_ALLOC_HOST);
+ if (!RPCBuffer)
return plugin::Plugin::error(
- "Failed to initialize RPC server for device %d: %d",
- Device.getDeviceId(), Err);
-
- // Register a custom opcode handler to perform plugin specific allocation.
- auto MallocHandler = [](rpc_port_t Port, void *Data) {
- rpc_recv_and_send(
- Port,
- [](rpc_buffer_t *Buffer, void *Data) {
- plugin::GenericDeviceTy &Device =
- *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
- Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
- Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
- },
- Data);
- };
- if (rpc_status_t Err =
- rpc_register_callback(RPCDevice, RPC_MALLOC, MallocHandler, &Device))
- return plugin::Plugin::error(
- "Failed to register RPC malloc handler for device %d: %d\n",
- Device.getDeviceId(), Err);
-
- // Register a custom opcode handler to perform plugin specific deallocation.
- auto FreeHandler = [](rpc_port_t Port, void *Data) {
- rpc_recv(
- Port,
- [](rpc_buffer_t *Buffer, void *Data) {
- plugin::GenericDeviceTy &Device =
- *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
- Device.free(reinterpret_cast<void *>(Buffer->data[0]),
- TARGET_ALLOC_DEVICE_NON_BLOCKING);
- },
- Data);
- };
- if (rpc_status_t Err =
- rpc_register_callback(RPCDevice, RPC_FREE, FreeHandler, &Device))
- return plugin::Plugin::error(
- "Failed to register RPC free handler for device %d: %d\n",
- Device.getDeviceId(), Err);
+ "Failed to initialize RPC server for device %d", Device.getDeviceId());
// Get the address of the RPC client from the device.
void *ClientPtr;
- plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));
+ plugin::GlobalTy ClientGlobal("__llvm_libc_rpc_client", sizeof(void *));
if (auto Err =
Handler.getGlobalMetadataFromDevice(Device, Image, ClientGlobal))
return Err;
@@ -100,38 +61,63 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
sizeof(void *), nullptr))
return Err;
- const void *ClientBuffer = rpc_get_client_buffer(RPCDevice);
- if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer,
- rpc_get_client_size(), nullptr))
+ rpc::Client client(NumPorts, RPCBuffer);
+ if (auto Err =
+ Device.dataSubmit(ClientPtr, &client, sizeof(rpc::Client), nullptr))
return Err;
- Handles[Device.getDeviceId()] = RPCDevice.handle;
+ Buffers[Device.getDeviceId()] = RPCBuffer;
+
+ return Error::success();
+
#endif
return Error::success();
}
Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
- rpc_device_t RPCDevice{Handles[Device.getDeviceId()]};
- if (rpc_status_t Err = rpc_handle_server(RPCDevice))
- return plugin::Plugin::error(
- "Error while running RPC server on device %d: %d", Device.getDeviceId(),
- Err);
+ uint64_t NumPorts =
+ std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
+ rpc::Server Server(NumPorts, Buffers[Device.getDeviceId()]);
+
+ auto port = Server.try_open(Device.getWarpSize());
+ if (!port)
+ return Error::success();
+
+ switch (port->get_opcode()) {
+ case RPC_MALLOC: {
+ port->recv_and_send([&](rpc::Buffer *Buffer, uint32_t) {
+ Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
+ Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
+ });
+ break;
+ }
+ case RPC_FREE: {
+ port->recv([&](rpc::Buffer *Buffer, uint32_t) {
+ Device.free(reinterpret_cast<void *>(Buffer->data[0]),
+ TARGET_ALLOC_DEVICE_NON_BLOCKING);
+ });
+ break;
+ }
+ default:
+ break;
+ }
+
+ // Let the `libc` library handle any unhandled opcodes.
+ int Status = libc_handle_rpc_port(&*port, Device.getWarpSize());
+ if (Status != rpc::SUCCESS)
+ return createStringError("RPC server given invalid opcode!");
+
+ port->close();
+
+ return Error::success();
#endif
return Error::success();
}
Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
- rpc_device_t RPCDevice{Handles[Device.getDeviceId()]};
- auto Dealloc = [](void *Ptr, void *Data) {
- plugin::GenericDeviceTy &Device =
- *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
- Device.free(Ptr, TARGET_ALLOC_HOST);
- };
- if (rpc_status_t Err = rpc_server_shutdown(RPCDevice, Dealloc, &Device))
- return plugin::Plugin::error(
- "Failed to shut down RPC server for device %d: %d",
- Device.getDeviceId(), Err);
+ Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST);
+ return Error::success();
#endif
return Error::success();
}
More information about the llvm-commits
mailing list