[libc-commits] [libc] 5f276d3 - [libc] Drop RPC port count and index to 32-bit numbers
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Tue Aug 15 09:11:20 PDT 2023
Author: Joseph Huber
Date: 2023-08-15T11:10:57-05:00
New Revision: 5f276d3d3360af72d4c2deafe2844f164cacbe7e
URL: https://github.com/llvm/llvm-project/commit/5f276d3d3360af72d4c2deafe2844f164cacbe7e
DIFF: https://github.com/llvm/llvm-project/commit/5f276d3d3360af72d4c2deafe2844f164cacbe7e.diff
LOG: [libc] Drop RPC port count and index to 32-bit numbers
The port count and the index into the ports were originally written as
64-bit numbers. This was done out of an abundance of caution; however,
it is highly unlikely that any configuration will exceed a 32-bit
count, as most machines require something in the low thousands. Because
GPUs are functionally 32-bit in many of their operations, the 64-bit
arithmetic costs us extra time and registers. Dropping these to 32-bit
saves us about four registers in most tests.
Reviewed By: JonChesterfield
Differential Revision: https://reviews.llvm.org/D157980
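For illustration, a minimal sketch of why the index width matters: on a
GPU whose general-purpose registers are 32 bits wide, a uint64_t loop
counter occupies a register pair and its increments and compares lower
to multi-instruction sequences, while a uint32_t counter fits in a
single register. The helper below only mirrors the shape of the linear
scan in Client::try_open(); its name and signature are hypothetical
stand-ins, not the rpc.h API.

#include <cstdint>

// Hypothetical port scan: with a uint64_t `index` a 32-bit GPU needs two
// registers plus an add-with-carry pair per increment; with uint32_t it
// is one register and a single add.
inline int32_t find_free_port(const uint32_t *locks, uint32_t port_count) {
  for (uint32_t index = 0; index < port_count; ++index) {
    if (locks[index] == 0) // Lock bit clear, so this port looks available.
      return static_cast<int32_t>(index);
  }
  return -1; // No free port; the caller would retry.
}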
Added:
Modified:
libc/src/__support/RPC/rpc.h
Removed:
################################################################################
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 2f42fb4ea40a16..33694cd7ceed8c 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -80,7 +80,7 @@ template <bool Invert, typename Packet> struct Process {
LIBC_INLINE Process &operator=(Process &&) = default;
LIBC_INLINE ~Process() = default;
- uint64_t port_count = 0;
+ uint32_t port_count = 0;
cpp::Atomic<uint32_t> *inbox = nullptr;
cpp::Atomic<uint32_t> *outbox = nullptr;
Packet *packet = nullptr;
@@ -89,7 +89,7 @@ template <bool Invert, typename Packet> struct Process {
cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
/// Initialize the communication channels.
- LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
+ LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
this->port_count = port_count;
this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
advance(buffer, inbox_offset(port_count)));
@@ -111,17 +111,17 @@ template <bool Invert, typename Packet> struct Process {
/// Atomic<uint32_t> secondary[port_count];
/// Packet buffer[port_count];
/// };
- LIBC_INLINE static constexpr uint64_t allocation_size(uint64_t port_count) {
+ LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count) {
return buffer_offset(port_count) + buffer_bytes(port_count);
}
/// Retrieve the inbox state from memory shared between processes.
- LIBC_INLINE uint32_t load_inbox(uint64_t index) {
+ LIBC_INLINE uint32_t load_inbox(uint32_t index) {
return inbox[index].load(cpp::MemoryOrder::RELAXED);
}
/// Retrieve the outbox state from memory shared between processes.
- LIBC_INLINE uint32_t load_outbox(uint64_t index) {
+ LIBC_INLINE uint32_t load_outbox(uint32_t index) {
return outbox[index].load(cpp::MemoryOrder::RELAXED);
}
@@ -129,7 +129,7 @@ template <bool Invert, typename Packet> struct Process {
/// Equivalent to loading outbox followed by store of the inverted value
/// The outbox is write only by this warp and tracking the value locally is
/// cheaper than calling load_outbox to get the value to store.
- LIBC_INLINE uint32_t invert_outbox(uint64_t index, uint32_t current_outbox) {
+ LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
uint32_t inverted_outbox = !current_outbox;
atomic_thread_fence(cpp::MemoryOrder::RELEASE);
outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED);
@@ -138,7 +138,7 @@ template <bool Invert, typename Packet> struct Process {
// Given the current outbox and inbox values, wait until the inbox changes
// to indicate that this thread owns the buffer element.
- LIBC_INLINE void wait_for_ownership(uint64_t index, uint32_t outbox,
+ LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox,
uint32_t in) {
while (buffer_unavailable(in, outbox)) {
sleep_briefly();
@@ -160,7 +160,7 @@ template <bool Invert, typename Packet> struct Process {
/// single lock on success, e.g. the result of gpu::get_lane_mask()
/// The lock is held when the n-th bit of the lock bitfield is set.
[[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
- uint64_t index) {
+ uint32_t index) {
// On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
// On volta, need to handle differences between the threads running and
// the threads that were detected in the previous call to get_lane_mask()
@@ -201,7 +201,7 @@ template <bool Invert, typename Packet> struct Process {
/// Unlock the lock at index. We need a lane sync to keep this function
/// convergent, otherwise the compiler will sink the store and deadlock.
[[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
- uint64_t index) {
+ uint32_t index) {
// Do not move any writes past the unlock
atomic_thread_fence(cpp::MemoryOrder::RELEASE);
@@ -217,35 +217,35 @@ template <bool Invert, typename Packet> struct Process {
}
/// Number of bytes to allocate for an inbox or outbox.
- LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint64_t port_count) {
+ LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
return port_count * sizeof(cpp::Atomic<uint32_t>);
}
/// Number of bytes to allocate for the buffer containing the packets.
- LIBC_INLINE static constexpr uint64_t buffer_bytes(uint64_t port_count) {
+ LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count) {
return port_count * sizeof(Packet);
}
/// Offset of the inbox in memory. This is the same as the outbox if inverted.
- LIBC_INLINE static constexpr uint64_t inbox_offset(uint64_t port_count) {
+ LIBC_INLINE static constexpr uint64_t inbox_offset(uint32_t port_count) {
return Invert ? mailbox_bytes(port_count) : 0;
}
/// Offset of the outbox in memory. This is the same as the inbox if inverted.
- LIBC_INLINE static constexpr uint64_t outbox_offset(uint64_t port_count) {
+ LIBC_INLINE static constexpr uint64_t outbox_offset(uint32_t port_count) {
return Invert ? 0 : mailbox_bytes(port_count);
}
/// Offset of the buffer containing the packets after the inbox and outbox.
- LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
+ LIBC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
}
/// Conditionally set the n-th bit in the atomic bitfield.
LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
- uint64_t index, bool cond) {
- uint64_t slot = index / NUM_BITS_IN_WORD;
- uint64_t bit = index % NUM_BITS_IN_WORD;
+ uint32_t index, bool cond) {
+ uint32_t slot = index / NUM_BITS_IN_WORD;
+ uint32_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
cpp::MemoryOrder::RELAXED) &
(1u << bit);
@@ -253,9 +253,9 @@ template <bool Invert, typename Packet> struct Process {
/// Conditionally clear the n-th bit in the atomic bitfield.
LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
- uint64_t index, bool cond) {
- uint64_t slot = index / NUM_BITS_IN_WORD;
- uint64_t bit = index % NUM_BITS_IN_WORD;
+ uint32_t index, bool cond) {
+ uint32_t slot = index / NUM_BITS_IN_WORD;
+ uint32_t bit = index % NUM_BITS_IN_WORD;
return bits[slot].fetch_and(~0u ^ (static_cast<uint32_t>(cond) << bit),
cpp::MemoryOrder::RELAXED) &
(1u << bit);
@@ -292,7 +292,7 @@ static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *, uint32_t)> fn,
/// processes. A port is conceptually an index into the memory provided by the
/// underlying process that is guarded by a lock bit.
template <bool T, typename S> struct Port {
- LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint64_t index,
+ LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint32_t index,
uint32_t out)
: process(process), lane_mask(lane_mask), index(index), out(out),
receive(false), owns_buffer(true) {}
@@ -334,7 +334,7 @@ template <bool T, typename S> struct Port {
private:
Process<T, S> &process;
uint64_t lane_mask;
- uint64_t index;
+ uint32_t index;
uint32_t out;
bool receive;
bool owns_buffer;
@@ -351,7 +351,7 @@ struct Client {
template <uint16_t opcode> LIBC_INLINE cpp::optional<Port> try_open();
template <uint16_t opcode> LIBC_INLINE Port open();
- LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
+ LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
process.reset(port_count, buffer);
}
@@ -374,7 +374,7 @@ template <uint32_t lane_size> struct Server {
LIBC_INLINE cpp::optional<Port> try_open();
LIBC_INLINE Port open();
- LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
+ LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
process.reset(port_count, buffer);
}
@@ -382,7 +382,7 @@ template <uint32_t lane_size> struct Server {
return process.get_buffer_start();
}
- LIBC_INLINE static uint64_t allocation_size(uint64_t port_count) {
+ LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
return Process<true, Packet<lane_size>>::allocation_size(port_count);
}
@@ -525,7 +525,7 @@ template <uint16_t opcode>
[[clang::convergent]] LIBC_INLINE cpp::optional<Client::Port>
Client::try_open() {
// Perform a naive linear scan for a port that can be opened to send data.
- for (uint64_t index = 0; index < process.port_count; ++index) {
+ for (uint32_t index = 0; index < process.port_count; ++index) {
// Attempt to acquire the lock on this index.
uint64_t lane_mask = gpu::get_lane_mask();
if (!process.try_lock(lane_mask, index))
@@ -566,7 +566,7 @@ template <uint32_t lane_size>
cpp::optional<typename Server<lane_size>::Port>
Server<lane_size>::try_open() {
// Perform a naive linear scan for a port that has a pending request.
- for (uint64_t index = 0; index < process.port_count; ++index) {
+ for (uint32_t index = 0; index < process.port_count; ++index) {
uint32_t in = process.load_inbox(index);
uint32_t out = process.load_outbox(index);
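For context on the size helpers above, a self-contained sketch of the
layout math with stand-in names (not the rpc.h API): the port indices
are now uint32_t, but byte counts such as allocation_size() stay
uint64_t on purpose, since port_count * sizeof(Packet) can exceed
32 bits for large packets.

#include <cstdint>

// Model of the shared-memory layout computed in rpc.h:
//   [ inbox words | outbox words | aligned packet buffer ]
constexpr uint64_t align_up(uint64_t x, uint64_t align) {
  return (x + align - 1) & ~(align - 1); // Assumes power-of-two align.
}

constexpr uint64_t allocation_size(uint32_t port_count, uint64_t packet_bytes,
                                   uint64_t packet_align) {
  // One 32-bit mailbox word per port, for each of the inbox and outbox.
  uint64_t mailbox_bytes = uint64_t(port_count) * sizeof(uint32_t);
  // The packet buffer starts after both mailboxes, aligned for the packets.
  uint64_t buffer_offset = align_up(2 * mailbox_bytes, packet_align);
  return buffer_offset + uint64_t(port_count) * packet_bytes;
}

// Example: 64 ports of 512-byte packets aligned to 8 bytes.
static_assert(allocation_size(64, 512, 8) == 33280, "layout sanity check");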