[libc-commits] [libc] 5f276d3 - [libc] Drop RPC port count and index to 32-bit numbers

Joseph Huber via libc-commits libc-commits at lists.llvm.org
Tue Aug 15 09:11:20 PDT 2023


Author: Joseph Huber
Date: 2023-08-15T11:10:57-05:00
New Revision: 5f276d3d3360af72d4c2deafe2844f164cacbe7e

URL: https://github.com/llvm/llvm-project/commit/5f276d3d3360af72d4c2deafe2844f164cacbe7e
DIFF: https://github.com/llvm/llvm-project/commit/5f276d3d3360af72d4c2deafe2844f164cacbe7e.diff

LOG: [libc] Drop RPC port count and index to 32-bit numbers

The port count and index into the ports was originally written as a
64-bit number. This was with an abundance of caution; however, it's
highly unlikely that any configuration will exceed a 32-bit number, as
most machines will require something in the low thousands. Because GPUs
are functionally 32-bit in many of their operations this costs us some
extra time and registers to do the 64-bit operations. Doing this saves
us about four registers in most tests.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D157980

Added: 
    

Modified: 
    libc/src/__support/RPC/rpc.h

Removed: 
    


################################################################################
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 2f42fb4ea40a16..33694cd7ceed8c 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -80,7 +80,7 @@ template <bool Invert, typename Packet> struct Process {
   LIBC_INLINE Process &operator=(Process &&) = default;
   LIBC_INLINE ~Process() = default;
 
-  uint64_t port_count = 0;
+  uint32_t port_count = 0;
   cpp::Atomic<uint32_t> *inbox = nullptr;
   cpp::Atomic<uint32_t> *outbox = nullptr;
   Packet *packet = nullptr;
@@ -89,7 +89,7 @@ template <bool Invert, typename Packet> struct Process {
   cpp::Atomic<uint32_t> lock[MAX_PORT_COUNT / NUM_BITS_IN_WORD] = {0};
 
   /// Initialize the communication channels.
-  LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
+  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
     this->port_count = port_count;
     this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
         advance(buffer, inbox_offset(port_count)));
@@ -111,17 +111,17 @@ template <bool Invert, typename Packet> struct Process {
   ///   Atomic<uint32_t> secondary[port_count];
   ///   Packet buffer[port_count];
   /// };
-  LIBC_INLINE static constexpr uint64_t allocation_size(uint64_t port_count) {
+  LIBC_INLINE static constexpr uint64_t allocation_size(uint32_t port_count) {
     return buffer_offset(port_count) + buffer_bytes(port_count);
   }
 
   /// Retrieve the inbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_inbox(uint64_t index) {
+  LIBC_INLINE uint32_t load_inbox(uint32_t index) {
     return inbox[index].load(cpp::MemoryOrder::RELAXED);
   }
 
   /// Retrieve the outbox state from memory shared between processes.
-  LIBC_INLINE uint32_t load_outbox(uint64_t index) {
+  LIBC_INLINE uint32_t load_outbox(uint32_t index) {
     return outbox[index].load(cpp::MemoryOrder::RELAXED);
   }
 
@@ -129,7 +129,7 @@ template <bool Invert, typename Packet> struct Process {
   /// Equivalent to loading outbox followed by store of the inverted value
   /// The outbox is write only by this warp and tracking the value locally is
   /// cheaper than calling load_outbox to get the value to store.
-  LIBC_INLINE uint32_t invert_outbox(uint64_t index, uint32_t current_outbox) {
+  LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
     uint32_t inverted_outbox = !current_outbox;
     atomic_thread_fence(cpp::MemoryOrder::RELEASE);
     outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED);
@@ -138,7 +138,7 @@ template <bool Invert, typename Packet> struct Process {
 
   // Given the current outbox and inbox values, wait until the inbox changes
   // to indicate that this thread owns the buffer element.
-  LIBC_INLINE void wait_for_ownership(uint64_t index, uint32_t outbox,
+  LIBC_INLINE void wait_for_ownership(uint32_t index, uint32_t outbox,
                                       uint32_t in) {
     while (buffer_unavailable(in, outbox)) {
       sleep_briefly();
@@ -160,7 +160,7 @@ template <bool Invert, typename Packet> struct Process {
   /// single lock on success, e.g. the result of gpu::get_lane_mask()
   /// The lock is held when the n-th bit of the lock bitfield is set.
   [[clang::convergent]] LIBC_INLINE bool try_lock(uint64_t lane_mask,
-                                                  uint64_t index) {
+                                                  uint32_t index) {
     // On amdgpu, test and set to the nth lock bit and a sync_lane would suffice
     // On volta, need to handle differences between the threads running and
     // the threads that were detected in the previous call to get_lane_mask()
@@ -201,7 +201,7 @@ template <bool Invert, typename Packet> struct Process {
   /// Unlock the lock at index. We need a lane sync to keep this function
   /// convergent, otherwise the compiler will sink the store and deadlock.
   [[clang::convergent]] LIBC_INLINE void unlock(uint64_t lane_mask,
-                                                uint64_t index) {
+                                                uint32_t index) {
     // Do not move any writes past the unlock
     atomic_thread_fence(cpp::MemoryOrder::RELEASE);
 
@@ -217,35 +217,35 @@ template <bool Invert, typename Packet> struct Process {
   }
 
   /// Number of bytes to allocate for an inbox or outbox.
-  LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint64_t port_count) {
+  LIBC_INLINE static constexpr uint64_t mailbox_bytes(uint32_t port_count) {
     return port_count * sizeof(cpp::Atomic<uint32_t>);
   }
 
   /// Number of bytes to allocate for the buffer containing the packets.
-  LIBC_INLINE static constexpr uint64_t buffer_bytes(uint64_t port_count) {
+  LIBC_INLINE static constexpr uint64_t buffer_bytes(uint32_t port_count) {
     return port_count * sizeof(Packet);
   }
 
   /// Offset of the inbox in memory. This is the same as the outbox if inverted.
-  LIBC_INLINE static constexpr uint64_t inbox_offset(uint64_t port_count) {
+  LIBC_INLINE static constexpr uint64_t inbox_offset(uint32_t port_count) {
     return Invert ? mailbox_bytes(port_count) : 0;
   }
 
   /// Offset of the outbox in memory. This is the same as the inbox if inverted.
-  LIBC_INLINE static constexpr uint64_t outbox_offset(uint64_t port_count) {
+  LIBC_INLINE static constexpr uint64_t outbox_offset(uint32_t port_count) {
     return Invert ? 0 : mailbox_bytes(port_count);
   }
 
   /// Offset of the buffer containing the packets after the inbox and outbox.
-  LIBC_INLINE static constexpr uint64_t buffer_offset(uint64_t port_count) {
+  LIBC_INLINE static constexpr uint64_t buffer_offset(uint32_t port_count) {
     return align_up(2 * mailbox_bytes(port_count), alignof(Packet));
   }
 
   /// Conditionally set the n-th bit in the atomic bitfield.
   LIBC_INLINE static constexpr uint32_t set_nth(cpp::Atomic<uint32_t> *bits,
-                                                uint64_t index, bool cond) {
-    uint64_t slot = index / NUM_BITS_IN_WORD;
-    uint64_t bit = index % NUM_BITS_IN_WORD;
+                                                uint32_t index, bool cond) {
+    uint32_t slot = index / NUM_BITS_IN_WORD;
+    uint32_t bit = index % NUM_BITS_IN_WORD;
     return bits[slot].fetch_or(static_cast<uint32_t>(cond) << bit,
                                cpp::MemoryOrder::RELAXED) &
            (1u << bit);
@@ -253,9 +253,9 @@ template <bool Invert, typename Packet> struct Process {
 
   /// Conditionally clear the n-th bit in the atomic bitfield.
   LIBC_INLINE static constexpr uint32_t clear_nth(cpp::Atomic<uint32_t> *bits,
-                                                  uint64_t index, bool cond) {
-    uint64_t slot = index / NUM_BITS_IN_WORD;
-    uint64_t bit = index % NUM_BITS_IN_WORD;
+                                                  uint32_t index, bool cond) {
+    uint32_t slot = index / NUM_BITS_IN_WORD;
+    uint32_t bit = index % NUM_BITS_IN_WORD;
     return bits[slot].fetch_and(~0u ^ (static_cast<uint32_t>(cond) << bit),
                                 cpp::MemoryOrder::RELAXED) &
            (1u << bit);
@@ -292,7 +292,7 @@ static LIBC_INLINE void invoke_rpc(cpp::function<void(Buffer *, uint32_t)> fn,
 /// processes. A port is conceptually an index into the memory provided by the
 /// underlying process that is guarded by a lock bit.
 template <bool T, typename S> struct Port {
-  LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint64_t index,
+  LIBC_INLINE Port(Process<T, S> &process, uint64_t lane_mask, uint32_t index,
                    uint32_t out)
       : process(process), lane_mask(lane_mask), index(index), out(out),
         receive(false), owns_buffer(true) {}
@@ -334,7 +334,7 @@ template <bool T, typename S> struct Port {
 private:
   Process<T, S> &process;
   uint64_t lane_mask;
-  uint64_t index;
+  uint32_t index;
   uint32_t out;
   bool receive;
   bool owns_buffer;
@@ -351,7 +351,7 @@ struct Client {
   template <uint16_t opcode> LIBC_INLINE cpp::optional<Port> try_open();
   template <uint16_t opcode> LIBC_INLINE Port open();
 
-  LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
+  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
     process.reset(port_count, buffer);
   }
 
@@ -374,7 +374,7 @@ template <uint32_t lane_size> struct Server {
   LIBC_INLINE cpp::optional<Port> try_open();
   LIBC_INLINE Port open();
 
-  LIBC_INLINE void reset(uint64_t port_count, void *buffer) {
+  LIBC_INLINE void reset(uint32_t port_count, void *buffer) {
     process.reset(port_count, buffer);
   }
 
@@ -382,7 +382,7 @@ template <uint32_t lane_size> struct Server {
     return process.get_buffer_start();
   }
 
-  LIBC_INLINE static uint64_t allocation_size(uint64_t port_count) {
+  LIBC_INLINE static uint64_t allocation_size(uint32_t port_count) {
     return Process<true, Packet<lane_size>>::allocation_size(port_count);
   }
 
@@ -525,7 +525,7 @@ template <uint16_t opcode>
 [[clang::convergent]] LIBC_INLINE cpp::optional<Client::Port>
 Client::try_open() {
   // Perform a naive linear scan for a port that can be opened to send data.
-  for (uint64_t index = 0; index < process.port_count; ++index) {
+  for (uint32_t index = 0; index < process.port_count; ++index) {
     // Attempt to acquire the lock on this index.
     uint64_t lane_mask = gpu::get_lane_mask();
     if (!process.try_lock(lane_mask, index))
@@ -566,7 +566,7 @@ template <uint32_t lane_size>
     cpp::optional<typename Server<lane_size>::Port>
     Server<lane_size>::try_open() {
   // Perform a naive linear scan for a port that has a pending request.
-  for (uint64_t index = 0; index < process.port_count; ++index) {
+  for (uint32_t index = 0; index < process.port_count; ++index) {
     uint32_t in = process.load_inbox(index);
     uint32_t out = process.load_outbox(index);
 


        


More information about the libc-commits mailing list