[libc-commits] [libc] 5470ea4 - [libc] Change the starting port index to use the SMID (#79200)

Tue Jan 30 11:07:01 PST 2024

Author: Joseph Huber
Date: 2024-01-30T13:06:58-06:00
New Revision: 5470ea4e36d47ed09595517854f0fa07ca91e16f

URL: https://github.com/llvm/llvm-project/commit/5470ea4e36d47ed09595517854f0fa07ca91e16f
DIFF: https://github.com/llvm/llvm-project/commit/5470ea4e36d47ed09595517854f0fa07ca91e16f.diff

LOG: [libc] Change the starting port index to use the SMID (#79200)

Summary:
The RPC interface uses several ports to provide parallel access. Right
now we begin the search at the beginning, which heavily contests the
early ports. Using the SMID allows us to stagger the starting index
based off of the cluster identifier that is executing the current warp.
Multiple warps can share an SM, but it will guaruntee that the
contention for the low indices is lower.

This also increases the maximum port size to around 4096, this is
because 512 isn't enough to cover the full hardare parallelism needed to
guarantee this doesdn't deadlock.

Added: 
    

Modified: 
    libc/src/__support/GPU/amdgpu/utils.h
    libc/src/__support/GPU/generic/utils.h
    libc/src/__support/GPU/nvptx/utils.h
    libc/src/__support/RPC/rpc.h
    libc/utils/gpu/server/rpc_server.h

Removed: 
    


################################################################################
diff  --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 9f0ff0c717a6..96e3efccb3b5 100644

--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -179,6 +179,14 @@ LIBC_INLINE uint64_t fixed_frequency_clock() {
 /// Terminates execution of the associated wavefront.
 [[noreturn]] LIBC_INLINE void end_program() { __builtin_amdgcn_endpgm(); }
 
+/// Returns a unique identifier for the process cluster the current wavefront is
+/// executing on. Here we use the identifier for the compute unit (CU) and
+/// shader engine.
+/// FIXME: Currently unimplemented on AMDGPU until we have a simpler interface
+/// than the one at
+/// https://github.com/ROCm/clr/blob/develop/hipamd/include/hip/amd_detail/amd_device_functions.h#L899
+LIBC_INLINE uint32_t get_cluster_id() { return 0; }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE
 

diff  --git a/libc/src/__support/GPU/generic/utils.h b/libc/src/__support/GPU/generic/utils.h
index b701db482bbe..00b59837ccc6 100644
--- a/libc/src/__support/GPU/generic/utils.h
+++ b/libc/src/__support/GPU/generic/utils.h
@@ -75,6 +75,8 @@ LIBC_INLINE uint64_t fixed_frequency_clock() { return 0; }
 
 [[noreturn]] LIBC_INLINE void end_program() { __builtin_unreachable(); }
 
+LIBC_INLINE uint32_t get_cluster_id() { return 0; }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE
 

diff  --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index 9fe3caa49147..e7e297adf7ec 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -150,6 +150,10 @@ LIBC_INLINE uint64_t fixed_frequency_clock() {
 /// Terminates execution of the calling thread.
 [[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); }
 
+/// Returns a unique identifier for the process cluster the current warp is
+/// executing on. Here we use the identifier for the symmetric multiprocessor.
+LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); }
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE
 

diff  --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 7b2c89ac4dce..7924d4cec2ac 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -57,7 +57,7 @@ template <uint32_t lane_size = gpu::LANE_SIZE> struct alignas(64) Packet {
 };
 
 /// The maximum number of parallel ports that the RPC interface can support.
-constexpr uint64_t MAX_PORT_COUNT = 512;
+constexpr uint64_t MAX_PORT_COUNT = 4096;
 
 /// A common process used to synchronize communication between a client and a
 /// server. The process contains a read-only inbox and a write-only outbox used
@@ -519,7 +519,7 @@ LIBC_INLINE void Port<T, S>::recv_n(void **dst, uint64_t *size, A &&alloc) {
 template <uint16_t opcode> LIBC_INLINE Client::Port Client::open() {
   // Repeatedly perform a naive linear scan for a port that can be opened to
   // send data.
-  for (uint32_t index = 0;; ++index) {
+  for (uint32_t index = gpu::get_cluster_id();; ++index) {
     // Start from the beginning if we run out of ports to check.
     if (index >= process.port_count)
       index = 0;

diff  --git a/libc/utils/gpu/server/rpc_server.h b/libc/utils/gpu/server/rpc_server.h
index a818aab4ced9..f1a8fe06281c 100644
--- a/libc/utils/gpu/server/rpc_server.h
+++ b/libc/utils/gpu/server/rpc_server.h
@@ -18,7 +18,7 @@ extern "C" {
 #endif
 
 /// The maximum number of ports that can be opened for any server.
-const uint64_t RPC_MAXIMUM_PORT_COUNT = 512;
+const uint64_t RPC_MAXIMUM_PORT_COUNT = 4096;
 
 /// The symbol name associated with the client for use with the LLVM C library
 /// implementation.