[libc-commits] [libc] e826762 - [libc] More efficiently send bytes via `send_n` and `recv_n`

Tue May 23 08:59:56 PDT 2023

Author: Joseph Huber
Date: 2023-05-23T10:59:47-05:00
New Revision: e826762a0826c11dc62696e46068c61c57a00aa9

URL: https://github.com/llvm/llvm-project/commit/e826762a0826c11dc62696e46068c61c57a00aa9
DIFF: https://github.com/llvm/llvm-project/commit/e826762a0826c11dc62696e46068c61c57a00aa9.diff

LOG: [libc] More efficiently send bytes via `send_n` and `recv_n`

Currently we have the `send_n` and `recv_n` routines to stream data,
such as a string to print, to the other side. The first operation is to
send the size so the other side knows the number of bytes to receive.
However, this wasted 56 bytes that could've been sent. This meant that
small values, like the arguments to a function to call on the host for
example, needed to perform an extra send. This patch sends the first 56
bytes in the first packet and continues if necessary.

Depends on D150992

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D151041

Added: 
    

Modified: 
    libc/src/__support/OSUtil/gpu/io.cpp
    libc/src/__support/RPC/rpc.h
    libc/src/__support/RPC/rpc_util.h
    libc/utils/gpu/loader/Server.h

Removed: 
    


################################################################################
diff  --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp
index 995a97389184d..0d8c9acc5211a 100644

--- a/libc/src/__support/OSUtil/gpu/io.cpp
+++ b/libc/src/__support/OSUtil/gpu/io.cpp
@@ -17,6 +17,7 @@ namespace __llvm_libc {
 void write_to_stderr(cpp::string_view msg) {
   rpc::Client::Port port = rpc::client.open<rpc::PRINT_TO_STDERR>();
   port.send_n(msg.data(), msg.size());
+  port.recv([](rpc::Buffer *) { /* void */ });
   port.close();
 }
 

diff  --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index bc8c05b04a22d..836cdbecf5394 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -417,44 +417,44 @@ LIBC_INLINE void Port<T>::recv_and_send(W work) {
   send([](Buffer *) { /* no-op */ });
 }
 
+/// Helper routine to simplify the interface when sending from the GPU using
+/// thread private pointers to the underlying value.
+template <bool T>
+LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
+  static_assert(is_process_gpu(), "Only valid when running on the GPU");
+  const void **src_ptr = &src;
+  uint64_t *size_ptr = &size;
+  send_n(src_ptr, size_ptr);
+}
+
 /// Sends an arbitrarily sized data buffer \p src across the shared channel in
 /// multiples of the packet length.
 template <bool T>
 LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
-  // TODO: We could send the first bytes in this call and potentially save an
-  // extra send operation.
   uint64_t num_sends = 0;
   send([&](Buffer *buffer, uint32_t id) {
     reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
     num_sends = is_process_gpu() ? lane_value(size, id)
                                  : max(lane_value(size, id), num_sends);
+    uint64_t len =
+        lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
+            ? sizeof(Buffer::data) - sizeof(uint64_t)
+            : lane_value(size, id);
+    inline_memcpy(&buffer->data[1], lane_value(src, id), len);
   });
-  uint64_t idx = 0;
-  uint64_t mask = process.get_packet(index).header.mask;
-  while (gpu::ballot(mask, idx < num_sends)) {
+  uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
+  while (gpu::ballot(process.get_packet(index).header.mask, idx < num_sends)) {
     send([=](Buffer *buffer, uint32_t id) {
-      const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
-                               ? sizeof(Buffer::data)
-                               : lane_value(size, id) - idx;
+      uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
+                         ? sizeof(Buffer::data)
+                         : lane_value(size, id) - idx;
       if (idx < lane_value(size, id))
-        inline_memcpy(
-            buffer->data,
-            reinterpret_cast<const uint8_t *>(lane_value(src, id)) + idx, len);
+        inline_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
     });
     idx += sizeof(Buffer::data);
   }
 }
 
-/// Helper routine to simplify the interface when sending from the GPU using
-/// thread private pointers to the underlying value.
-template <bool T>
-LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
-  static_assert(is_process_gpu(), "Only valid when running on the GPU");
-  const void **src_ptr = &src;
-  uint64_t *size_ptr = &size;
-  send_n(src_ptr, size_ptr);
-}
-
 /// Receives an arbitrarily sized data buffer across the shared channel in
 /// multiples of the packet length. The \p alloc function is called with the
 /// size of the data so that we can initialize the size of the \p dst buffer.
@@ -468,8 +468,13 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
         reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));
     num_recvs = is_process_gpu() ? lane_value(size, id)
                                  : max(lane_value(size, id), num_recvs);
+    uint64_t len =
+        lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
+            ? sizeof(Buffer::data) - sizeof(uint64_t)
+            : lane_value(size, id);
+    inline_memcpy(lane_value(dst, id), &buffer->data[1], len);
   });
-  uint64_t idx = 0;
+  uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
   uint64_t mask = process.get_packet(index).header.mask;
   while (gpu::ballot(mask, idx < num_recvs)) {
     recv([=](Buffer *buffer, uint32_t id) {
@@ -477,8 +482,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
                          ? sizeof(Buffer::data)
                          : lane_value(size, id) - idx;
       if (idx < lane_value(size, id))
-        inline_memcpy(reinterpret_cast<uint8_t *>(lane_value(dst, id)) + idx,
-                      buffer->data, len);
+        inline_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
     });
     idx += sizeof(Buffer::data);
   }

diff  --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index b9ffdaa089693..67a509c6499b7 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
 #define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
 
+#include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/macros/attributes.h"
 #include "src/__support/macros/properties/architectures.h"
@@ -69,9 +70,13 @@ template <typename T> LIBC_INLINE const T &max(const T &x, const T &y) {
   return x < y ? y : x;
 }
 
-/// Advance the \p ptr by \p bytes.
-template <typename T, typename U> LIBC_INLINE T *advance(T ptr, U bytes) {
-  return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
+/// Advance the \p ptr by \p bytes.
+template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
+  if constexpr (cpp::is_const_v<T>)
+    return reinterpret_cast<T *>(reinterpret_cast<const uint8_t *>(ptr) +
+                                 bytes);
+  else
+    return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
 }
 
 } // namespace rpc

diff  --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h
index 2e9fdfd6e69b1..f4e39afa94121 100644
--- a/libc/utils/gpu/loader/Server.h
+++ b/libc/utils/gpu/loader/Server.h
@@ -35,6 +35,7 @@ void handle_server() {
       uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
       void *strs[rpc::MAX_LANE_SIZE] = {nullptr};
       port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; });
+      port->send([](rpc::Buffer *) { /* void */ });
       for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
         if (strs[i]) {
           fwrite(strs[i], sizes[i], 1, stderr);