[libc-commits] [libc] 58f5e5e - [libc] Implement the RPC client / server for NVPTX

Fri Mar 24 18:04:53 PDT 2023

Author: Joseph Huber
Date: 2023-03-24T20:04:43-05:00
New Revision: 58f5e5e6b00e5dd674d6e37ed651bc996a397cc3

URL: https://github.com/llvm/llvm-project/commit/58f5e5e6b00e5dd674d6e37ed651bc996a397cc3
DIFF: https://github.com/llvm/llvm-project/commit/58f5e5e6b00e5dd674d6e37ed651bc996a397cc3.diff

LOG: [libc] Implement the RPC client / server for NVPTX

This patch adds the necessary code to implement the existing RPC client
/ server interface when targeting NVPTX GPUs. This closely follows
the implementation in the AMDGPU version. This does not yet enable unit
testing as the `nvlink` linker does not support static libraries, so
that will need to be worked around.

I am ignoring the RPC duplication between the AMDGPU and NVPTX loaders. This
will be changed completely later so there's no point unifying the code at this
stage. The implementation was tested manually with the following file and
compilation flags.

```
// Forward declarations of the two libc entry points this test program calls.
// No header is included here; the definitions are presumably supplied by the
// object files named on the link line shown further down (io.o, quick_exit.o).
namespace __llvm_libc {
void write_to_stderr(const char *msg);
void quick_exit(int);
} // namespace __llvm_libc

using namespace __llvm_libc;

// Manual smoke test: passes each command-line argument to write_to_stderr,
// followed by a newline, then calls quick_exit(255). The sample session below
// shows the expected result: the arguments echoed one per line and an exit
// status of 255. `envp` is accepted but unused.
int main(int argc, char **argv, char **envp) {
  for (int i = 0; i < argc; ++i) {
    write_to_stderr(argv[i]);
    write_to_stderr("\n");
  }
  // Exits with a distinctive non-zero status rather than returning from main.
  quick_exit(255);
}
```

```
$ clang++ crt1.o rpc_client.o quick_exit.o io.o main.cpp --target=nvptx64-nvidia-cuda -march=sm_70 -o image
$ ./nvptx_loader image 1 2 3
image
1
2
3
$ echo $?
255
```

Depends on D146681

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D146846

Added: 
    

Modified: 
    libc/startup/gpu/nvptx/CMakeLists.txt
    libc/startup/gpu/nvptx/start.cpp
    libc/utils/gpu/loader/nvptx/Loader.cpp

Removed: 
    


################################################################################
diff  --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
index 1ee2108b0ef29..0fe0b2d7f7e6c 100644

--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -2,6 +2,8 @@ add_startup_object(
   crt1
   SRC
     start.cpp
+  DEPENDS
+    libc.src.__support.RPC.rpc_client
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin

diff  --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 1e7f4ca7668c0..b09d6f685f212 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -1,4 +1,4 @@
-//===-- Implementation of crt for amdgpu ----------------------------------===//
+//===-- Implementation of crt for nvptx -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,10 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/RPC/rpc_client.h"
+
 extern "C" int main(int argc, char **argv, char **envp);
 
 extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
+  __llvm_libc::rpc::client.reset(in, out, buffer);
+
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }

diff  --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 55531fdc588eb..88ef1701b1583 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -15,6 +15,8 @@
 
 #include "Loader.h"
 
+#include "src/__support/RPC/rpc.h"
+
 #include "cuda.h"
 #include <cstddef>
 #include <cstdio>
@@ -32,6 +34,30 @@ struct kernel_args_t {
   void *buffer;
 };
 
+static __llvm_libc::rpc::Server server;
+
+/// Queries the RPC client at least once and performs server-side work if there
+/// are any active requests.
+void handle_server() {
+  while (server.handle(
+      [&](__llvm_libc::rpc::Buffer *buffer) {
+        switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) {
+        case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: {
+          fputs(reinterpret_cast<const char *>(&buffer->data[1]), stderr);
+          break;
+        }
+        case __llvm_libc::rpc::Opcode::EXIT: {
+          exit(buffer->data[1]);
+          break;
+        }
+        default:
+          return;
+        };
+      },
+      [](__llvm_libc::rpc::Buffer *buffer) {}))
+    ;
+}
+
 static void handle_error(CUresult err) {
   if (err == CUDA_SUCCESS)
     return;
@@ -106,8 +132,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
   if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
     handle_error(err);
 
+  void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
+  void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
+  void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer));
+  if (!server_inbox || !server_outbox || !buffer)
+    handle_error("Failed to allocate memory the RPC client / server.");
+
   // Set up the arguments to the '_start' kernel on the GPU.
-  // TODO: Setup RPC server implementation;
   uint64_t args_size = sizeof(kernel_args_t);
   kernel_args_t args;
   std::memset(&args, 0, args_size);
@@ -115,10 +146,16 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
   args.argv = dev_argv;
   args.envp = dev_envp;
   args.ret = reinterpret_cast<void *>(dev_ret);
+  args.inbox = server_outbox;
+  args.outbox = server_inbox;
+  args.buffer = buffer;
   void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                          CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                          CU_LAUNCH_PARAM_END};
 
+  // Initialize the RPC server's buffer for host-device communication.
+  server.reset(server_inbox, server_outbox, buffer);
+
   // Call the kernel with the given arguments.
   if (CUresult err =
           cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
@@ -126,9 +163,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
                          /*bloackDimZ=*/1, 0, stream, nullptr, args_config))
     handle_error(err);
 
-  // TODO: Query the RPC server periodically while the kernel is running.
+  // Wait until the kernel has completed execution on the device. Periodically
+  // check the RPC client for work to be performed on the server.
   while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
-    ;
+    handle_server();
 
   // Copy the return value back from the kernel and wait.
   int host_ret = 0;