[libc-commits] [libc] [libc][Docs] Update `libc` documentation for RPC and others (PR #120018)
Joseph Huber via libc-commits
libc-commits at lists.llvm.org
Sun Dec 15 12:36:18 PST 2024
https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/120018
Summary:
A few of these were out of date; update them now that the C library
interface into RPC has been deleted.
From f4b241e7b053b678f60e34850062c039301bfae4 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Sun, 15 Dec 2024 14:33:24 -0600
Subject: [PATCH] [libc][Docs] Update `libc` documentation for RPC and others
Summary:
A few of these were out of date; update them now that the C library
interface into RPC has been deleted.
---
libc/docs/gpu/rpc.rst | 112 ++++++++++++----------------------------
libc/docs/gpu/using.rst | 35 +------------
2 files changed, 33 insertions(+), 114 deletions(-)
diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst
index e1244154341e9f..0d169c7db9a50f 100644
--- a/libc/docs/gpu/rpc.rst
+++ b/libc/docs/gpu/rpc.rst
@@ -92,20 +92,6 @@ asynchronous operations that do not need to wait until the server has completed
them. If an operation requires more data than the fixed size buffer, we simply
send multiple packets back and forth in a streaming fashion.
-Server Library
---------------
-
-The RPC server's basic functionality is provided by the LLVM C library. A static
-library called ``libllvmlibc_rpc_server.a`` includes handling for the basic
-operations, such as printing or exiting. This has a small API that handles
-setting up the unified buffer and an interface to check the opcodes.
-
-Some operations are too divergent to provide generic implementations for, such
-as allocating device accessible memory. For these cases, we provide a callback
-registration scheme to add a custom handler for any given opcode through the
-port API. More information can be found in the installed header
-``<install>/include/llvmlibc_rpc_server.h``.
-
Client Example
--------------
@@ -183,7 +169,7 @@ CUDA Server Example
The following code shows an example of using the exported RPC interface along
with the C library to manually configure a working server using the CUDA
-language. Other runtimes can use the presence of the ``__llvm_libc_rpc_client``
+language. Other runtimes can use the presence of the ``__llvm_rpc_client`` symbol
in the GPU executable as an indicator for whether or not the server can be
checked. These details should ideally be handled by the GPU language runtime,
but the following example shows how it can be used by a standard user.
@@ -196,53 +182,16 @@ but the following example shows how it can be used by a standard user.
#include <cstdlib>
#include <cuda_runtime.h>
- #include <llvmlibc_rpc_server.h>
+ #include <shared/rpc.h>
+ #include <shared/rpc_opcodes.h>
[[noreturn]] void handle_error(cudaError_t err) {
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
- [[noreturn]] void handle_error(rpc_status_t err) {
- fprintf(stderr, "RPC error: %d\n", err);
- exit(EXIT_FAILURE);
- }
-
- // The handle to the RPC client provided by the C library.
- extern "C" __device__ void *__llvm_libc_rpc_client;
-
- __global__ void get_client_ptr(void **ptr) { *ptr = __llvm_libc_rpc_client; }
-
- // Obtain the RPC client's handle from the device. The CUDA language cannot look
- // up the symbol directly like the driver API, so we launch a kernel to read it.
- void *get_rpc_client() {
- void *rpc_client = nullptr;
- void **rpc_client_d = nullptr;
-
- if (cudaError_t err = cudaMalloc(&rpc_client_d, sizeof(void *)))
- handle_error(err);
- get_client_ptr<<<1, 1>>>(rpc_client_d);
- if (cudaError_t err = cudaDeviceSynchronize())
- handle_error(err);
- if (cudaError_t err = cudaMemcpy(&rpc_client, rpc_client_d, sizeof(void *),
- cudaMemcpyDeviceToHost))
- handle_error(err);
- return rpc_client;
- }
-
- // Routines to allocate mapped memory that both the host and the device can
- // access asychonrously to communicate with each other.
- void *alloc_host(size_t size, void *) {
- void *sharable_ptr;
- if (cudaError_t err = cudaMallocHost(&sharable_ptr, sizeof(void *)))
- handle_error(err);
- return sharable_ptr;
- };
-
- void free_host(void *ptr, void *) {
- if (cudaError_t err = cudaFreeHost(ptr))
- handle_error(err);
- }
+ // Routes the library symbol into the CUDA runtime interface.
+ [[gnu::weak]] __device__ rpc::Client client asm("__llvm_rpc_client");
// The device-side overload of the standard C function to call.
extern "C" __device__ int puts(const char *);
@@ -251,18 +200,23 @@ but the following example shows how it can be used by a standard user.
__global__ void hello() { puts("Hello world!"); }
int main() {
- // Initialize the RPC server to run on the given device.
- rpc_device_t device;
- if (rpc_status_t err =
- rpc_server_init(&device, RPC_MAXIMUM_PORT_COUNT,
- /*warp_size=*/32, alloc_host, /*data=*/nullptr))
+ void *rpc_client = nullptr;
+ if (cudaError_t err = cudaGetSymbolAddress(&rpc_client, client))
+ handle_error(err);
+
+ // Initialize the RPC client and server interface.
+ uint32_t warp_size = 32;
+ void *rpc_buffer = nullptr;
+ if (cudaError_t err = cudaMallocHost(
+ &rpc_buffer,
+ rpc::Server::allocation_size(warp_size, rpc::MAX_PORT_COUNT)))
handle_error(err);
+ rpc::Server server(rpc::MAX_PORT_COUNT, rpc_buffer);
+ rpc::Client client(rpc::MAX_PORT_COUNT, rpc_buffer);
- // Initialize the RPC client by copying the buffer to the device's handle.
- void *rpc_client = get_rpc_client();
- if (cudaError_t err =
- cudaMemcpy(rpc_client, rpc_get_client_buffer(device),
- rpc_get_client_size(), cudaMemcpyHostToDevice))
+ // Initialize the client on the device so it can communicate with the server.
+ if (cudaError_t err = cudaMemcpy(rpc_client, &client, sizeof(rpc::Client),
+ cudaMemcpyHostToDevice))
handle_error(err);
cudaStream_t stream;
@@ -274,28 +228,25 @@ but the following example shows how it can be used by a standard user.
// While the kernel is executing, check the RPC server for work to do.
// Requires non-blocking CUDA kernels but avoids a separate thread.
- while (cudaStreamQuery(stream) == cudaErrorNotReady)
- if (rpc_status_t err = rpc_handle_server(device))
- handle_error(err);
-
- // Shut down the server running on the given device.
- if (rpc_status_t err =
- rpc_server_shutdown(device, free_host, /*data=*/nullptr))
- handle_error(err);
-
- return EXIT_SUCCESS;
+ do {
+ auto port = server.try_open(warp_size, /*index=*/0);
+ // From libllvmlibc_rpc_server.a in the installation.
+ if (port)
+ handle_libc_opcodes(*port, warp_size);
+ } while (cudaStreamQuery(stream) == cudaErrorNotReady);
}
The above code must be compiled in CUDA's relocatable device code mode and with
the advanced offloading driver to link in the library. Currently this can be
done with the following invocation. Using LTO avoids the overhead normally
-associated with relocatable device code linking.
+associated with relocatable device code linking. The C library for GPUs is
+linked in by forwarding the static library to the device-side link job.
.. code-block:: sh
- $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart -lcgpu-nvptx \
+ $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart \
-I<install-path>include -L<install-path>/lib -lllvmlibc_rpc_server \
- -O3 -foffload-lto -o hello
+ -Xoffload-linker -lc -O3 -foffload-lto -o hello
$> ./hello
Hello world!
@@ -304,4 +255,5 @@ Extensions
The opcode is a 32-bit integer that must be unique to the requested operation.
All opcodes used by ``libc`` internally have the character ``c`` in the most
-significant byte.
+significant byte. Any other opcode is available for use outside of the ``libc``
+implementation.
diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst
index e56b6f634bb31e..1c1f9c9bfb0c69 100644
--- a/libc/docs/gpu/using.rst
+++ b/libc/docs/gpu/using.rst
@@ -99,39 +99,6 @@ threads and two blocks.
Including the wrapper headers, linking the C library, and running the :ref:`RPC
server<libc_gpu_rpc>` are all handled automatically by the compiler and runtime.
-Binary format
-^^^^^^^^^^^^^
-
-The ``libcgpu.a`` static archive is a fat-binary containing LLVM-IR for each
-supported target device. The supported architectures can be seen using LLVM's
-``llvm-objdump`` with the ``--offloading`` flag:
-
-.. code-block:: sh
-
- $> llvm-objdump --offloading libcgpu-amdgpu.a
- libcgpu-amdgpu.a(strcmp.cpp.o): file format elf64-x86-64
-
- OFFLOADING IMAGE [0]:
- kind llvm ir
- arch generic
- triple amdgcn-amd-amdhsa
- producer none
- ...
-
-Because the device code is stored inside a fat binary, it can be difficult to
-inspect the resulting code. This can be done using the following utilities:
-
-.. code-block:: sh
-
- $> llvm-ar x libcgpu.a strcmp.cpp.o
- $> clang-offload-packager strcmp.cpp.o --image=arch=generic,file=strcmp.bc
- $> opt -S out.bc
- ...
-
-Please note that this fat binary format is provided for compatibility with
-existing offloading toolchains. The implementation in ``libc`` does not depend
-on any existing offloading languages and is completely freestanding.
-
Direct compilation
------------------
@@ -246,7 +213,7 @@ compilation. Using link time optimization will help hide this.
.. code-block:: sh
- $> clang hello.c --target=nvptx64-nvidia-cuda -mcpu=native -flto -lc <install>/lib/nvptx64-nvidia-cuda/crt1.o
+ $> clang hello.c --target=nvptx64-nvidia-cuda -march=native -flto -lc <install>/lib/nvptx64-nvidia-cuda/crt1.o
$> nvptx-loader --threads 2 --blocks 2 a.out
Hello from NVPTX!
Hello from NVPTX!