[libc-commits] [libc] 8e4f9b1 - [libc] Add initial support for an RPC mechanism for the GPU

Fri Mar 17 10:55:41 PDT 2023

Author: Joseph Huber
Date: 2023-03-17T12:55:31-05:00
New Revision: 8e4f9b1fcbfd5e747b0df9a2de511b43bfa13348

URL: https://github.com/llvm/llvm-project/commit/8e4f9b1fcbfd5e747b0df9a2de511b43bfa13348
DIFF: https://github.com/llvm/llvm-project/commit/8e4f9b1fcbfd5e747b0df9a2de511b43bfa13348.diff

LOG: [libc] Add initial support for an RPC mechanism for the GPU

This patch adds initial support for an RPC client / server architecture.
The GPU is unable to perform several system utilities on its own, so in
order to implement features like printing or memory allocation we need
to be able to communicate with the executing process. This is done via a
buffer of "sharable" memory. That is, a buffer with a unified pointer
that both the client and server can use to communicate.

The implementation here is based on Jon Chesterfield's minimal RPC
example. We use an `inbox` and an `outbox` to signal when there is an RPC
request and to signify when the work is done.
We use a fixed-size buffer for the communication channel. This is fixed
size so that we can ensure that there is enough space for all
compute-units on the GPU to issue work to any of the ports. Right now
the implementation is single threaded so there is only a single buffer
that is not shared.

This implementation is still missing several features needed to be
complete, such as multi-threaded support and asynchronous calls.

Depends on D145912

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D145913

Added: 
    libc/src/__support/OSUtil/gpu/io.cpp
    libc/src/__support/OSUtil/gpu/io.h
    libc/src/__support/RPC/CMakeLists.txt
    libc/src/__support/RPC/rpc.h
    libc/src/__support/RPC/rpc_client.cpp
    libc/src/__support/RPC/rpc_client.h

Modified: 
    libc/src/__support/CMakeLists.txt
    libc/src/__support/OSUtil/CMakeLists.txt
    libc/src/__support/OSUtil/gpu/CMakeLists.txt
    libc/src/__support/OSUtil/gpu/quick_exit.cpp
    libc/src/__support/OSUtil/io.h
    libc/startup/gpu/amdgpu/CMakeLists.txt
    libc/startup/gpu/amdgpu/start.cpp
    libc/utils/gpu/loader/amdgpu/CMakeLists.txt
    libc/utils/gpu/loader/amdgpu/Loader.cpp

Removed: 
    


################################################################################
diff  --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 8329a80d85fb9..e4eb354aefcd3 100644

--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -204,6 +204,7 @@ add_header_library(
 add_subdirectory(FPUtil)
 add_subdirectory(OSUtil)
 add_subdirectory(StringUtil)
+add_subdirectory(RPC)
 
 # Thread support is used by other "File". So, we add the "threads"
 # before "File".

diff  --git a/libc/src/__support/OSUtil/CMakeLists.txt b/libc/src/__support/OSUtil/CMakeLists.txt
index 50aad32b7ed31..c19677582643e 100644
--- a/libc/src/__support/OSUtil/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/CMakeLists.txt
@@ -8,12 +8,23 @@ if(NOT TARGET ${target_os_util})
   return()
 endif()
 
-add_header_library(
-  osutil
-  HDRS
-    io.h
-    quick_exit.h
-    syscall.h
-  DEPENDS
-    ${target_os_util}
-)
+# The OSUtil is an object library in GPU mode.
+if(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
+  add_header_library(
+    osutil
+    HDRS
+      io.h
+      quick_exit.h
+      syscall.h
+    DEPENDS
+      ${target_os_util}
+  )
+else()
+  add_object_library(
+    osutil
+    ALIAS
+      ${target_os_util}
+    DEPENDS
+      ${target_os_util}
+  )
+endif()

diff  --git a/libc/src/__support/OSUtil/gpu/CMakeLists.txt b/libc/src/__support/OSUtil/gpu/CMakeLists.txt
index eb6e86feaad3b..d1aa69604e515 100644
--- a/libc/src/__support/OSUtil/gpu/CMakeLists.txt
+++ b/libc/src/__support/OSUtil/gpu/CMakeLists.txt
@@ -2,8 +2,11 @@ add_object_library(
   gpu_util
   SRCS
     quick_exit.cpp
+    io.cpp
   HDRS
     quick_exit.h
+    io.h
   DEPENDS
     libc.src.__support.common
+    libc.src.__support.RPC.rpc_client
 )

diff  --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp
new file mode 100644
index 0000000000000..75ac83ac6909c
--- /dev/null
+++ b/libc/src/__support/OSUtil/gpu/io.cpp
@@ -0,0 +1,29 @@
+//===-------------- GPU implementation of IO utils --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "io.h"
+
+#include "src/__support/RPC/rpc_client.h"
+#include "src/string/string_utils.h"
+
+namespace __llvm_libc {
+
+/// Sends \p msg to the RPC server so that it can be printed to stderr.
+///
+/// The message is split into chunks that fit in the payload of one
+/// rpc::Buffer, i.e. the bytes that follow the leading opcode word. Every chunk
+/// is NUL-terminated inside the buffer so the receiver can treat the payload as
+/// a C string without reading past it. An empty message issues no request.
+///
+/// \param msg NUL-terminated string to print.
+void write_to_stderr(const char *msg) {
+  const uint64_t length = internal::string_length(msg);
+  // Payload bytes after the opcode word, minus one reserved for the terminator.
+  const uint64_t chunk_capacity =
+      sizeof(rpc::Buffer) - sizeof(uint64_t) - sizeof(char);
+  for (uint64_t i = 0; i < length; i += chunk_capacity) {
+    // Copy only what is left of the message; sizing the copy from the total
+    // length would read past the end of `msg` on the final chunk.
+    const uint64_t remaining = length - i;
+    const uint64_t chunk_len =
+        remaining < chunk_capacity ? remaining : chunk_capacity;
+    rpc::client.run(
+        [&](rpc::Buffer *buffer) {
+          char *payload = reinterpret_cast<char *>(&buffer->data[1]);
+          buffer->data[0] = rpc::Opcode::PRINT_TO_STDERR;
+          inline_memcpy(payload, &msg[i], chunk_len);
+          payload[chunk_len] = '\0';
+        },
+        [](rpc::Buffer *) {});
+  }
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/src/__support/OSUtil/gpu/io.h b/libc/src/__support/OSUtil/gpu/io.h
new file mode 100644
index 0000000000000..e9a4ebf82a111
--- /dev/null
+++ b/libc/src/__support/OSUtil/gpu/io.h
@@ -0,0 +1,18 @@
+//===-------------- GPU implementation of IO utils --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H
+#define LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H
+
+namespace __llvm_libc {
+
+void write_to_stderr(const char *msg);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_OSUTIL_GPU_IO_H

diff  --git a/libc/src/__support/OSUtil/gpu/quick_exit.cpp b/libc/src/__support/OSUtil/gpu/quick_exit.cpp
index d8c8f032a9403..9be709552dc5c 100644
--- a/libc/src/__support/OSUtil/gpu/quick_exit.cpp
+++ b/libc/src/__support/OSUtil/gpu/quick_exit.cpp
@@ -11,11 +11,21 @@
 
 #include "quick_exit.h"
 
+#include "src/__support/RPC/rpc_client.h"
 #include "src/__support/macros/properties/architectures.h"
 
 namespace __llvm_libc {
 
 void quick_exit(int status) {
+  // TODO: Support asynchronous calls so we don't wait and exit from the GPU
+  // immediately.
+  rpc::client.run(
+      [&](rpc::Buffer *buffer) {
+        buffer->data[0] = rpc::Opcode::EXIT;
+        buffer->data[1] = status;
+      },
+      [](rpc::Buffer *) {});
+
 #if defined(LIBC_TARGET_ARCH_IS_NVPTX)
   asm("exit" ::: "memory");
 #elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)

diff  --git a/libc/src/__support/OSUtil/io.h b/libc/src/__support/OSUtil/io.h
index dbf92dc201c3d..e2eee0826b177 100644
--- a/libc/src/__support/OSUtil/io.h
+++ b/libc/src/__support/OSUtil/io.h
@@ -9,7 +9,11 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
 #define LLVM_LIBC_SRC_SUPPORT_OSUTIL_IO_H
 
-#ifdef __unix__
+#include "src/__support/macros/properties/architectures.h"
+
+#if defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "gpu/io.h"
+#elif defined(__unix__)
 #include "linux/io.h"
 #endif
 

diff  --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
new file mode 100644
index 0000000000000..f5837628c2050
--- /dev/null
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_header_library(
+  rpc
+  HDRS
+    rpc.h
+  DEPENDS
+    libc.src.__support.common
+    libc.src.__support.CPP.atomic
+)
+
+add_object_library(
+  rpc_client
+  SRCS
+    rpc_client.cpp
+  HDRS
+    rpc_client.h
+  DEPENDS
+    .rpc
+)

diff  --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
new file mode 100644
index 0000000000000..c3df09e3f5db2
--- /dev/null
+++ b/libc/src/__support/RPC/rpc.h
@@ -0,0 +1,140 @@
+//===-- Shared memory RPC client / server interface -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_H
+#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_H
+
+#include "src/__support/CPP/atomic.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// A list of opcodes that we use to invoke certain actions on the server. The
+/// values up to and including LIBC_LAST (255) are reserved for internal libc
+/// usage.
+enum Opcode : uint64_t {
+  // No action requested.
+  NOOP = 0,
+  // Ask the server to print the string carried in the buffer to stderr.
+  PRINT_TO_STDERR = 1,
+  // Ask the server to exit with the status carried in the buffer.
+  EXIT = 2,
+  // Largest opcode value reserved for libc.
+  LIBC_LAST = (1UL << 8) - 1,
+};
+
+/// A fixed size channel used to communicate between the RPC client and server.
+/// It holds eight 64-bit words. The users of this header shown in this patch
+/// store the opcode in data[0] and the request payload in the remaining words.
+struct Buffer {
+  uint64_t data[8];
+};
+
+/// State shared by both ends of the RPC channel. Each end holds a pointer to
+/// its inbox, its outbox, and the buffer that carries the request; the inbox
+/// and outbox are the flags used to signal who currently owns that buffer.
+struct Process {
+  cpp::Atomic<uint32_t> *inbox;
+  cpp::Atomic<uint32_t> *outbox;
+  Buffer *buffer;
+
+  /// Point this process at its communication channels. The arguments are
+  /// untyped pointers that are reinterpreted as the inbox flag, the outbox
+  /// flag, and the shared buffer respectively.
+  void reset(void *inbox, void *outbox, void *buffer) {
+    using Mailbox = cpp::Atomic<uint32_t>;
+    // The parameters shadow the members, so the members are named explicitly.
+    this->inbox = static_cast<Mailbox *>(inbox);
+    this->outbox = static_cast<Mailbox *>(outbox);
+    this->buffer = static_cast<Buffer *>(buffer);
+  }
+};
+
+/// The RPC client used to make requests to the server. It adds no state of its
+/// own on top of Process.
+struct Client : public Process {
+  /// Issue one request: \p fill is applied to the shared buffer to write the
+  /// request and \p use is applied to it once the server has responded. The
+  /// definition follows later in this header.
+  template <typename F, typename U> void run(F fill, U use);
+};
+
+/// The RPC server used to respond to the client. It adds no state of its own
+/// on top of Process.
+struct Server : public Process {
+  /// Check for a request once: \p work is applied to the shared buffer to
+  /// handle it and \p clean is applied after the client has consumed the
+  /// result. Returns false when there was nothing to do. The definition follows
+  /// later in this header.
+  template <typename W, typename C> bool run(W work, C clean);
+};
+
+/// Run the RPC client protocol to communicate with the server. We perform the
+/// following high level actions to complete a communication:
+///   - Apply \p fill to the shared buffer and write 1 to the outbox.
+///   - Wait until the inbox is 1.
+///   - Apply \p use to the shared buffer and write 0 to the outbox.
+///   - Wait until the inbox is 0.
+///
+/// The call spins on the inbox in both wait steps, so it does not return until
+/// the server has answered and then acknowledged the end of the exchange.
+///
+/// \param fill Callable invoked with the shared buffer to write the request.
+/// \param use Callable invoked with the shared buffer to read the response.
+template <typename F, typename U> void Client::run(F fill, U use) {
+  // Snapshot both flags as booleans. The stages below update these local
+  // copies so that a single call falls through each stage in order.
+  bool in = inbox->load(cpp::MemoryOrder::RELAXED);
+  bool out = outbox->load(cpp::MemoryOrder::RELAXED);
+  atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  // Write to buffer then to the outbox. The conditions combine the two
+  // booleans with bitwise `&`, so both operands are always evaluated.
+  if (!in & !out) {
+    fill(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(1, cpp::MemoryOrder::RELEASE);
+    out = 1;
+  }
+  // Wait for the result from the server.
+  if (!in & out) {
+    // Spin with relaxed loads; the fence after the loop orders the later
+    // buffer reads after the observed inbox value.
+    while (!in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+  // Read from the buffer and then write to outbox.
+  if (in & out) {
+    use(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(0, cpp::MemoryOrder::RELEASE);
+    out = 0;
+  }
+  // Wait for server to complete the communication.
+  if (in & !out) {
+    while (in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+}
+
+/// Run the RPC server protocol to communicate with the client. This is
+/// non-blocking when there is no request and only checks for one a single
+/// time. We perform the following high level actions to complete a
+/// communication:
+///   - Query if the inbox is 1 and exit if there is no work to do.
+///   - Apply \p work to the shared buffer and write 1 to the outbox.
+///   - Wait until the inbox is 0.
+///   - Apply \p clean to the shared buffer and write 0 to the outbox.
+///
+/// \param work Callable invoked with the shared buffer to handle the request.
+/// \param clean Callable invoked with the shared buffer once the client has
+///        cleared its side of the exchange.
+/// \returns false if both the inbox and the outbox were 0 on entry, true
+///          otherwise.
+template <typename W, typename C> bool Server::run(W work, C clean) {
+  // Snapshot both flags as booleans. The stages below update these local
+  // copies so that a single call falls through each stage in order.
+  bool in = inbox->load(cpp::MemoryOrder::RELAXED);
+  bool out = outbox->load(cpp::MemoryOrder::RELAXED);
+  atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  // No work to do, exit.
+  if (!in & !out)
+    return false;
+  // Do work then write to the outbox.
+  if (in & !out) {
+    work(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(1, cpp::MemoryOrder::RELEASE);
+    out = 1;
+  }
+  // Wait for the client to read the result. Once a request has been handled
+  // this spins until the client writes 0 to our inbox.
+  if (in & out) {
+    while (in)
+      in = inbox->load(cpp::MemoryOrder::RELAXED);
+    atomic_thread_fence(cpp::MemoryOrder::ACQUIRE);
+  }
+  // Clean up the buffer and signal the client.
+  if (!in & out) {
+    clean(buffer);
+    atomic_thread_fence(cpp::MemoryOrder::RELEASE);
+    outbox->store(0, cpp::MemoryOrder::RELEASE);
+    out = 0;
+  }
+
+  return true;
+}
+
+} // namespace rpc
+} // namespace __llvm_libc
+
+#endif

diff  --git a/libc/src/__support/RPC/rpc_client.cpp b/libc/src/__support/RPC/rpc_client.cpp
new file mode 100644
index 0000000000000..3e64fe5ec67c8
--- /dev/null
+++ b/libc/src/__support/RPC/rpc_client.cpp
@@ -0,0 +1,27 @@
+//===-- Shared memory RPC client instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This is a source file, so it carries no include guard. Including the
+// matching header lets the compiler check the definition of `client` against
+// its `extern` declaration.
+#include "rpc_client.h"
+
+#include "rpc.h"
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// The libc client instance used to communicate with the server. It has static
+/// storage, so its channel pointers start out null and must be set with
+/// reset() before the first call to run().
+Client client;
+
+/// Externally visible symbol to signify the usage of an RPC client to
+/// whomever needs to run the server.
+extern "C" [[gnu::visibility("protected")]] const bool __llvm_libc_rpc = false;
+
+} // namespace rpc
+} // namespace __llvm_libc

diff  --git a/libc/src/__support/RPC/rpc_client.h b/libc/src/__support/RPC/rpc_client.h
new file mode 100644
index 0000000000000..509ec2f4185b9
--- /dev/null
+++ b/libc/src/__support/RPC/rpc_client.h
@@ -0,0 +1,23 @@
+//===-- Shared memory RPC client instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_CLIENT_H
+#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_CLIENT_H
+
+#include "rpc.h"
+
+namespace __llvm_libc {
+namespace rpc {
+
+/// The libc client instance used to communicate with the server.
+extern Client client;
+
+} // namespace rpc
+} // namespace __llvm_libc
+
+#endif

diff  --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
index be202371fa3a1..d1c6fc7cd6442 100644
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -2,11 +2,12 @@ add_startup_object(
   crt1
   SRC
     start.cpp
+  DEPENDS
+    libc.src.__support.RPC.rpc_client
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
     -nogpulib # Do not include any GPU vendor libraries.
-    -nostdinc
     -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
     -emit-llvm # AMDGPU's intermediate object file format is bitcode.
     --target=${LIBC_GPU_TARGET_TRIPLE}

diff  --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 3be3745d4c7a0..cc30982e148ff 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -6,9 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/RPC/rpc_client.h"
+
 extern "C" int main(int argc, char **argv);
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_start(int argc, char **argv, int *ret) {
+_start(int argc, char **argv, int *ret, void *in, void *out, void *buffer) {
+  __llvm_libc::rpc::client.reset(in, out, buffer);
+
   __atomic_fetch_or(ret, main(argc, argv), __ATOMIC_RELAXED);
 }

diff  --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
index 524e808f7f8a4..bef97af0d195f 100644
--- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
+++ b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_executable(amdhsa_loader Loader.cpp)
+add_dependencies(amdhsa_loader libc.src.__support.RPC.rpc)
+
+target_include_directories(amdhsa_loader PRIVATE ${LIBC_SOURCE_DIR})
 target_link_libraries(amdhsa_loader
   PRIVATE
   hsa-runtime64::hsa-runtime64

diff  --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index 5f444d83083a4..3136dc2509790 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -15,6 +15,8 @@
 
 #include "Loader.h"
 
+#include "src/__support/RPC/rpc.h"
+
 #include <hsa/hsa.h>
 #include <hsa/hsa_ext_amd.h>
 
@@ -31,8 +33,35 @@ struct kernel_args_t {
   int argc;
   void *argv;
   void *ret;
+  void *inbox;
+  void *outbox;
+  void *buffer;
 };
 
+static __llvm_libc::rpc::Server server;
+
+/// Queries the RPC client at least once and performs server-side work if there
+/// are any active requests. The server protocol is re-run until it reports
+/// that there was nothing to do.
+void handle_server() {
+  while (server.run(
+      [](__llvm_libc::rpc::Buffer *buffer) {
+        switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) {
+        case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: {
+          // Number of payload bytes that follow the opcode word.
+          constexpr int payload_size =
+              sizeof(__llvm_libc::rpc::Buffer) - sizeof(uint64_t);
+          // Bound the read to the payload so that a chunk which is not
+          // NUL-terminated cannot make us read past the shared buffer.
+          fprintf(stderr, "%.*s", payload_size,
+                  reinterpret_cast<const char *>(&buffer->data[1]));
+          break;
+        }
+        case __llvm_libc::rpc::Opcode::EXIT: {
+          // The status travels in a 64-bit word; narrow it explicitly.
+          exit(static_cast<int>(buffer->data[1]));
+          break;
+        }
+        default:
+          // Unknown or no-op requests are acknowledged without any action.
+          break;
+        }
+      },
+      [](__llvm_libc::rpc::Buffer *) {}))
+    ;
+}
+
 /// Print the error code and exit if \p code indicates an error.
 static void handle_error(hsa_status_t code) {
   if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -278,6 +307,26 @@ int load(int argc, char **argv, void *image, size_t size) {
     handle_error(err);
   hsa_amd_memory_fill(dev_ret, 0, sizeof(int));
 
+  // Allocate finegrained memory for the RPC server and client to share.
+  void *server_inbox;
+  void *server_outbox;
+  void *buffer;
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          finegrained_pool, sizeof(__llvm_libc::cpp::Atomic<int>),
+          /*flags=*/0, &server_inbox))
+    handle_error(err);
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          finegrained_pool, sizeof(__llvm_libc::cpp::Atomic<int>),
+          /*flags=*/0, &server_outbox))
+    handle_error(err);
+  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
+          finegrained_pool, sizeof(__llvm_libc::rpc::Buffer),
+          /*flags=*/0, &buffer))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);
+
   // Initialie all the arguments (explicit and implicit) to zero, then set the
   // explicit arguments to the values created above.
   std::memset(args, 0, args_size);
@@ -285,6 +334,9 @@ int load(int argc, char **argv, void *image, size_t size) {
   kernel_args->argc = argc;
   kernel_args->argv = dev_argv;
   kernel_args->ret = dev_ret;
+  kernel_args->inbox = server_outbox;
+  kernel_args->outbox = server_inbox;
+  kernel_args->buffer = buffer;
 
   // Obtain a packet from the queue.
   uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
@@ -316,6 +368,9 @@ int load(int argc, char **argv, void *image, size_t size) {
           hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
     handle_error(err);
 
+  // Initialize the RPC server's buffer for host-device communication.
+  server.reset(server_inbox, server_outbox, buffer);
+
   // Initialize the packet header and set the doorbell signal to begin execution
   // by the HSA runtime.
   uint16_t header =
@@ -326,11 +381,12 @@ int load(int argc, char **argv, void *image, size_t size) {
                    __ATOMIC_RELEASE);
   hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
 
-  // Wait until the kernel has completed execution on the device.
-  while (hsa_signal_wait_scacquire(packet->completion_signal,
-                                   HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
-                                   HSA_WAIT_STATE_ACTIVE) != 0)
-    ;
+  // Wait until the kernel has completed execution on the device. Periodically
+  // check the RPC client for work to be performed on the server.
+  while (hsa_signal_wait_scacquire(
+             packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
+             /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
+    handle_server();
 
   // Create a memory signal and copy the return value back from the device into
   // a new buffer.