[llvm] [Offload] Move RPC server handling to a dedicated thread (PR #112988)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 28 14:14:19 PDT 2024
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/112988
From 425cea810466d96d8d6448ae748b361ded73ca36 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 17 Oct 2024 17:08:03 -0500
Subject: [PATCH 1/4] [OpenMP] Add support for custom callback in AMDGPUStream
Summary:
We have the ability to schedule callbacks after certain events complete.
Currently we can register an arbitrary callback in CUDA, but not in
AMDGPU. I am planning to use this support to move the RPC handling to a
separate thread, then use these callbacks to suspend / resume it when
no kernels are running. This is a preliminary patch to keep that noise
out of the main change.
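A rough sketch of the intended use, assuming only the schedCallback
interface added below (the onKernelDone callback and the wiring around
it are illustrative, not part of this patch):

  // Hypothetical callback matching AMDGPUStreamCallbackTy. The follow-up
  // RPC patch registers something like this to run once a kernel's
  // completion signal fires.
  static llvm::Error onKernelDone(void *Data) {
    auto *Plugin = reinterpret_cast<GenericPluginTy *>(Data);
    (void)Plugin; // e.g. suspend the RPC server thread here.
    return llvm::Error::success();
  }

  // Inside AMDGPUStreamTy, after acquiring slot Curr:
  if (auto Err = Slots[Curr].schedCallback(onKernelDone, &Device.Plugin))
    return Err;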
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 68 ++++++++++++++--------
1 file changed, 43 insertions(+), 25 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index f0cc0c2e4d08e5..54f27629dde219 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -927,6 +927,8 @@ struct AMDGPUStreamTy {
AMDGPUSignalManagerTy *SignalManager;
};
+ using AMDGPUStreamCallbackTy = Error(void *Data);
+
/// The stream is composed of N stream's slots. The struct below represents
/// the fields of each slot. Each slot has a signal and an optional action
/// function. When appending an HSA asynchronous operation to the stream, one
@@ -942,65 +944,81 @@ struct AMDGPUStreamTy {
/// operation as input signal.
AMDGPUSignalTy *Signal;
- /// The action that must be performed after the operation's completion. Set
+ /// The actions that must be performed after the operation's completion. Set
/// to nullptr when there is no action to perform.
- Error (*ActionFunction)(void *);
+ llvm::SmallVector<AMDGPUStreamCallbackTy *> Callbacks;
/// Space for the action's arguments. A pointer to these arguments is passed
/// to the action function. Notice the space of arguments is limited.
- union {
+ union ActionArgsTy {
MemcpyArgsTy MemcpyArgs;
ReleaseBufferArgsTy ReleaseBufferArgs;
ReleaseSignalArgsTy ReleaseSignalArgs;
- } ActionArgs;
+ void *CallbackArgs;
+ };
+
+ llvm::SmallVector<ActionArgsTy> ActionArgs;
/// Create an empty slot.
- StreamSlotTy() : Signal(nullptr), ActionFunction(nullptr) {}
+ StreamSlotTy() : Signal(nullptr), Callbacks({}), ActionArgs({}) {}
/// Schedule a host memory copy action on the slot.
Error schedHostMemoryCopy(void *Dst, const void *Src, size_t Size) {
- ActionFunction = memcpyAction;
- ActionArgs.MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
+ Callbacks.emplace_back(memcpyAction);
+ ActionArgs.emplace_back().MemcpyArgs = MemcpyArgsTy{Dst, Src, Size};
return Plugin::success();
}
/// Schedule a release buffer action on the slot.
Error schedReleaseBuffer(void *Buffer, AMDGPUMemoryManagerTy &Manager) {
- ActionFunction = releaseBufferAction;
- ActionArgs.ReleaseBufferArgs = ReleaseBufferArgsTy{Buffer, &Manager};
+ Callbacks.emplace_back(releaseBufferAction);
+ ActionArgs.emplace_back().ReleaseBufferArgs =
+ ReleaseBufferArgsTy{Buffer, &Manager};
return Plugin::success();
}
/// Schedule a signal release action on the slot.
Error schedReleaseSignal(AMDGPUSignalTy *SignalToRelease,
AMDGPUSignalManagerTy *SignalManager) {
- ActionFunction = releaseSignalAction;
- ActionArgs.ReleaseSignalArgs =
+ Callbacks.emplace_back(releaseSignalAction);
+ ActionArgs.emplace_back().ReleaseSignalArgs =
ReleaseSignalArgsTy{SignalToRelease, SignalManager};
return Plugin::success();
}
+ /// Register a callback to be called on completion.
+ Error schedCallback(AMDGPUStreamCallbackTy *Func, void *Data) {
+ Callbacks.emplace_back(Func);
+ ActionArgs.emplace_back().CallbackArgs = Data;
+
+ return Plugin::success();
+ }
+
// Perform the action if needed.
Error performAction() {
- if (!ActionFunction)
+ if (Callbacks.empty())
return Plugin::success();
- // Perform the action.
- if (ActionFunction == memcpyAction) {
- if (auto Err = memcpyAction(&ActionArgs))
- return Err;
- } else if (ActionFunction == releaseBufferAction) {
- if (auto Err = releaseBufferAction(&ActionArgs))
- return Err;
- } else if (ActionFunction == releaseSignalAction) {
- if (auto Err = releaseSignalAction(&ActionArgs))
- return Err;
- } else {
- return Plugin::error("Unknown action function!");
+ for (auto [Callback, ActionArg] : llvm::zip(Callbacks, ActionArgs)) {
+ // Perform the action.
+ if (Callback == memcpyAction) {
+ if (auto Err = memcpyAction(&ActionArg))
+ return Err;
+ } else if (Callback == releaseBufferAction) {
+ if (auto Err = releaseBufferAction(&ActionArg))
+ return Err;
+ } else if (Callback == releaseSignalAction) {
+ if (auto Err = releaseSignalAction(&ActionArg))
+ return Err;
+ } else {
+ if (auto Err = Callback(ActionArg.CallbackArgs))
+ return Err;
+ }
}
// Invalidate the action.
- ActionFunction = nullptr;
+ Callbacks.clear();
+ ActionArgs.clear();
return Plugin::success();
}
From e5144d3fb93b13c2c69fa7e356900cc8db45e904 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Fri, 18 Oct 2024 16:48:33 -0500
Subject: [PATCH 2/4] [Offload] Move RPC server handling to a dedicated thread
Summary:
Handling the RPC server requires running through the list of jobs that
the device has requested to be done. Currently this is handled by the
thread that waits for the kernel to finish. However, this is not sound
on NVIDIA architectures and only works for async launches in the OpenMP
model that uses helper threads.
At the same time, we don't want this thread doing work unnecessarily.
For this reason we track the execution of kernels and put the thread to
sleep via a condition variable (usually backed by some kind of futex or
other intelligent sleeping mechanism) so that the thread is idle while
no kernels are running.
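The suspend / resume scheme, reduced to a minimal standalone sketch
using only the standard library (the names here are illustrative; the
real implementation is in RPC.h and RPC.cpp below):

  #include <atomic>
  #include <condition_variable>
  #include <cstdint>
  #include <mutex>

  struct ServerThreadSketch {
    std::mutex Mutex;
    std::condition_variable CV;
    std::atomic<uint32_t> NumUsers{0};
    std::atomic<bool> Running{true};

    // Kernel launch path: record a new user and wake the server thread.
    void notify() {
      std::lock_guard<std::mutex> Lock(Mutex);
      NumUsers.fetch_add(1, std::memory_order_relaxed);
      CV.notify_all();
    }

    // Kernel completion callback: one fewer kernel needs servicing.
    void finish() { NumUsers.fetch_sub(1, std::memory_order_relaxed); }

    // Server thread body: sleep on the condition variable until a kernel
    // is running or shutdown is requested, then poll until idle again.
    void run() {
      std::unique_lock<std::mutex> Lock(Mutex);
      for (;;) {
        CV.wait(Lock, [&] { return NumUsers > 0 || !Running; });
        if (!Running)
          return;
        Lock.unlock();
        while (NumUsers > 0 && Running) {
          // ... check the RPC interface for pending work ...
        }
        Lock.lock();
      }
    }
  };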
---
offload/plugins-nextgen/amdgpu/src/rtl.cpp | 59 ++++++++--------
offload/plugins-nextgen/common/include/RPC.h | 61 ++++++++++++++--
.../common/src/PluginInterface.cpp | 5 +-
offload/plugins-nextgen/common/src/RPC.cpp | 69 +++++++++++++++----
.../cuda/dynamic_cuda/cuda.cpp | 1 +
.../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 3 +
offload/plugins-nextgen/cuda/src/rtl.cpp | 37 +++++-----
offload/test/libc/server.c | 56 +++++++++++++++
8 files changed, 226 insertions(+), 65 deletions(-)
create mode 100644 offload/test/libc/server.c
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 54f27629dde219..bd1d60075bb36a 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -626,9 +626,9 @@ struct AMDGPUSignalTy {
}
/// Wait until the signal gets a zero value.
- Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
+ Error wait(const uint64_t ActiveTimeout = 0,
GenericDeviceTy *Device = nullptr) const {
- if (ActiveTimeout && !RPCServer) {
+ if (ActiveTimeout) {
hsa_signal_value_t Got = 1;
Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -637,14 +637,11 @@ struct AMDGPUSignalTy {
}
// If there is an RPC device attached to this stream we run it as a server.
- uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
- auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
+ uint64_t Timeout = UINT64_MAX;
+ auto WaitState = HSA_WAIT_STATE_BLOCKED;
while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
- Timeout, WaitState) != 0) {
- if (RPCServer && Device)
- if (auto Err = RPCServer->runServer(*Device))
- return Err;
- }
+ Timeout, WaitState) != 0)
+ ;
return Plugin::success();
}
@@ -1052,11 +1049,6 @@ struct AMDGPUStreamTy {
/// operation that was already finalized in a previous stream synchronize.
uint32_t SyncCycle;
- /// A pointer associated with an RPC server running on the given device. If
- /// RPC is not being used this will be a null pointer. Otherwise, this
- /// indicates that an RPC server is expected to be run on this stream.
- RPCServerTy *RPCServer;
-
/// Mutex to protect stream's management.
mutable std::mutex Mutex;
@@ -1236,9 +1228,6 @@ struct AMDGPUStreamTy {
/// Deinitialize the stream's signals.
Error deinit() { return Plugin::success(); }
- /// Attach an RPC server to this stream.
- void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
-
/// Push an asynchronous kernel to the stream. The kernel arguments must be
placed in a special allocation for kernel args and must be kept alive until
/// the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1266,10 +1255,30 @@ struct AMDGPUStreamTy {
if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
return Err;
+ // If we are running an RPC server we want to wake up the server thread
+ // whenever there is a kernel running and let it sleep otherwise.
+ if (Device.getRPCServer())
+ Device.Plugin.getRPCServer().Thread->notify();
+
// Push the kernel with the output signal and an input signal (optional)
- return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, OutputSignal,
- InputSignal);
+ if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
+ NumBlocks, GroupSize, StackSize,
+ OutputSignal, InputSignal))
+ return Err;
+
+ // Register a callback to indicate when the kernel is complete.
+ if (Device.getRPCServer()) {
+ if (auto Err = Slots[Curr].schedCallback(
+ [](void *Data) -> llvm::Error {
+ GenericPluginTy &Plugin =
+ *reinterpret_cast<GenericPluginTy *>(Data);
+ Plugin.getRPCServer().Thread->finish();
+ return Error::success();
+ },
+ &Device.Plugin))
+ return Err;
+ }
+ return Plugin::success();
}
/// Push an asynchronous memory copy between pinned memory buffers.
@@ -1479,8 +1488,8 @@ struct AMDGPUStreamTy {
return Plugin::success();
// Wait until all previous operations on the stream have completed.
- if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
- RPCServer, &Device))
+ if (auto Err =
+ Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device))
return Err;
// Reset the stream and perform all pending post actions.
@@ -3024,7 +3033,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
: Agent(Device.getAgent()), Queue(nullptr),
SignalManager(Device.getSignalManager()), Device(Device),
// Initialize the std::deque with some empty positions.
- Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
+ Slots(32), NextSlot(0), SyncCycle(0),
StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
@@ -3377,10 +3386,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
return Err;
- // If this kernel requires an RPC server we attach its pointer to the stream.
- if (GenericDevice.getRPCServer())
- Stream->setRPCServer(GenericDevice.getRPCServer());
-
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
if (ImplArgs &&
getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index 01bf539bcb3f32..d356c62b456f53 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -19,7 +19,11 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/Error.h"
+#include <atomic>
+#include <condition_variable>
#include <cstdint>
+#include <mutex>
+#include <thread>
namespace llvm::omp::target {
namespace plugin {
@@ -37,6 +41,9 @@ struct RPCServerTy {
/// Initializes the handles to the number of devices we may need to service.
RPCServerTy(plugin::GenericPluginTy &Plugin);
+ /// Deinitialize the associated memory and resources.
+ llvm::Error shutDown();
+
/// Check if this device image is using an RPC server. This checks for the
/// presence of an externally visible symbol in the device image that will
/// be present whenever RPC code is called.
@@ -51,17 +58,61 @@ struct RPCServerTy {
plugin::GenericGlobalHandlerTy &Handler,
plugin::DeviceImageTy &Image);
- /// Runs the RPC server associated with the \p Device until the pending work
- /// is cleared.
- llvm::Error runServer(plugin::GenericDeviceTy &Device);
-
/// Deinitialize the RPC server for the given device. This will free the
/// memory associated with the k
llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);
private:
/// Array from this device's identifier to its attached devices.
- llvm::SmallVector<uintptr_t> Handles;
+ std::unique_ptr<std::atomic<uintptr_t>[]> Handles;
+
+ /// A helper class for running the user thread that handles the RPC server.
+ struct ServerThread {
+ std::thread Worker;
+
+ /// A boolean indicating whether or not the worker thread should continue.
+ std::atomic<bool> Running;
+
+ /// The number of currently executing kernels across all devices that need
+ /// the server thread to be running.
+ std::atomic<uint32_t> NumUsers;
+
+ /// The condition variable used to suspend the thread if no work is needed.
+ std::condition_variable CV;
+ std::mutex Mutex;
+
+ /// A reference to all the RPC interfaces that the server is handling.
+ llvm::ArrayRef<std::atomic<uintptr_t>> Handles;
+
+ /// Initialize the worker thread to run in the background.
+ ServerThread(std::atomic<uintptr_t> Handles[], size_t Length);
+ ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
+
+ /// Notify the worker thread that there is a user that needs it.
+ void notify() {
+ std::lock_guard<decltype(Mutex)> Lock(Mutex);
+ NumUsers.fetch_add(1, std::memory_order_relaxed);
+ CV.notify_all();
+ }
+
+ /// Indicate that one of the dependent users has finished.
+ void finish() {
+ [[maybe_unused]] uint32_t Old =
+ NumUsers.fetch_sub(1, std::memory_order_relaxed);
+ assert(Old > 0 && "Attempt to signal finish with no pending work");
+ }
+
+ /// Destroy the worker thread and wait.
+ void shutDown();
+
+ /// Run the server thread to continuously check the RPC interface for work
+ /// to be done for the device.
+ void run();
+ };
+
+public:
+ /// Pointer to the server thread instance.
+ std::unique_ptr<ServerThread> Thread;
};
} // namespace llvm::omp::target
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 25b815b7f96694..2be0fc0a713da5 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1624,8 +1624,11 @@ Error GenericPluginTy::deinit() {
if (GlobalHandler)
delete GlobalHandler;
- if (RPCServer)
+ if (RPCServer) {
+ if (Error Err = RPCServer->shutDown())
+ return Err;
delete RPCServer;
+ }
if (RecordReplay)
delete RecordReplay;
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index faa2cbd4f02fe1..dab112ead6c7e2 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -21,8 +21,64 @@ using namespace llvm;
using namespace omp;
using namespace target;
+void RPCServerTy::ServerThread::shutDown() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+ {
+ std::lock_guard<decltype(Mutex)> Lock(Mutex);
+ Running.store(false, std::memory_order_release);
+ CV.notify_all();
+ }
+ if (Worker.joinable())
+ Worker.join();
+#endif
+}
+
+void RPCServerTy::ServerThread::run() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+ std::unique_lock<decltype(Mutex)> Lock(Mutex);
+ for (;;) {
+ CV.wait(Lock, [&]() {
+ return NumUsers.load(std::memory_order_acquire) > 0 ||
+ !Running.load(std::memory_order_acquire);
+ });
+
+ if (!Running.load(std::memory_order_acquire))
+ return;
+
+ Lock.unlock();
+ while (NumUsers.load(std::memory_order_relaxed) > 0 &&
+ Running.load(std::memory_order_relaxed)) {
+ for (const auto &Handle : Handles) {
+ rpc_device_t RPCDevice{Handle};
+ [[maybe_unused]] rpc_status_t Err = rpc_handle_server(RPCDevice);
+ assert(Err == RPC_STATUS_SUCCESS &&
+ "Checking the RPC server should not fail");
+ }
+ }
+ Lock.lock();
+ }
+#endif
+}
+
+RPCServerTy::ServerThread::ServerThread(std::atomic<uintptr_t> Handles[],
+ size_t Length)
+ : Running(true), NumUsers(0), CV(), Mutex(), Handles(Handles, Length) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+ Worker = std::thread([this]() { run(); });
+#endif
+}
+
RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
- : Handles(Plugin.getNumDevices()) {}
+ : Handles(
+ std::make_unique<std::atomic<uintptr_t>[]>(Plugin.getNumDevices())),
+ Thread(new ServerThread(Handles.get(), Plugin.getNumDevices())) {}
+
+llvm::Error RPCServerTy::shutDown() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+ Thread->shutDown();
+#endif
+ return Error::success();
+}
llvm::Expected<bool>
RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
@@ -109,17 +165,6 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
return Error::success();
}
-Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
-#ifdef LIBOMPTARGET_RPC_SUPPORT
- rpc_device_t RPCDevice{Handles[Device.getDeviceId()]};
- if (rpc_status_t Err = rpc_handle_server(RPCDevice))
- return plugin::Plugin::error(
- "Error while running RPC server on device %d: %d", Device.getDeviceId(),
- Err);
-#endif
- return Error::success();
-}
-
Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
#ifdef LIBOMPTARGET_RPC_SUPPORT
rpc_device_t RPCDevice{Handles[Device.getDeviceId()]};
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 5ec3adb9e4e3a1..7878499dbfcb7e 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -63,6 +63,7 @@ DLWRAP(cuStreamCreate, 2)
DLWRAP(cuStreamDestroy, 1)
DLWRAP(cuStreamSynchronize, 1)
DLWRAP(cuStreamQuery, 1)
+DLWRAP(cuStreamAddCallback, 4)
DLWRAP(cuCtxSetCurrent, 1)
DLWRAP(cuDevicePrimaryCtxRelease, 1)
DLWRAP(cuDevicePrimaryCtxGetState, 3)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 16c8f7ad46c445..ad874735a25ed9 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -286,6 +286,8 @@ static inline void *CU_LAUNCH_PARAM_END = (void *)0x00;
static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
+typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
@@ -326,6 +328,7 @@ CUresult cuStreamCreate(CUstream *, unsigned);
CUresult cuStreamDestroy(CUstream);
CUresult cuStreamSynchronize(CUstream);
CUresult cuStreamQuery(CUstream);
+CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
CUresult cuCtxSetCurrent(CUcontext);
CUresult cuDevicePrimaryCtxRelease(CUdevice);
CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 015c7775ba3513..7c876c603aa46c 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -632,15 +632,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
CUresult Res;
// If we have an RPC server running on this device we will continuously
// query it for work rather than blocking.
- if (!getRPCServer()) {
- Res = cuStreamSynchronize(Stream);
- } else {
- do {
- Res = cuStreamQuery(Stream);
- if (auto Err = getRPCServer()->runServer(*this))
- return Err;
- } while (Res == CUDA_ERROR_NOT_READY);
- }
+ Res = cuStreamSynchronize(Stream);
// Once the stream is synchronized, return it to stream pool and reset
// AsyncInfo. This is to make sure the synchronization only works for its
@@ -825,17 +817,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (auto Err = getStream(AsyncInfoWrapper, Stream))
return Err;
- // If there is already pending work on the stream it could be waiting for
- // someone to check the RPC server.
- if (auto *RPCServer = getRPCServer()) {
- CUresult Res = cuStreamQuery(Stream);
- while (Res == CUDA_ERROR_NOT_READY) {
- if (auto Err = RPCServer->runServer(*this))
- return Err;
- Res = cuStreamQuery(Stream);
- }
- }
-
CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
}
@@ -1294,10 +1275,26 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
reinterpret_cast<void *>(&LaunchParams.Size),
CU_LAUNCH_PARAM_END};
+ // If we are running an RPC server we want to wake up the server thread
+ // whenever there is a kernel running and let it sleep otherwise.
+ if (GenericDevice.getRPCServer())
+ GenericDevice.Plugin.getRPCServer().Thread->notify();
+
CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
/*gridDimZ=*/1, NumThreads,
/*blockDimY=*/1, /*blockDimZ=*/1,
MaxDynCGroupMem, Stream, nullptr, Config);
+
+ // Register a callback to indicate when the kernel is complete.
+ if (GenericDevice.getRPCServer())
+ cuStreamAddCallback(
+ Stream,
+ [](CUstream Stream, CUresult Status, void *Data) {
+ GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
+ Plugin.getRPCServer().Thread->finish();
+ },
+ &GenericDevice.Plugin, /*flags=*/0);
+
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}
diff --git a/offload/test/libc/server.c b/offload/test/libc/server.c
new file mode 100644
index 00000000000000..eb81294436426a
--- /dev/null
+++ b/offload/test/libc/server.c
@@ -0,0 +1,56 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+
+#include <assert.h>
+#include <omp.h>
+#include <stdio.h>
+
+#pragma omp begin declare variant match(device = {kind(gpu)})
+// Extension provided by the 'libc' project.
+unsigned long long rpc_host_call(void *fn, void *args, size_t size);
+#pragma omp declare target to(rpc_host_call) device_type(nohost)
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+// Dummy host implementation to make this work for all targets.
+unsigned long long rpc_host_call(void *fn, void *args, size_t size) {
+ return ((unsigned long long (*)(void *))fn)(args);
+}
+#pragma omp end declare variant
+
+long long foo(void *data) { return -1; }
+
+void *fn_ptr = NULL;
+#pragma omp declare target to(fn_ptr)
+
+int main() {
+ fn_ptr = (void *)&foo;
+#pragma omp target update to(fn_ptr)
+
+ for (int i = 0; i < 4; ++i) {
+#pragma omp target
+ {
+ long long res = rpc_host_call(fn_ptr, NULL, 0);
+ assert(res == -1 && "RPC call failed\n");
+ }
+
+ for (int j = 0; j < 128; ++j) {
+#pragma omp target nowait
+ {
+ long long res = rpc_host_call(fn_ptr, NULL, 0);
+ assert(res == -1 && "RPC call failed\n");
+ }
+ }
+#pragma omp taskwait
+
+#pragma omp target
+ {
+ long long res = rpc_host_call(fn_ptr, NULL, 0);
+ assert(res == -1 && "RPC call failed\n");
+ }
+ }
+
+ // CHECK: PASS
+ puts("PASS");
+}
From 15cdffcd4ff47b94ad96e0b8633e4b2af02cf453 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 21 Oct 2024 07:05:23 -0500
Subject: [PATCH 3/4] Use cuLaunchHostFunc
---
offload/plugins-nextgen/cuda/src/rtl.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 7c876c603aa46c..a46d7fc45f89d5 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -630,8 +630,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
CUresult Res;
- // If we have an RPC server running on this device we will continuously
- // query it for work rather than blocking.
Res = cuStreamSynchronize(Stream);
// Once the stream is synchronized, return it to stream pool and reset
@@ -1287,13 +1285,13 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Register a callback to indicate when the kernel is complete.
if (GenericDevice.getRPCServer())
- cuStreamAddCallback(
+ cuLaunchHostFunc(
Stream,
- [](CUstream Stream, CUresult Status, void *Data) {
+ [](void *Data) {
GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
Plugin.getRPCServer().Thread->finish();
},
- &GenericDevice.Plugin, /*flags=*/0);
+ &GenericDevice.Plugin);
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}
From 72bf3b0a79f607850f1f518da80e65355a99feb7 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Mon, 21 Oct 2024 08:38:39 -0500
Subject: [PATCH 4/4] Only create thread if used
---
offload/plugins-nextgen/common/include/RPC.h | 10 ++++++++-
.../common/src/PluginInterface.cpp | 3 +++
offload/plugins-nextgen/common/src/RPC.cpp | 21 ++++++++++++-------
3 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index d356c62b456f53..18114108824ee2 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -44,6 +44,9 @@ struct RPCServerTy {
/// Deinitialize the associated memory and resources.
llvm::Error shutDown();
+ /// Initialize the worker thread.
+ llvm::Error startThread();
+
/// Check if this device image is using an RPC server. This checks for the
/// presence of an externally visible symbol in the device image that will
/// be present whenever RPC code is called.
@@ -85,7 +88,9 @@ struct RPCServerTy {
llvm::ArrayRef<std::atomic<uintptr_t>> Handles;
/// Initialize the worker thread to run in the background.
- ServerThread(std::atomic<uintptr_t> Handles[], size_t Length);
+ ServerThread(std::atomic<uintptr_t> Handles[], size_t Length)
+ : Running(true), NumUsers(0), CV(), Mutex(), Handles(Handles, Length) {}
+
~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
/// Notify the worker thread that there is a user that needs it.
@@ -105,6 +110,9 @@ struct RPCServerTy {
/// Destroy the worker thread and wait.
void shutDown();
+ /// Initialize the worker thread.
+ void startThread();
+
/// Run the server thread to continuously check the RPC interface for work
/// to be done for the device.
void run();
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 2be0fc0a713da5..92fac44c515acb 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1051,6 +1051,9 @@ Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
return Err;
+ if (auto Err = Server.startThread())
+ return Err;
+
RPCServer = &Server;
DP("Running an RPC server on device %d\n", getDeviceId());
return Plugin::success();
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index dab112ead6c7e2..5674e9ca1b1556 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -21,6 +21,12 @@ using namespace llvm;
using namespace omp;
using namespace target;
+void RPCServerTy::ServerThread::startThread() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+ Worker = std::thread([this]() { run(); });
+#endif
+}
+
void RPCServerTy::ServerThread::shutDown() {
#ifdef LIBOMPTARGET_RPC_SUPPORT
{
@@ -60,19 +66,18 @@ void RPCServerTy::ServerThread::run() {
#endif
}
-RPCServerTy::ServerThread::ServerThread(std::atomic<uintptr_t> Handles[],
- size_t Length)
- : Running(true), NumUsers(0), CV(), Mutex(), Handles(Handles, Length) {
-#ifdef LIBOMPTARGET_RPC_SUPPORT
- Worker = std::thread([this]() { run(); });
-#endif
-}
-
RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
: Handles(
std::make_unique<std::atomic<uintptr_t>[]>(Plugin.getNumDevices())),
Thread(new ServerThread(Handles.get(), Plugin.getNumDevices())) {}
+llvm::Error RPCServerTy::startThread() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+ Thread->startThread();
+#endif
+ return Error::success();
+}
+
llvm::Error RPCServerTy::shutDown() {
#ifdef LIBOMPTARGET_RPC_SUPPORT
Thread->shutDown();