[Openmp-commits] [openmp] 691dc2d - [Libomptarget] Begin	implementing support for RPC services
    Joseph Huber via Openmp-commits 
    openmp-commits at lists.llvm.org
       
    Fri Jul  7 10:36:53 PDT 2023
    
    
  
Author: Joseph Huber
Date: 2023-07-07T12:36:46-05:00
New Revision: 691dc2d10d4bc5a8e6cea266f201693aee46b40d
URL: https://github.com/llvm/llvm-project/commit/691dc2d10d4bc5a8e6cea266f201693aee46b40d
DIFF: https://github.com/llvm/llvm-project/commit/691dc2d10d4bc5a8e6cea266f201693aee46b40d.diff
LOG: [Libomptarget] Begin implementing support for RPC services
This patch adds the intial support for running an RPC server in
libomptarget to handle host services. We interface with the library
provided by the `libc` project to stand up a basic server. We introduce
a new type that is controlled by the plugin and has each device
intialize its interface. We then run a basic server to check the RPC
buffer.
This patch does not fully implement the interface. In the future each
plugin will want to define special handlers via the interface to support
things like malloc or H2D copies coming from RPC. We will also want to
allow the plugin to specify t he number of ports. This is currently
capped in the implementation but will be adjusted soon.
Right now running the server is handled by whatever thread ends up doing
the waiting. This is probably not a completely sound solution but I am
not overly familiar with the behaviour of OpenMP tasks and what would be
required here. This works okay with synchrnous regions, and somewhat
fine with `nowait` regions, but I've observed some weird behavior when
one of those regions calls `exit`.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D154312
Added: 
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h
    openmp/libomptarget/test/libc/malloc.c
    openmp/libomptarget/test/libc/puts.c
Modified: 
    libc/CMakeLists.txt
    openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
    openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
    openmp/libomptarget/test/CMakeLists.txt
    openmp/libomptarget/test/lit.cfg
    openmp/libomptarget/test/lit.site.cfg.in
Removed: 
    
################################################################################
diff  --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 6acdb65ce550e1..4be92ba1380c9e 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -92,7 +92,7 @@ include(LLVMLibCArchitectures)
 if(LIBC_TARGET_ARCHITECTURE_IS_GPU)
   set(LIBC_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
   set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm)
-  set(LIBC_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/lib)
+  set(LIBC_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR})
 elseif(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND LIBC_ENABLE_USE_BY_CLANG)
   set(LIBC_INCLUDE_DIR ${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE})
   set(LIBC_INSTALL_INCLUDE_DIR ${CMAKE_INSTALL_INCLUDEDIR}/${LLVM_DEFAULT_TARGET_TRIPLE})
diff  --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 1fcbcf29f9e353..45c542e1e202dc 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -519,17 +519,25 @@ struct AMDGPUSignalTy {
   }
 
   /// Wait until the signal gets a zero value.
-  Error wait(const uint64_t ActiveTimeout = 0) const {
-    if (ActiveTimeout) {
+  Error wait(const uint64_t ActiveTimeout = 0,
+             RPCHandleTy *RPCHandle = nullptr) const {
+    if (ActiveTimeout && !RPCHandle) {
       hsa_signal_value_t Got = 1;
       Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                       ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
       if (Got == 0)
         return Plugin::success();
     }
+
+    // If there is an RPC device attached to this stream we run it as a server.
+    uint64_t Timeout = RPCHandle ? 8192 : UINT64_MAX;
+    auto WaitState = RPCHandle ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
     while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                     UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0)
-      ;
+                                     Timeout, WaitState) != 0) {
+      if (RPCHandle)
+        if (auto Err = RPCHandle->runServer())
+          return Err;
+    }
     return Plugin::success();
   }
 
@@ -895,6 +903,11 @@ struct AMDGPUStreamTy {
   /// operation that was already finalized in a previous stream sycnhronize.
   uint32_t SyncCycle;
 
+  /// A pointer associated with an RPC server running on the given device. If
+  /// RPC is not being used this will be a null pointer. Otherwise, this
+  /// indicates that an RPC server is expected to be run on this stream.
+  RPCHandleTy *RPCHandle;
+
   /// Mutex to protect stream's management.
   mutable std::mutex Mutex;
 
@@ -1050,6 +1063,9 @@ struct AMDGPUStreamTy {
   /// Deinitialize the stream's signals.
   Error deinit() { return Plugin::success(); }
 
+  /// Attach an RPC handle to this stream.
+  void setRPCHandle(RPCHandleTy *Handle) { RPCHandle = Handle; }
+
   /// Push a asynchronous kernel to the stream. The kernel arguments must be
   /// placed in a special allocation for kernel args and must keep alive until
   /// the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1264,7 +1280,8 @@ struct AMDGPUStreamTy {
       return Plugin::success();
 
     // Wait until all previous operations on the stream have completed.
-    if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds))
+    if (auto Err =
+            Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, RPCHandle))
       return Err;
 
     // Reset the stream and perform all pending post actions.
@@ -1786,6 +1803,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// AMDGPU devices do not have the concept of contexts.
   Error setContext() override { return Plugin::success(); }
 
+  /// We want to set up the RPC server for host services to the GPU if it is
+  /// availible.
+  bool shouldSetupRPCServer() const override {
+    return libomptargetSupportsRPC();
+  }
+
   /// Get the stream of the asynchronous info sructure or get a new one.
   AMDGPUStreamTy &getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) {
     AMDGPUStreamTy *&Stream = AsyncInfoWrapper.getQueueAs<AMDGPUStreamTy *>();
@@ -2507,7 +2530,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
     : Agent(Device.getAgent()), Queue(Device.getNextQueue()),
       SignalManager(Device.getSignalManager()),
       // Initialize the std::deque with some empty positions.
-      Slots(32), NextSlot(0), SyncCycle(0),
+      Slots(32), NextSlot(0), SyncCycle(0), RPCHandle(nullptr),
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
 
 /// Class implementing the AMDGPU-specific functionalities of the global
@@ -2837,6 +2860,10 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
   AMDGPUStreamTy &Stream = AMDGPUDevice.getStream(AsyncInfoWrapper);
 
+  // If this kernel requires an RPC server we attach its pointer to the stream.
+  if (GenericDevice.getRPCHandle())
+    Stream.setRPCHandle(GenericDevice.getRPCHandle());
+
   // Push the kernel launch into the stream.
   return Stream.pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
                                  GroupSize, ArgsMemoryManager);
diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
index 90d2113c4ea711..087125fd8b9379 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/CMakeLists.txt
@@ -13,7 +13,7 @@
 # NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we
 # don't want to export `PluginInterface` while `add_llvm_library` requires that.
 add_library(PluginInterface OBJECT
-  PluginInterface.cpp GlobalHandler.cpp JIT.cpp)
+  PluginInterface.cpp GlobalHandler.cpp JIT.cpp RPC.cpp)
 
 # Only enable JIT for those targets that LLVM can support.
 string(TOUPPER "${LLVM_TARGETS_TO_BUILD}" TargetsSupported)
@@ -62,6 +62,25 @@ target_link_libraries(PluginInterface
     MemoryManager
 )
 
+# Include the RPC server from the `libc` project if availible.
+set(libomptarget_supports_rpc FALSE)
+if(TARGET llvmlibc_rpc_server)
+  target_link_libraries(PluginInterface PRIVATE llvmlibc_rpc_server)
+  target_compile_definitions(PluginInterface PRIVATE LIBOMPTARGET_RPC_SUPPORT)
+  set(libomptarget_supports_rpc TRUE)
+else()
+  find_library(llvmlibc_rpc_server NAMES llvmlibc_rpc_server
+               PATHS ${LIBOMPTARGET_LLVM_LIBRARY_DIR} NO_DEFAULT_PATH)
+  if(llvmlibc_rpc_server)
+    message(WARNING ${llvmlibc_rpc_server})
+    target_link_libraries(PluginInterface PRIVATE llvmlibc_rpc_server)
+    target_compile_definitions(PluginInterface PRIVATE LIBOMPTARGET_RPC_SUPPORT)
+    set(libomptarget_supports_rpc TRUE)
+  endif()
+endif()
+set(LIBOMPTARGET_GPU_LIBC_SUPPORT ${libomptarget_supports_rpc} CACHE BOOL
+    "Libomptarget support for the GPU libc")
+
 if ((OMPT_TARGET_DEFAULT) AND (LIBOMPTARGET_OMPT_SUPPORT))
   target_link_libraries(PluginInterface PUBLIC OMPT)
 endif()
diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index 9eaaaf817d9f17..e0224955a7de94 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -396,7 +396,7 @@ GenericDeviceTy::GenericDeviceTy(int32_t DeviceId, int32_t NumDevices,
       OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 32),
       DeviceId(DeviceId), GridValues(OMPGridValues),
       PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
-      PinnedAllocs(*this) {}
+      PinnedAllocs(*this), RPCHandle(nullptr) {}
 
 Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
   if (auto Err = initImpl(Plugin))
@@ -453,6 +453,10 @@ Error GenericDeviceTy::deinit() {
   if (RecordReplay.isRecordingOrReplaying())
     RecordReplay.deinit();
 
+  if (RPCHandle)
+    if (auto Err = RPCHandle->deinitDevice())
+      return std::move(Err);
+
   return deinitImpl();
 }
 
@@ -493,6 +497,9 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   if (auto Err = registerOffloadEntries(*Image))
     return std::move(Err);
 
+  if (auto Err = setupRPCServer(Plugin, *Image))
+    return std::move(Err);
+
   // Return the pointer to the table of entries.
   return Image->getOffloadEntryTable();
 }
@@ -525,6 +532,33 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
   return Plugin::success();
 }
 
+Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
+                                      DeviceImageTy &Image) {
+  // The plugin either does not need an RPC server or it is unavailible.
+  if (!shouldSetupRPCServer())
+    return Plugin::success();
+
+  // Check if this device needs to run an RPC server.
+  RPCServerTy &Server = Plugin.getRPCServer();
+  auto UsingOrErr =
+      Server.isDeviceUsingRPC(*this, Plugin.getGlobalHandler(), Image);
+  if (!UsingOrErr)
+    return UsingOrErr.takeError();
+
+  if (!UsingOrErr.get())
+    return Plugin::success();
+
+  if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
+    return std::move(Err);
+
+  auto DeviceOrErr = Server.getDevice(*this);
+  if (!DeviceOrErr)
+    return DeviceOrErr.takeError();
+  RPCHandle = *DeviceOrErr;
+  DP("Running an RPC server on device %d\n", getDeviceId());
+  return Plugin::success();
+}
+
 Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) {
   const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin;
   const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd;
@@ -1088,6 +1122,9 @@ Error GenericPluginTy::init() {
   GlobalHandler = Plugin::createGlobalHandler();
   assert(GlobalHandler && "Invalid global handler");
 
+  RPCServer = new RPCServerTy(NumDevices);
+  assert(RPCServer && "Invalid RPC server");
+
   return Plugin::success();
 }
 
@@ -1105,6 +1142,9 @@ Error GenericPluginTy::deinit() {
     assert(!Devices[DeviceId] && "Device was not deinitialized");
   }
 
+  if (RPCServer)
+    delete RPCServer;
+
   // Perform last deinitializations on the plugin.
   return deinitImpl();
 }
@@ -1139,6 +1179,14 @@ Error GenericPluginTy::deinitDevice(int32_t DeviceId) {
   return Plugin::success();
 }
 
+const bool llvm::omp::target::plugin::libomptargetSupportsRPC() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  return true;
+#else
+  return false;
+#endif
+}
+
 /// Exposed library API function, basically wrappers around the GenericDeviceTy
 /// functionality with the same name. All non-async functions are redirected
 /// to the async versions right away with a NULL AsyncInfoPtr.
diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 8fe615b2f6f239..ab5ce8cff607f2 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -24,6 +24,7 @@
 #include "GlobalHandler.h"
 #include "JIT.h"
 #include "MemoryManager.h"
+#include "RPC.h"
 #include "Utilities.h"
 #include "omptarget.h"
 
@@ -600,6 +601,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// this behavior by overriding the shouldSetupDeviceEnvironment function.
   Error setupDeviceEnvironment(GenericPluginTy &Plugin, DeviceImageTy &Image);
 
+  // Setup the RPC server for this device if needed. This may not run on some
+  // plugins like the CPU targets. By default, it will not be executed so it is
+  // up to the target to override this using the shouldSetupRPCServer function.
+  Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
+
   /// Register the offload entries for a specific image on the device.
   Error registerOffloadEntries(DeviceImageTy &Image);
 
@@ -751,6 +757,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
     return OMPX_MinThreadsForLowTripCount;
   }
 
+  /// Get the RPC server running on this device.
+  RPCHandleTy *getRPCHandle() const { return RPCHandle; }
+
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -780,6 +789,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// setupDeviceEnvironment() function.
   virtual bool shouldSetupDeviceEnvironment() const { return true; }
 
+  /// Indicate whether or not the device should setup the RPC server. This is
+  /// only necessary for unhosted targets like the GPU.
+  virtual bool shouldSetupRPCServer() const { return false; }
+
   /// Pointer to the memory manager or nullptr if not available.
   MemoryManagerTy *MemoryManager;
 
@@ -837,6 +850,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   /// Map of host pinned allocations used for optimize device transfers.
   PinnedAllocationMapTy PinnedAllocs;
+
+  /// A pointer to an RPC server instance attached to this device if present.
+  /// This is used to run the RPC server during task synchronization.
+  RPCHandleTy *RPCHandle;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
@@ -892,6 +909,12 @@ struct GenericPluginTy {
   /// plugin.
   JITEngine &getJIT() { return JIT; }
 
+  /// Get a reference to the RPC server used to provide host services.
+  RPCServerTy &getRPCServer() {
+    assert(RPCServer && "RPC server not initialized");
+    return *RPCServer;
+  }
+
   /// Get the OpenMP requires flags set for this plugin.
   int64_t getRequiresFlags() const { return RequiresFlags; }
 
@@ -946,6 +969,9 @@ struct GenericPluginTy {
 
   /// The JIT engine shared by all devices connected to this plugin.
   JITEngine JIT;
+
+  /// The interface between the plugin and the GPU for host services.
+  RPCServerTy *RPCServer;
 };
 
 /// Class for simplifying the getter operation of the plugin. Anywhere on the
@@ -1209,6 +1235,9 @@ template <typename ResourceRef> class GenericDeviceResourceManagerTy {
   std::deque<ResourceRef> ResourcePool;
 };
 
+/// A static check on whether or not we support RPC in libomptarget.
+const bool libomptargetSupportsRPC();
+
 } // namespace plugin
 } // namespace target
 } // namespace omp
diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
new file mode 100644
index 00000000000000..41a37453bbb9d0
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
@@ -0,0 +1,173 @@
+//===- RPC.h - Interface for remote procedure calls from the GPU ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RPC.h"
+#include "Debug.h"
+#include "PluginInterface.h"
+
+// This header file may be present in-tree or from an LLVM installation. The
+// installed version lives alongside the GPU headers so we do not want to
+// include it directly.
+#if __has_include(<gpu-none-llvm/rpc_server.h>)
+#include <gpu-none-llvm/rpc_server.h>
+#elif defined(LIBOMPTARGET_RPC_SUPPORT)
+#include <rpc_server.h>
+#endif
+
+using namespace llvm;
+using namespace omp;
+using namespace target;
+
+RPCServerTy::RPCServerTy(uint32_t NumDevices) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  // If this fails then something is catastrophically wrong, just exit.
+  if (rpc_status_t Err = rpc_init(NumDevices))
+    FATAL_MESSAGE(1, "Error initializing the RPC server: %d\n", Err);
+  Handles.resize(NumDevices);
+#endif
+}
+
+llvm::Expected<bool>
+RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
+                              plugin::GenericGlobalHandlerTy &Handler,
+                              plugin::DeviceImageTy &Image) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  void *ClientPtr;
+  plugin::GlobalTy Global(rpc_client_symbol_name, sizeof(void *), &ClientPtr);
+  if (auto Err = Handler.readGlobalFromImage(Device, Image, Global)) {
+    llvm::consumeError(std::move(Err));
+    return false;
+  }
+
+  return true;
+#else
+  return false;
+#endif
+}
+
+Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
+                              plugin::GenericGlobalHandlerTy &Handler,
+                              plugin::DeviceImageTy &Image) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  uint32_t DeviceId = Device.getDeviceId();
+  auto Alloc = [](uint64_t Size, void *Data) {
+    plugin::GenericDeviceTy &Device =
+        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+    return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST);
+  };
+  // TODO: Allow the device to declare its requested port count.
+  if (rpc_status_t Err = rpc_server_init(DeviceId, RPC_MAXIMUM_PORT_COUNT,
+                                         Device.getWarpSize(), Alloc, &Device))
+    return plugin::Plugin::error(
+        "Failed to initialize RPC server for device %d: %d", DeviceId, Err);
+
+  // Register a custom opcode handler to perform plugin specific allocation.
+  // FIXME: We need to make sure this uses asynchronous allocations on CUDA.
+  auto MallocHandler = [](rpc_port_t Port, void *Data) {
+    rpc_recv_and_send(
+        Port,
+        [](rpc_buffer_t *Buffer, void *Data) {
+          plugin::GenericDeviceTy &Device =
+              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+          Buffer->data[0] = reinterpret_cast<uintptr_t>(
+              Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
+        },
+        Data);
+  };
+  if (rpc_status_t Err =
+          rpc_register_callback(DeviceId, RPC_MALLOC, MallocHandler, &Device))
+    return plugin::Plugin::error(
+        "Failed to register RPC malloc handler for device %d: %d\n", DeviceId,
+        Err);
+
+  // Register a custom opcode handler to perform plugin specific deallocation.
+  auto FreeHandler = [](rpc_port_t Port, void *Data) {
+    rpc_recv(
+        Port,
+        [](rpc_buffer_t *Buffer, void *Data) {
+          plugin::GenericDeviceTy &Device =
+              *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+          Device.free(reinterpret_cast<void *>(Buffer->data[0]),
+                      TARGET_ALLOC_DEVICE);
+        },
+        Data);
+  };
+  if (rpc_status_t Err =
+          rpc_register_callback(DeviceId, RPC_FREE, FreeHandler, &Device))
+    return plugin::Plugin::error(
+        "Failed to register RPC free handler for device %d: %d\n", DeviceId,
+        Err);
+
+  // Get the address of the RPC client from the device.
+  void *ClientPtr;
+  plugin::GlobalTy ClientGlobal(rpc_client_symbol_name, sizeof(void *));
+  if (auto Err =
+          Handler.getGlobalMetadataFromDevice(Device, Image, ClientGlobal))
+    return Err;
+
+  if (auto Err = Device.dataRetrieve(&ClientPtr, ClientGlobal.getPtr(),
+                                     sizeof(void *), nullptr))
+    return Err;
+
+  const void *ClientBuffer = rpc_get_client_buffer(DeviceId);
+  if (auto Err = Device.dataSubmit(ClientPtr, ClientBuffer,
+                                   rpc_get_client_size(), nullptr))
+    return Err;
+
+  Handles[DeviceId] = std::make_unique<RPCHandleTy>(*this, Device);
+#endif
+  return Error::success();
+}
+
+llvm::Expected<RPCHandleTy *>
+RPCServerTy::getDevice(plugin::GenericDeviceTy &Device) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  uint32_t DeviceId = Device.getDeviceId();
+  if (!Handles[DeviceId] || !rpc_get_buffer(DeviceId) ||
+      !rpc_get_client_buffer(DeviceId))
+    return plugin::Plugin::error(
+        "Attempt to get an RPC device while not initialized");
+
+  return Handles[DeviceId].get();
+#else
+  return plugin::Plugin::error(
+      "Attempt to get an RPC device while not available");
+#endif
+}
+
+Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  if (rpc_status_t Err = rpc_handle_server(Device.getDeviceId()))
+    return plugin::Plugin::error(
+        "Error while running RPC server on device %d: %d", Device.getDeviceId(),
+        Err);
+#endif
+  return Error::success();
+}
+
+Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  auto Dealloc = [](void *Ptr, void *Data) {
+    plugin::GenericDeviceTy &Device =
+        *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
+    Device.free(Ptr, TARGET_ALLOC_HOST);
+  };
+  if (rpc_status_t Err =
+          rpc_server_shutdown(Device.getDeviceId(), Dealloc, &Device))
+    return plugin::Plugin::error(
+        "Failed to shut down RPC server for device %d: %d",
+        Device.getDeviceId(), Err);
+#endif
+  return Error::success();
+}
+
+RPCServerTy::~RPCServerTy() {
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  rpc_shutdown();
+#endif
+}
diff  --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h
new file mode 100644
index 00000000000000..c072c01b87cb78
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.h
@@ -0,0 +1,87 @@
+//===- RPC.h - Interface for remote procedure calls from the GPU ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface to support remote procedure calls (RPC) from
+// the GPU. This is required to implement host services like printf or malloc.
+// The interface to the RPC server is provided by the 'libc' project in LLVM.
+// For more information visit https://libc.llvm.org/gpu/.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
+#define OPENMP_LIBOMPTARGET_PLUGINS_NEXTGEN_COMMON_RPC_H
+
+#include "llvm/Support/Error.h"
+
+#include <stdint.h>
+
+namespace llvm::omp::target {
+namespace plugin {
+struct GenericDeviceTy;
+struct GenericGlobalHandlerTy;
+class DeviceImageTy;
+} // namespace plugin
+
+/// A generic class implementing the interface between the RPC server provided
+/// by the 'libc' project and 'libomptarget'. If the RPC server is not availible
+/// these routines will perform no action.
+struct RPCServerTy {
+public:
+  /// A wrapper around a single instance of the RPC server for a given device.
+  /// This is provided to simplify ownership of the underlying device.
+  struct RPCHandleTy {
+    RPCHandleTy(RPCServerTy &Server, plugin::GenericDeviceTy &Device)
+        : Server(Server), Device(Device) {}
+
+    llvm::Error runServer() { return Server.runServer(Device); }
+
+    llvm::Error deinitDevice() { return Server.deinitDevice(Device); }
+
+  private:
+    RPCServerTy &Server;
+    plugin::GenericDeviceTy &Device;
+  };
+
+  RPCServerTy(uint32_t NumDevices);
+
+  /// Check if this device image is using an RPC server. This checks for the
+  /// precense of an externally visible symbol in the device image that will
+  /// be present whenever RPC code is called.
+  llvm::Expected<bool> isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
+                                        plugin::GenericGlobalHandlerTy &Handler,
+                                        plugin::DeviceImageTy &Image);
+
+  /// Initialize the RPC server for the given device. This will allocate host
+  /// memory for the internal server and copy the data to the client on the
+  /// device. The device must be loaded before this is valid.
+  llvm::Error initDevice(plugin::GenericDeviceTy &Device,
+                         plugin::GenericGlobalHandlerTy &Handler,
+                         plugin::DeviceImageTy &Image);
+
+  /// Gets a reference to this server for a specific device.
+  llvm::Expected<RPCHandleTy *> getDevice(plugin::GenericDeviceTy &Device);
+
+  /// Runs the RPC server associated with the \p Device until the pending work
+  /// is cleared.
+  llvm::Error runServer(plugin::GenericDeviceTy &Device);
+
+  /// Deinitialize the RPC server for the given device. This will free the
+  /// memory associated with the k
+  llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);
+
+  ~RPCServerTy();
+
+private:
+  llvm::SmallVector<std::unique_ptr<RPCHandleTy>> Handles;
+};
+
+using RPCHandleTy = RPCServerTy::RPCHandleTy;
+
+} // namespace llvm::omp::target
+
+#endif
diff  --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index c165b582f63d0b..fbd89da8a54dc0 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -366,6 +366,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
   }
 
+  /// We want to set up the RPC server for host services to the GPU if it is
+  /// availible.
+  bool shouldSetupRPCServer() const override {
+    return libomptargetSupportsRPC();
+  }
+
   /// Get the stream of the asynchronous info sructure or get a new one.
   CUstream getStream(AsyncInfoWrapperTy &AsyncInfoWrapper) {
     CUstream &Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
@@ -464,7 +470,18 @@ struct CUDADeviceTy : public GenericDeviceTy {
   /// Synchronize current thread with the pending operations on the async info.
   Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
-    CUresult Res = cuStreamSynchronize(Stream);
+    CUresult Res;
+    // If we have an RPC server running on this device we will continuously
+    // query it for work rather than blocking.
+    if (!getRPCHandle()) {
+      Res = cuStreamSynchronize(Stream);
+    } else {
+      do {
+        Res = cuStreamQuery(Stream);
+        if (auto Err = getRPCHandle()->runServer())
+          return Err;
+      } while (Res == CUDA_ERROR_NOT_READY);
+    }
 
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
diff  --git a/openmp/libomptarget/test/CMakeLists.txt b/openmp/libomptarget/test/CMakeLists.txt
index 943dbd0d0acf21..8908d2bb30adbd 100644
--- a/openmp/libomptarget/test/CMakeLists.txt
+++ b/openmp/libomptarget/test/CMakeLists.txt
@@ -19,6 +19,7 @@ string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}")
 string(REGEX MATCHALL "([^\ ]+\ |[^\ ]+$)" SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}")
 foreach(CURRENT_TARGET IN LISTS SYSTEM_TARGETS)
   string(STRIP "${CURRENT_TARGET}" CURRENT_TARGET)
+
   add_openmp_testsuite(check-libomptarget-${CURRENT_TARGET}
     "Running libomptarget tests"
     ${CMAKE_CURRENT_BINARY_DIR}/${CURRENT_TARGET}
diff  --git a/openmp/libomptarget/test/libc/malloc.c b/openmp/libomptarget/test/libc/malloc.c
new file mode 100644
index 00000000000000..65c2d21f2067a3
--- /dev/null
+++ b/openmp/libomptarget/test/libc/malloc.c
@@ -0,0 +1,33 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+
+// TODO: This requires async malloc on CUDA which is an 11.2 feature.
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#pragma omp declare target to(malloc)
+#pragma omp declare target to(free)
+
+int main() {
+  unsigned h_x;
+  unsigned *d_x;
+#pragma omp target map(from : d_x)
+  {
+    d_x = malloc(sizeof(unsigned));
+    *d_x = 1;
+  }
+
+#pragma omp target is_device_ptr(d_x) map(from : h_x)
+  { h_x = *d_x; }
+
+#pragma omp target is_device_ptr(d_x)
+  { free(d_x); }
+
+  // CHECK: PASS
+  if (h_x == 1)
+    fputs("PASS\n", stdout);
+}
diff  --git a/openmp/libomptarget/test/libc/puts.c b/openmp/libomptarget/test/libc/puts.c
new file mode 100644
index 00000000000000..18d87ed1b36ae6
--- /dev/null
+++ b/openmp/libomptarget/test/libc/puts.c
@@ -0,0 +1,35 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+
+#include <stdio.h>
+
+#pragma omp declare target to(stdout)
+
+int main() {
+// CHECK: PASS
+#pragma omp target
+  { fputs("PASS\n", stdout); }
+
+// CHECK: PASS
+#pragma omp target nowait
+  { fputs("PASS\n", stdout); }
+
+// CHECK: PASS
+#pragma omp target nowait
+  { fputs("PASS\n", stdout); }
+
+#pragma omp taskwait
+
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+// CHECK: PASS
+#pragma omp target teams num_teams(4)
+#pragma omp parallel num_threads(2)
+  { fputs("PASS\n", stdout); }
+}
diff  --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index 1fc01e22315baa..4dabca41c0bcaf 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -96,6 +96,9 @@ if 'flang' in config.llvm_enabled_projects:
   config.available_features.add('flang')
   tools.append(ToolSubst('%flang', command=FindTool('flang-new'), unresolved='fatal'))
 
+if config.libomptarget_has_libc:
+  config.available_features.add('libc')
+
 # Determine whether the test system supports unified memory.
 # For CUDA, this is the case with compute capability 70 (Volta) or higher.
 # For all other targets, we currently assume it is.
@@ -213,10 +216,12 @@ for libomptarget_target in config.libomptarget_all_targets:
             "%libomptarget-run-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compilexx-" + \
             libomptarget_target, \
-            "%clangxx-" + libomptarget_target + " %s -o %t"))
+            "%clangxx-" + libomptarget_target + " %s -o %t" \
+            " -lcgpu" if config.libomptarget_has_libc else ""))
         config.substitutions.append(("%libomptarget-compile-" + \
             libomptarget_target, \
-            "%clang-" + libomptarget_target + " %s -o %t"))
+            "%clang-" + libomptarget_target + " %s -o %t"
+            " -lcgpu" if config.libomptarget_has_libc else ""))
         config.substitutions.append(("%libomptarget-compileoptxx-run-and-check-" + \
             libomptarget_target, \
             "%libomptarget-compileoptxx-and-run-" + libomptarget_target + \
@@ -235,10 +240,12 @@ for libomptarget_target in config.libomptarget_all_targets:
             "%libomptarget-run-" + libomptarget_target))
         config.substitutions.append(("%libomptarget-compileoptxx-" + \
             libomptarget_target, \
-            "%clangxx-" + libomptarget_target + " -O3 %s -o %t"))
+            "%clangxx-" + libomptarget_target + " -O3 %s -o %t"
+            " -lcgpu" if config.libomptarget_has_libc else ""))
         config.substitutions.append(("%libomptarget-compileopt-" + \
             libomptarget_target, \
-            "%clang-" + libomptarget_target + " -O3 %s -o %t"))
+            "%clang-" + libomptarget_target + " -O3 %s -o %t"
+            " -lcgpu" if config.libomptarget_has_libc else ""))
         config.substitutions.append(("%libomptarget-run-" + \
             libomptarget_target, \
             "%t"))
diff  --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in
index f4a63bebd2ccbf..47bd8b6450bc41 100644
--- a/openmp/libomptarget/test/lit.site.cfg.in
+++ b/openmp/libomptarget/test/lit.site.cfg.in
@@ -22,6 +22,7 @@ config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
 config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
 config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
 config.llvm_enabled_projects = "@LLVM_ENABLE_PROJECTS@".split(";")
+config.libomptarget_has_libc = "@LIBOMPTARGET_GPU_LIBC_SUPPORT@"
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
        
    
    
More information about the Openmp-commits
mailing list