[Openmp-commits] [openmp] 06adac8 - [Libomptarget] Configure the RPC port count from the plugin

Joseph Huber via Openmp-commits openmp-commits at lists.llvm.org
Fri Aug 11 10:54:54 PDT 2023


Author: Joseph Huber
Date: 2023-08-11T12:54:47-05:00
New Revision: 06adac8c4e26376fb54d87dede3ebce46783010a

URL: https://github.com/llvm/llvm-project/commit/06adac8c4e26376fb54d87dede3ebce46783010a
DIFF: https://github.com/llvm/llvm-project/commit/06adac8c4e26376fb54d87dede3ebce46783010a.diff

LOG: [Libomptarget] Configure the RPC port count from the plugin

This patch allows us to configure the RPC port count to match the
parallelism the specific card supports. For AMDGPU we must use the
maximum amount of hardware parallelism to avoid deadlocks. NVPTX does
not have this problem thanks to its friendlier scheduler, so there we
use the number of warps resident on an SM times the number of SMs as a
good guess.

Note that the maximum port count is currently smaller than these
numbers; that will be improved in the future.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D155903

Added: 
    

Modified: 
    openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
    openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
    openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp

Removed: 
    


################################################################################
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 8da4b1f15d79a7..9ca150de680be3 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1785,6 +1785,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       return Err;
     GridValues.GV_Default_Num_Teams = ComputeUnits * OMPX_DefaultTeamsPerCU;
 
+    uint32_t WavesPerCU = 0;
+    if (auto Err =
+            getDeviceAttr(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, WavesPerCU))
+      return Err;
+    HardwareParallelism = ComputeUnits * WavesPerCU;
+
     // Get maximum size of any device queues and maximum number of queues.
     uint32_t MaxQueueSize;
     if (auto Err = getDeviceAttr(HSA_AGENT_INFO_QUEUE_MAX_SIZE, MaxQueueSize))
@@ -1932,6 +1938,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return libomptargetSupportsRPC();
   }
 
+  /// AMDGPU returns the product of the number of compute units and the waves
+  /// per compute unit.
+  uint64_t requestedRPCPortCount() const override {
+    return HardwareParallelism;
+  }
+
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                   AMDGPUStreamTy *&Stream) {
@@ -2577,6 +2589,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// The frequency of the steady clock inside the device.
   uint64_t ClockFrequency;
 
+  /// The maximum number of wavefronts that can be resident on the GPU.
+  uint64_t HardwareParallelism = 0;
+
   /// Reference to the host device.
   AMDHostDeviceTy &HostDevice;
 };

diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 9cbef5a247f360..b89ab3dd5f91c8 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -782,6 +782,19 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Get the RPC server running on this device.
   RPCServerTy *getRPCServer() const { return RPCServer; }
 
+  /// The number of parallel RPC ports to use on the device. In general, this
+  /// should be roughly equivalent to the amount of hardware parallelism the
+  /// device can support. This is because GPUs in general do not have forward
+  /// progress guarantees, so we minimize thread level dependencies by
+  /// allocating enough space such that each device thread can have a port. This
+  /// is likely overly pessimistic in the average case, but guarantees no
+  /// deadlocks at the cost of memory. This must be overridden by targets
+  /// expecting to use the RPC server.
+  virtual uint64_t requestedRPCPortCount() const {
+    assert(!shouldSetupRPCServer() && "Default implementation cannot be used");
+    return 0;
+  }
+
 private:
   /// Register offload entry for global variable.
   Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
@@ -888,7 +901,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 #endif
 
 private:
-
   /// Return the kernel environment object for kernel \p Name.
   Expected<KernelEnvironmentTy>
   getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);

diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
index 5254829ea85b6f..72bba012fcf93c 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/RPC.cpp
@@ -59,8 +59,9 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
         *reinterpret_cast<plugin::GenericDeviceTy *>(Data);
     return Device.allocate(Size, nullptr, TARGET_ALLOC_HOST);
   };
-  // TODO: Allow the device to declare its requested port count.
-  if (rpc_status_t Err = rpc_server_init(DeviceId, RPC_MAXIMUM_PORT_COUNT,
+  uint64_t NumPorts =
+      std::min(Device.requestedRPCPortCount(), RPC_MAXIMUM_PORT_COUNT);
+  if (rpc_status_t Err = rpc_server_init(DeviceId, NumPorts,
                                          Device.getWarpSize(), Alloc, &Device))
     return plugin::Plugin::error(
         "Failed to initialize RPC server for device %d: %d", DeviceId, Err);

diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index e14cfc5deda10f..6b763f381d60a8 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -295,6 +295,19 @@ struct CUDADeviceTy : public GenericDeviceTy {
                                  ComputeCapability.Minor))
       return Err;
 
+    uint32_t NumMultiprocessors = 0;
+    uint32_t MaxThreadsPerSM = 0;
+    uint32_t WarpSize = 0;
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                                 NumMultiprocessors))
+      return Err;
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
+                                 MaxThreadsPerSM))
+      return Err;
+    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
+      return Err;
+    HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);
+
     return Plugin::success();
   }
 
@@ -366,6 +379,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return libomptargetSupportsRPC();
   }
 
+  /// NVIDIA returns the product of the SM count and the number of warps that
+  /// fit if the maximum number of threads were scheduled on each SM.
+  uint64_t requestedRPCPortCount() const override {
+    return HardwareParallelism;
+  }
+
   /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
     // Get the stream (if any) from the async info.
@@ -876,6 +895,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
       return "sm_" + std::to_string(Major * 10 + Minor);
     }
   } ComputeCapability;
+
+  /// The maximum number of warps that can be resident on all the SMs
+  /// simultaneously.
+  uint32_t HardwareParallelism = 0;
 };
 
 Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
