[Openmp-commits] [openmp] [OpenMP] Support 'omp_get_num_procs' on the device (PR #65501)

Wed Sep 6 09:59:37 PDT 2023

https://github.com/jhuber6 created https://github.com/llvm/llvm-project/pull/65501:

Summary:
The `omp_get_num_procs()` function should return the amount of
parallelism availible. On the GPU, this was not defined. We have elected
to define this function as the maximum amount of wavefronts / warps that
can be simultaneously resident on the device. For AMDGPU this is the
number of CUs multiplied byth CU's per wave. For NVPTX this is the
maximum threads per SM divided by the warp size and multiplied by the
number of SMs.


>From ef60a1d7ad3220326181bbe49f486d5d9198ee51 Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6 at vols.utk.edu>
Date: Wed, 6 Sep 2023 11:43:33 -0500
Subject: [PATCH] [OpenMP] Support 'omp_get_num_procs' on the device

Summary:
The `omp_get_num_procs()` function should return the amount of
parallelism availible. On the GPU, this was not defined. We have elected
to define this function as the maximum amount of wavefronts / warps that
can be simultaneously resident on the device. For AMDGPU this is the
number of CUs multiplied byth CU's per wave. For NVPTX this is the
maximum threads per SM divided by the warp size and multiplied by the
number of SMs.
---
 .../DeviceRTL/include/Configuration.h            |  3 +++
 .../libomptarget/DeviceRTL/src/Configuration.cpp |  4 ++++
 openmp/libomptarget/DeviceRTL/src/Mapping.cpp    |  4 +++-
 openmp/libomptarget/include/Environment.h        |  1 +
 .../plugins-nextgen/amdgpu/src/rtl.cpp           | 13 +++++++++----
 .../common/PluginInterface/PluginInterface.cpp   |  1 +
 .../common/PluginInterface/PluginInterface.h     |  5 +++++
 .../plugins-nextgen/cuda/src/rtl.cpp             | 16 +++++++++++-----
 openmp/libomptarget/test/api/omp_get_num_procs.c | 15 +++++++++++++++
 9 files changed, 52 insertions(+), 10 deletions(-)
 create mode 100644 openmp/libomptarget/test/api/omp_get_num_procs.c

diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h
index 508e2a55bd8e216..4a68a2f1d46bf6f 100644
--- a/openmp/libomptarget/DeviceRTL/include/Configuration.h
+++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h
@@ -46,6 +46,9 @@ void *getIndirectCallTablePtr();
 /// Returns the size of the indirect call table.
 uint64_t getIndirectCallTableSize();
 
+/// Returns the size of the indirect call table.
+uint64_t getHardwareParallelism();
+
 /// Return if debugging is enabled for the given debug kind.
 bool isDebugMode(DebugKind Level);
 
diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
index da1e252fc076934..5deee9c53926e77 100644
--- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp
@@ -55,6 +55,10 @@ void *config::getIndirectCallTablePtr() {
       __omp_rtl_device_environment.IndirectCallTable);
 }
 
+uint64_t config::getHardwareParallelism() {
+  return __omp_rtl_device_environment.HardwareParallelism;
+}
+
 uint64_t config::getIndirectCallTableSize() {
   return __omp_rtl_device_environment.IndirectCallTableSize;
 }
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
index 7c92ec17bf56a15..c75a694fce35b6d 100644
--- a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -333,7 +333,9 @@ uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
   return NumberOfBlocks;
 }
 
-uint32_t mapping::getNumberOfProcessorElements() { __builtin_trap(); }
+uint32_t mapping::getNumberOfProcessorElements() {
+  return static_cast<uint32_t>(config::getHardwareParallelism());
+}
 
 ///}
 
diff --git a/openmp/libomptarget/include/Environment.h b/openmp/libomptarget/include/Environment.h
index 2d291c4505a1fe3..6606d7838cafa72 100644
--- a/openmp/libomptarget/include/Environment.h
+++ b/openmp/libomptarget/include/Environment.h
@@ -33,6 +33,7 @@ struct DeviceEnvironmentTy {
   uint64_t ClockFrequency;
   uintptr_t IndirectCallTable;
   uint64_t IndirectCallTableSize;
+  uint64_t HardwareParallelism;
 };
 
 // NOTE: Please don't change the order of those members as their indices are
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 494fc66292e5bff..c49f9bd06d6313a 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1942,16 +1942,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// AMDGPU devices do not have the concept of contexts.
   Error setContext() override { return Plugin::success(); }
 
+  /// AMDGPU returns the product of the number of compute units and the waves
+  /// per compute unit.
+  uint64_t getHardwareParallelism() const override {
+    return HardwareParallelism;
+  }
+
   /// We want to set up the RPC server for host services to the GPU if it is
   /// availible.
   bool shouldSetupRPCServer() const override {
     return libomptargetSupportsRPC();
   }
 
-  /// AMDGPU returns the product of the number of compute units and the waves
-  /// per compute unit.
-  uint64_t requestedRPCPortCount() const override  {
-    return HardwareParallelism;
+  /// The RPC interface should have enough space for all availible parallelism.
+  uint64_t requestedRPCPortCount() const override {
+    return getHardwareParallelism();
   }
 
   /// Get the stream of the asynchronous info sructure or get a new one.
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
index e22258803e1bc01..c976c0bc59ed9b0 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp
@@ -692,6 +692,7 @@ Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
   DeviceEnvironment.IndirectCallTable =
       reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
   DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
+  DeviceEnvironment.HardwareParallelism = getHardwareParallelism();
 
   // Create the metainfo of the device environment global.
   GlobalTy DevEnvGlobal("__omp_rtl_device_environment",
diff --git a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
index 736e864d79f4e83..57bf3575ca45c11 100644
--- a/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h
@@ -781,6 +781,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
     return OMPX_MinThreadsForLowTripCount;
   }
 
+  /// Get the total amount of hardware parallelism supported by the target
+  /// device. This is the total amount of warps or wavefronts that can be
+  /// resident on the device simultaneously.
+  virtual uint64_t getHardwareParallelism() const { return 0; }
+
   /// Get the RPC server running on this device.
   RPCServerTy *getRPCServer() const { return RPCServer; }
 
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index 6b763f381d60a82..44b8d349033c0ff 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -301,8 +301,9 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                  NumMuliprocessors))
       return Err;
-    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
-                                 MaxThreadsPerSM))
+    if (auto Err =
+            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
+                          MaxThreadsPerSM))
       return Err;
     if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
       return Err;
@@ -373,16 +374,21 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
   }
 
+  /// NVIDIA returns the product of the SM count and the number of warps that
+  /// fit if the maximum number of threads were scheduled on each SM.
+  uint64_t getHardwareParallelism() const override {
+    return HardwareParallelism;
+  }
+
   /// We want to set up the RPC server for host services to the GPU if it is
   /// availible.
   bool shouldSetupRPCServer() const override {
     return libomptargetSupportsRPC();
   }
 
-  /// NVIDIA returns the product of the SM count and the number of warps that
-  /// fit if the maximum number of threads were scheduled on each SM.
+  /// The RPC interface should have enough space for all availible parallelism.
   uint64_t requestedRPCPortCount() const override {
-    return HardwareParallelism;
+    return getHardwareParallelism();
   }
 
   /// Get the stream of the asynchronous info sructure or get a new one.
diff --git a/openmp/libomptarget/test/api/omp_get_num_procs.c b/openmp/libomptarget/test/api/omp_get_num_procs.c
new file mode 100644
index 000000000000000..f58274b5e15e78a
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_get_num_procs.c
@@ -0,0 +1,15 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+#include <stdio.h>
+
+int omp_get_num_procs();
+
+int main() {
+  int num_procs;
+#pragma omp target map(from : num_procs)
+  { num_procs = omp_get_num_procs(); }
+
+  // CHECK: PASS
+  if (num_procs > 0)
+    printf("PASS\n");
+}