[Openmp-commits] [openmp] 422adaa - [OpenMP] Add thread limit environment variable support to plugins

via Openmp-commits openmp-commits at lists.llvm.org
Tue Jun 22 13:25:50 PDT 2021


Author: Joseph Huber
Date: 2021-06-22T16:25:40-04:00
New Revision: 422adaa879b2de5a682eaed1a4f7cf86e9ea12b4

URL: https://github.com/llvm/llvm-project/commit/422adaa879b2de5a682eaed1a4f7cf86e9ea12b4
DIFF: https://github.com/llvm/llvm-project/commit/422adaa879b2de5a682eaed1a4f7cf86e9ea12b4.diff

LOG: [OpenMP] Add thread limit environment variable support to plugins

The OpenMP 5.1 standard defines the environment variable
`OMP_TEAMS_THREAD_LIMIT` to limit the number of threads that will run in a
single team (i.e. a single thread block). This patch adds support for it to
the AMDGPU and CUDA plugins.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D103923

Added: 
    

Modified: 
    openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
    openmp/libomptarget/plugins/cuda/src/rtl.cpp

Removed: 
    


################################################################################
diff  --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
index 8a4e12cff549f..012bd1f9b7010 100644
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -405,6 +405,7 @@ class RTLDeviceInfoTy {
   // OpenMP Environment properties
   int EnvNumTeams;
   int EnvTeamLimit;
+  int EnvTeamThreadLimit;
   int EnvMaxTeamsDefault;
 
   // OpenMP Requires Flags
@@ -645,6 +646,13 @@ class RTLDeviceInfoTy {
     } else {
       EnvMaxTeamsDefault = -1;
     }
+    envStr = getenv("OMP_TEAMS_THREAD_LIMIT");
+    if (envStr) {
+      EnvTeamThreadLimit = std::stoi(envStr);
+      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
+    } else {
+      EnvTeamThreadLimit = -1;
+    }
 
     // Default state.
     RequiresFlags = OMP_REQ_UNDEFINED;
@@ -950,6 +958,14 @@ int32_t __tgt_rtl_init_device(int device_id) {
        DeviceInfo.GroupsPerDevice[device_id]);
   }
 
+  // Adjust threads to the env variables
+  if (DeviceInfo.EnvTeamThreadLimit > 0 &&
+      (enforce_upper_bound(&DeviceInfo.NumThreads[device_id],
+                           DeviceInfo.EnvTeamThreadLimit))) {
+    DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",
+       DeviceInfo.EnvTeamThreadLimit);
+  }
+
   // Set default number of threads
   DeviceInfo.NumThreads[device_id] = RTLDeviceInfoTy::Default_WG_Size;
   DP("Default number of threads set according to library's default %d\n",

diff  --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
index e8fe63736b939..7b04bc9180e5b 100644
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -281,6 +281,7 @@ class DeviceRTLTy {
   // OpenMP environment properties
   int EnvNumTeams;
   int EnvTeamLimit;
+  int EnvTeamThreadLimit;
   // OpenMP requires flags
   int64_t RequiresFlags;
 
@@ -436,7 +437,7 @@ class DeviceRTLTy {
 
   DeviceRTLTy()
       : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
-        RequiresFlags(OMP_REQ_UNDEFINED) {
+        EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED) {
 
     DP("Start initializing CUDA\n");
 
@@ -467,6 +468,11 @@ class DeviceRTLTy {
       EnvTeamLimit = std::stoi(EnvStr);
       DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
     }
+    if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
+      // OMP_TEAMS_THREAD_LIMIT has been set
+      EnvTeamThreadLimit = std::stoi(EnvStr);
+      DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
+    }
     if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
       // OMP_NUM_TEAMS has been set
       EnvNumTeams = std::stoi(EnvStr);
@@ -596,14 +602,23 @@ class DeviceRTLTy {
       DP("Error getting max block dimension, use default value %d\n",
          DeviceRTLTy::DefaultNumThreads);
       DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
-    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
+    } else {
       DP("Using %d CUDA threads per block\n", MaxBlockDimX);
       DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
-    } else {
-      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
-         "capping at the hard limit\n",
-         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
-      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
+
+      if (EnvTeamThreadLimit > 0 &&
+          DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
+        DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
+           "OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
+           DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
+        DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
+      }
+      if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
+        DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
+           "capping at the hard limit\n",
+           DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
+        DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
+      }
     }
 
     // Get and set warp size


        


More information about the Openmp-commits mailing list