[Openmp-commits] [openmp] Add num_threads clause list format and strict modifier support (PR #85466)

Terry Wilmarth via Openmp-commits openmp-commits at lists.llvm.org
Tue Mar 19 08:16:06 PDT 2024


https://github.com/TerryLWilmarth updated https://github.com/llvm/llvm-project/pull/85466

>From dad0577dc49e8077b924b33a84ed0312c40ba431 Mon Sep 17 00:00:00 2001
From: Terry Wilmarth <terry.l.wilmarth at intel.com>
Date: Fri, 15 Mar 2024 15:51:49 -0500
Subject: [PATCH 1/2] Add num_threads clause list format and strict modifier
 support

Add support to the runtime for 6.0 spec features that allow
num_threads clause to take a list, and also make use of the strict
modifier.  Provides new compiler interface functions for these
features.
---
 openmp/runtime/src/kmp.h            |  46 +++++++--
 openmp/runtime/src/kmp_csupport.cpp |  44 +++++++++
 openmp/runtime/src/kmp_runtime.cpp  | 142 +++++++++++++++++++++++++---
 3 files changed, 212 insertions(+), 20 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 121e7e959129ea..7397244a16f606 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -532,6 +532,19 @@ enum clock_function_type {
 enum mic_type { non_mic, mic1, mic2, mic3, dummy };
 #endif
 
+// OpenMP 3.1 - Nested num threads array
+typedef struct kmp_nested_nthreads_t {
+  int *nth;
+  int size;
+  int used;
+  bool strict; // num_threads clause has strict modifier
+  ident_t *loc; // loc for strict modifier
+  int sev; // error severity for strict modifier
+  const char *msg; // error message for strict modifier
+} kmp_nested_nthreads_t;
+
+extern kmp_nested_nthreads_t __kmp_nested_nth;
+
 /* -- fast reduction stuff ------------------------------------------------ */
 
 #undef KMP_FAST_REDUCTION_BARRIER
@@ -2958,6 +2971,12 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   /* The data set by the primary thread at reinit, then R/W by the worker */
   KMP_ALIGN_CACHE int
       th_set_nproc; /* if > 0, then only use this request for the next fork */
+  int *th_set_nested_nth;
+  bool th_nt_strict; // num_threads clause has strict modifier
+  ident_t *th_nt_loc; // loc for strict modifier
+  int th_nt_sev; // error severity for strict modifier
+  const char *th_nt_msg; // error message for strict modifier
+  int th_set_nested_nth_sz;
 #if KMP_NESTED_HOT_TEAMS
   kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */
 #endif
@@ -3202,6 +3221,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   void *t_stack_id; // team specific stack stitching id (for ittnotify)
 #endif /* USE_ITT_BUILD */
   distributedBarrier *b; // Distributed barrier data associated with team
+  kmp_nested_nthreads_t *t_nested_nth;
 } kmp_base_team_t;
 
 union KMP_ALIGN_CACHE kmp_team {
@@ -3532,15 +3552,6 @@ extern enum mic_type __kmp_mic_type;
 extern double __kmp_load_balance_interval; // load balance algorithm interval
 #endif /* USE_LOAD_BALANCE */
 
-// OpenMP 3.1 - Nested num threads array
-typedef struct kmp_nested_nthreads_t {
-  int *nth;
-  int size;
-  int used;
-} kmp_nested_nthreads_t;
-
-extern kmp_nested_nthreads_t __kmp_nested_nth;
-
 #if KMP_USE_ADAPTIVE_LOCKS
 
 // Parameters for the speculative lock backoff system.
@@ -3775,6 +3786,11 @@ extern void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL);
   ___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR)
 
 extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads);
+extern void __kmp_push_num_threads_list(ident_t *loc, int gtid,
+                                        kmp_uint32 list_length,
+                                        int *num_threads_list);
+extern void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
+                                         const char *msg);
 
 extern void __kmp_push_proc_bind(ident_t *loc, int gtid,
                                  kmp_proc_bind_t proc_bind);
@@ -4403,6 +4419,18 @@ KMP_EXPORT kmp_int32 __kmpc_in_parallel(ident_t *loc);
 KMP_EXPORT void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid);
 KMP_EXPORT void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
                                         kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_push_num_threads_strict(ident_t *loc,
+                                               kmp_int32 global_tid,
+                                               kmp_int32 num_threads,
+                                               int severity,
+                                               const char *message);
+
+KMP_EXPORT void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
+                                             kmp_uint32 list_length,
+                                             kmp_int32 *num_threads_list);
+KMP_EXPORT void __kmpc_push_num_threads_list_strict(
+    ident_t *loc, kmp_int32 global_tid, kmp_uint32 list_length,
+    kmp_int32 *num_threads_list, int severity, const char *message);
 
 KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
                                       int proc_bind);
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 878e78b5c7ad2d..0895ff6288f55c 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -236,6 +236,50 @@ void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
   __kmp_push_num_threads(loc, global_tid, num_threads);
 }
 
+void __kmpc_push_num_threads_strict(ident_t *loc, kmp_int32 global_tid,
+                                    kmp_int32 num_threads, int severity,
+                                    const char *message) {
+  __kmp_push_num_threads(loc, global_tid, num_threads);
+  __kmp_set_strict_num_threads(loc, global_tid, severity, message);
+}
+
+/*!
+ at ingroup PARALLEL
+ at param loc source location information
+ at param global_tid global thread number
+ at param list_length number of entries in the num_threads_list array
+ at param num_threads_list array of numbers of threads requested for this parallel
+construct and subsequent nested parallel constructs
+
+Set the number of threads to be used by the next fork spawned by this thread,
+and some nested forks as well.
+This call is only required if the parallel construct has a `num_threads` clause
+that has a list of integers as the argument.
+*/
+void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
+                                  kmp_uint32 list_length,
+                                  kmp_int32 *num_threads_list) {
+  KA_TRACE(20, ("__kmpc_push_num_threads_list: enter T#%d num_threads_list=",
+                global_tid));
+  KA_TRACE(20, ("%d", num_threads_list[0]));
+#ifdef KMP_DEBUG
+  for (kmp_uint32 i = 1; i < list_length; ++i)
+    KA_TRACE(20, (", %d", num_threads_list[i]));
+#endif
+  KA_TRACE(20, ("/n"));
+
+  __kmp_assert_valid_gtid(global_tid);
+  __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
+}
+
+void __kmpc_push_num_threads_list_strict(ident_t *loc, kmp_int32 global_tid,
+                                         kmp_uint32 list_length,
+                                         kmp_int32 *num_threads_list,
+                                         int severity, const char *message) {
+  __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
+  __kmp_set_strict_num_threads(loc, global_tid, severity, message);
+}
+
 void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
   KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
   /* the num_threads are automatically popped */
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 7edb0b440acc7f..11dcb7e8c1c015 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -113,6 +113,25 @@ void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                                int new_nthreads);
 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
 
+static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
+                                                        int level) {
+  kmp_nested_nthreads_t *new_nested_nth =
+      (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
+          sizeof(kmp_nested_nthreads_t));
+  int new_size = level + thr->th.th_set_nested_nth_sz;
+  new_nested_nth->nth = (int *)KMP_INTERNAL_MALLOC(new_size * sizeof(int));
+  for (int i = 0; i < level + 1; ++i)
+    new_nested_nth->nth[i] = 0;
+  for (int i = level + 1, j = 1; i < new_size; ++i, ++j)
+    new_nested_nth->nth[i] = thr->th.th_set_nested_nth[j];
+  new_nested_nth->size = new_nested_nth->used = new_size;
+  new_nested_nth->strict = thr->th.th_nt_strict;
+  new_nested_nth->loc = thr->th.th_nt_loc;
+  new_nested_nth->sev = thr->th.th_nt_sev;
+  new_nested_nth->msg = thr->th.th_nt_msg;
+  return new_nested_nth;
+}
+
 /* Calculate the identifier of the current thread */
 /* fast (and somewhat portable) way to get unique identifier of executing
    thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
@@ -930,6 +949,12 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                   __kmp_get_gtid(), new_nthreads, set_nthreads));
   }
 #endif // KMP_DEBUG
+
+  if ((this_thr->th.th_nt_strict || parent_team->t.t_nested_nth->strict) &&
+      new_nthreads < set_nthreads) {
+    __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
+                 this_thr->th.th_nt_msg);
+  }
   return new_nthreads;
 }
 
@@ -1242,6 +1267,10 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     serial_team->t.t_serialized = 1;
     serial_team->t.t_nproc = 1;
     serial_team->t.t_parent = this_thr->th.th_team;
+    if (this_thr->th.th_team->t.t_nested_nth)
+      serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
+    else
+      serial_team->t.t_nested_nth = &__kmp_nested_nth;
     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
     this_thr->th.th_team = serial_team;
     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
@@ -1261,9 +1290,11 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
 
     // Thread value exists in the nested nthreads array for the next nested
     // level
-    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
-      this_thr->th.th_current_task->td_icvs.nproc =
-          __kmp_nested_nth.nth[level + 1];
+    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
+    if (this_thr->th.th_team->t.t_nested_nth)
+      nested_nth = this_thr->th.th_team->t.t_nested_nth;
+    if (nested_nth->used && (level + 1 < nested_nth->used)) {
+      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
     }
 
     if (__kmp_nested_proc_bind.used &&
@@ -1312,10 +1343,14 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     int level = this_thr->th.th_team->t.t_level;
     // Thread value exists in the nested nthreads array for the next nested
     // level
-    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
-      this_thr->th.th_current_task->td_icvs.nproc =
-          __kmp_nested_nth.nth[level + 1];
+
+    kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
+    if (serial_team->t.t_nested_nth)
+      nested_nth = serial_team->t.t_nested_nth;
+    if (nested_nth->used && (level + 1 < nested_nth->used)) {
+      this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
     }
+
     serial_team->t.t_level++;
     KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                   "of serial team %p to %d\n",
@@ -2074,9 +2109,18 @@ int __kmp_fork_call(ident_t *loc, int gtid,
 
     // See if we need to make a copy of the ICVs.
     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
-    if ((level + 1 < __kmp_nested_nth.used) &&
-        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
-      nthreads_icv = __kmp_nested_nth.nth[level + 1];
+    kmp_nested_nthreads_t *nested_nth = NULL;
+    if (!master_th->th.th_set_nested_nth &&
+        (level + 1 < parent_team->t.t_nested_nth->used) &&
+        (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
+      nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
+    } else if (master_th->th.th_set_nested_nth) {
+      nested_nth = __kmp_override_nested_nth(master_th, level);
+      if ((level + 1 < nested_nth->used) &&
+          (nested_nth->nth[level + 1] != nthreads_icv))
+        nthreads_icv = nested_nth->nth[level + 1];
+      else
+        nthreads_icv = 0; // don't update
     } else {
       nthreads_icv = 0; // don't update
     }
@@ -2185,6 +2229,24 @@ int __kmp_fork_call(ident_t *loc, int gtid,
     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
     KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
 
+    // Check if hot team has potentially outdated list, and if so, free it
+    if (team->t.t_nested_nth &&
+        team->t.t_nested_nth != parent_team->t.t_nested_nth) {
+      KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
+      KMP_INTERNAL_FREE(team->t.t_nested_nth);
+      team->t.t_nested_nth = NULL;
+    }
+    team->t.t_nested_nth = parent_team->t.t_nested_nth;
+    if (master_th->th.th_set_nested_nth) {
+      if (!nested_nth)
+        nested_nth = __kmp_override_nested_nth(master_th, level);
+      team->t.t_nested_nth = nested_nth;
+      KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
+      master_th->th.th_set_nested_nth = NULL;
+      master_th->th.th_set_nested_nth_sz = 0;
+      master_th->th.th_nt_strict = false;
+    }
+
     // Update the floating point rounding in the team if required.
     propagateFPControl(team);
 #if OMPD_SUPPORT
@@ -3390,6 +3452,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
   root_team->t.t_serialized = 1;
   // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
   root_team->t.t_sched.sched = r_sched.sched;
+  root_team->t.t_nested_nth = &__kmp_nested_nth;
   KA_TRACE(
       20,
       ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
@@ -3427,6 +3490,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
   // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
   hot_team->t.t_sched.sched = r_sched.sched;
   hot_team->t.t_size_changed = 0;
+  hot_team->t.t_nested_nth = &__kmp_nested_nth;
 }
 
 #ifdef KMP_DEBUG
@@ -4293,6 +4357,7 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
   else // no tasking --> always safe to reap
     this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
   this_thr->th.th_set_proc_bind = proc_bind_default;
+
 #if KMP_AFFINITY_SUPPORTED
   this_thr->th.th_new_place = this_thr->th.th_current_place;
 #endif
@@ -4556,6 +4621,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
   /* allocate space for it. */
   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
 
+  new_thr->th.th_nt_strict = false;
+  new_thr->th.th_nt_loc = NULL;
+  new_thr->th.th_nt_sev = severity_fatal;
+  new_thr->th.th_nt_msg = NULL;
+
   TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
 
 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
@@ -4666,6 +4736,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
   new_thr->th.th_active_in_pool = FALSE;
   TCW_4(new_thr->th.th_active, TRUE);
 
+  new_thr->th.th_set_nested_nth = NULL;
+  new_thr->th.th_set_nested_nth_sz = 0;
+
   /* adjust the global counters */
   __kmp_all_nth++;
   __kmp_nth++;
@@ -5456,7 +5529,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
       }
     } // Check changes in number of threads
 
-    kmp_info_t *master = team->t.t_threads[0];
     if (master->th.th_teams_microtask) {
       for (f = 1; f < new_nproc; ++f) {
         // propagate teams construct specific info to workers
@@ -5562,6 +5634,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
       __ompt_team_assign_id(team, ompt_parallel_data);
 #endif
 
+      team->t.t_nested_nth = NULL;
+
       KMP_MB();
 
       return team;
@@ -5633,6 +5707,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
 
   KMP_MB();
 
+  team->t.t_nested_nth = NULL;
+
   KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
                 team->t.t_id));
 
@@ -5735,6 +5811,14 @@ void __kmp_free_team(kmp_root_t *root,
       }
     }
 
+    // Before clearing parent pointer, check if nested_nth list should be freed
+    if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
+        team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
+      KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
+      KMP_INTERNAL_FREE(team->t.t_nested_nth);
+    }
+    team->t.t_nested_nth = NULL;
+
     // Reset pointer to parent team only for non-hot teams.
     team->t.t_parent = NULL;
     team->t.t_level = 0;
@@ -7837,7 +7921,6 @@ int __kmp_invoke_teams_master(int gtid) {
    encountered by this team. since this should be enclosed in the forkjoin
    critical section it should avoid race conditions with asymmetrical nested
    parallelism */
-
 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
   kmp_info_t *thr = __kmp_threads[gtid];
 
@@ -7845,6 +7928,39 @@ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
     thr->th.th_set_nproc = num_threads;
 }
 
+void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
+                                 int *num_threads_list) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+
+  KMP_DEBUG_ASSERT(list_length > 1);
+
+  if (num_threads_list[0] > 0)
+    thr->th.th_set_nproc = num_threads_list[0];
+  thr->th.th_set_nested_nth =
+      (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
+  for (kmp_uint32 i = 0; i < list_length; ++i)
+    thr->th.th_set_nested_nth[i] = num_threads_list[i];
+  thr->th.th_set_nested_nth_sz = list_length;
+}
+
+void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
+                                  const char *msg) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+  thr->th.th_nt_strict = true;
+  thr->th.th_nt_loc = loc;
+  // if sev is unset make fatal
+  if (sev == severity_warning)
+    thr->th.th_nt_sev = sev;
+  else
+    thr->th.th_nt_sev = severity_fatal;
+  // if msg is unset, use an appropriate message
+  if (msg)
+    thr->th.th_nt_msg = msg;
+  else
+    thr->th.th_nt_msg = "Cannot form team with number of threads specified by "
+                        "strict num_threads clause.";
+}
+
 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
                                     int num_threads) {
   KMP_DEBUG_ASSERT(thr);
@@ -8301,6 +8417,10 @@ void __kmp_cleanup(void) {
   __kmp_nested_nth.nth = NULL;
   __kmp_nested_nth.size = 0;
   __kmp_nested_nth.used = 0;
+  __kmp_nested_nth.strict = false;
+  __kmp_nested_nth.loc = NULL;
+  __kmp_nested_nth.sev = 0;
+  __kmp_nested_nth.msg = NULL;
   KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
   __kmp_nested_proc_bind.bind_types = NULL;
   __kmp_nested_proc_bind.size = 0;

>From 47155e3a2b5ff3fab61309f0377b6ddef5e97819 Mon Sep 17 00:00:00 2001
From: Terry Wilmarth <terry.l.wilmarth at intel.com>
Date: Tue, 19 Mar 2024 10:15:02 -0500
Subject: [PATCH 2/2] Add dllexports.

---
 openmp/runtime/src/dllexports | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 0d49643709e0a0..747b8280931568 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -1268,6 +1268,11 @@ kmp_set_disp_num_buffers                    890
     __kmpc_atomic_val_8_cas_cpt            2158
     %endif
 
+    # No longer need to put ordinal numbers
+    __kmpc_push_num_threads_list
+    __kmpc_push_num_threads_strict
+    __kmpc_push_num_threads_list_strict
+
 %endif
 
 __kmpc_set_thread_limit



More information about the Openmp-commits mailing list