[Openmp-commits] [openmp] [OpenMP] Add num_threads clause list format and strict modifier support (PR #85466)
Terry Wilmarth via Openmp-commits
openmp-commits at lists.llvm.org
Fri Mar 29 19:21:13 PDT 2024
https://github.com/TerryLWilmarth updated https://github.com/llvm/llvm-project/pull/85466
>From dad0577dc49e8077b924b33a84ed0312c40ba431 Mon Sep 17 00:00:00 2001
From: Terry Wilmarth <terry.l.wilmarth at intel.com>
Date: Fri, 15 Mar 2024 15:51:49 -0500
Subject: [PATCH 1/4] Add num_threads clause list format and strict modifier
support
Add runtime support for the OpenMP 6.0 features that allow the
num_threads clause to take a list of values and to carry the strict
modifier. Provide new compiler interface functions for these
features.
---
openmp/runtime/src/kmp.h | 46 +++++++--
openmp/runtime/src/kmp_csupport.cpp | 44 +++++++++
openmp/runtime/src/kmp_runtime.cpp | 142 +++++++++++++++++++++++++---
3 files changed, 212 insertions(+), 20 deletions(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 121e7e959129ea..7397244a16f606 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -532,6 +532,19 @@ enum clock_function_type {
enum mic_type { non_mic, mic1, mic2, mic3, dummy };
#endif
+// OpenMP 3.1 - Nested num threads array
+typedef struct kmp_nested_nthreads_t {
+ int *nth; // thread counts, indexed by nesting level
+ int size; // presumably the allocated capacity of nth -- TODO confirm
+ int used; // number of valid entries in nth; readers only index below this
+ bool strict; // num_threads clause has strict modifier
+ ident_t *loc; // loc for strict modifier
+ int sev; // error severity for strict modifier
+ const char *msg; // error message for strict modifier
+} kmp_nested_nthreads_t;
+
+extern kmp_nested_nthreads_t __kmp_nested_nth;
+
/* -- fast reduction stuff ------------------------------------------------ */
#undef KMP_FAST_REDUCTION_BARRIER
@@ -2958,6 +2971,12 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
/* The data set by the primary thread at reinit, then R/W by the worker */
KMP_ALIGN_CACHE int
th_set_nproc; /* if > 0, then only use this request for the next fork */
+ int *th_set_nested_nth;
+ bool th_nt_strict; // num_threads clause has strict modifier
+ ident_t *th_nt_loc; // loc for strict modifier
+ int th_nt_sev; // error severity for strict modifier
+ const char *th_nt_msg; // error message for strict modifier
+ int th_set_nested_nth_sz;
#if KMP_NESTED_HOT_TEAMS
kmp_hot_team_ptr_t *th_hot_teams; /* array of hot teams */
#endif
@@ -3202,6 +3221,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
void *t_stack_id; // team specific stack stitching id (for ittnotify)
#endif /* USE_ITT_BUILD */
distributedBarrier *b; // Distributed barrier data associated with team
+ kmp_nested_nthreads_t *t_nested_nth;
} kmp_base_team_t;
union KMP_ALIGN_CACHE kmp_team {
@@ -3532,15 +3552,6 @@ extern enum mic_type __kmp_mic_type;
extern double __kmp_load_balance_interval; // load balance algorithm interval
#endif /* USE_LOAD_BALANCE */
-// OpenMP 3.1 - Nested num threads array
-typedef struct kmp_nested_nthreads_t {
- int *nth;
- int size;
- int used;
-} kmp_nested_nthreads_t;
-
-extern kmp_nested_nthreads_t __kmp_nested_nth;
-
#if KMP_USE_ADAPTIVE_LOCKS
// Parameters for the speculative lock backoff system.
@@ -3775,6 +3786,11 @@ extern void ___kmp_thread_free(kmp_info_t *th, void *ptr KMP_SRC_LOC_DECL);
___kmp_thread_free((th), (ptr)KMP_SRC_LOC_CURR)
extern void __kmp_push_num_threads(ident_t *loc, int gtid, int num_threads);
+extern void __kmp_push_num_threads_list(ident_t *loc, int gtid,
+ kmp_uint32 list_length,
+ int *num_threads_list);
+extern void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
+ const char *msg);
extern void __kmp_push_proc_bind(ident_t *loc, int gtid,
kmp_proc_bind_t proc_bind);
@@ -4403,6 +4419,18 @@ KMP_EXPORT kmp_int32 __kmpc_in_parallel(ident_t *loc);
KMP_EXPORT void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid);
KMP_EXPORT void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
kmp_int32 num_threads);
+KMP_EXPORT void __kmpc_push_num_threads_strict(ident_t *loc,
+ kmp_int32 global_tid,
+ kmp_int32 num_threads,
+ int severity,
+ const char *message);
+
+KMP_EXPORT void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
+ kmp_uint32 list_length,
+ kmp_int32 *num_threads_list);
+KMP_EXPORT void __kmpc_push_num_threads_list_strict(
+ ident_t *loc, kmp_int32 global_tid, kmp_uint32 list_length,
+ kmp_int32 *num_threads_list, int severity, const char *message);
KMP_EXPORT void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid,
int proc_bind);
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 878e78b5c7ad2d..0895ff6288f55c 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -236,6 +236,50 @@ void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid,
__kmp_push_num_threads(loc, global_tid, num_threads);
}
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param num_threads number of threads requested for this parallel construct
+@param severity severity forwarded to __kmp_set_strict_num_threads
+@param message message forwarded to __kmp_set_strict_num_threads (may be NULL)
+
+Set the number of threads to be used by the next fork spawned by this thread
+and mark that request as strict.
+*/
+void __kmpc_push_num_threads_strict(ident_t *loc, kmp_int32 global_tid,
+                                    kmp_int32 num_threads, int severity,
+                                    const char *message) {
+  // Validate the gtid first, as __kmpc_push_num_threads_list does: both
+  // helpers below use it to index __kmp_threads[].
+  __kmp_assert_valid_gtid(global_tid);
+  __kmp_push_num_threads(loc, global_tid, num_threads);
+  __kmp_set_strict_num_threads(loc, global_tid, severity, message);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param list_length number of entries in the num_threads_list array
+@param num_threads_list array of numbers of threads requested for this parallel
+construct and subsequent nested parallel constructs
+
+Set the number of threads to be used by the next fork spawned by this thread,
+and some nested forks as well.
+This call is only required if the parallel construct has a `num_threads` clause
+that has a list of integers as the argument.
+*/
+void __kmpc_push_num_threads_list(ident_t *loc, kmp_int32 global_tid,
+                                  kmp_uint32 list_length,
+                                  kmp_int32 *num_threads_list) {
+  KA_TRACE(20, ("__kmpc_push_num_threads_list: enter T#%d num_threads_list=",
+                global_tid));
+  KA_TRACE(20, ("%d", num_threads_list[0]));
+#ifdef KMP_DEBUG
+  for (kmp_uint32 i = 1; i < list_length; ++i)
+    KA_TRACE(20, (", %d", num_threads_list[i]));
+#endif
+  KA_TRACE(20, ("\n")); // newline escape; the literal "/n" was printed before
+
+  __kmp_assert_valid_gtid(global_tid);
+  __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
+}
+
+/*!
+@ingroup PARALLEL
+@param loc source location information
+@param global_tid global thread number
+@param list_length number of entries in the num_threads_list array
+@param num_threads_list array of numbers of threads requested for this parallel
+construct and subsequent nested parallel constructs
+@param severity severity forwarded to __kmp_set_strict_num_threads
+@param message message forwarded to __kmp_set_strict_num_threads (may be NULL)
+
+Push a list of thread counts for the next fork spawned by this thread and mark
+that request as strict.
+*/
+void __kmpc_push_num_threads_list_strict(ident_t *loc, kmp_int32 global_tid,
+                                         kmp_uint32 list_length,
+                                         kmp_int32 *num_threads_list,
+                                         int severity, const char *message) {
+  // Validate the gtid first, as __kmpc_push_num_threads_list does: both
+  // helpers below use it to index __kmp_threads[].
+  __kmp_assert_valid_gtid(global_tid);
+  __kmp_push_num_threads_list(loc, global_tid, list_length, num_threads_list);
+  __kmp_set_strict_num_threads(loc, global_tid, severity, message);
+}
+
void __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid) {
KA_TRACE(20, ("__kmpc_pop_num_threads: enter\n"));
/* the num_threads are automatically popped */
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index 7edb0b440acc7f..11dcb7e8c1c015 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -113,6 +113,25 @@ void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);
+// Build a freshly allocated nested-nthreads descriptor from the list pushed on
+// thr. Entries 0..level are zeroed; entries after that are copied from the
+// pushed list starting at its element 1. The strict settings recorded on thr
+// are copied into the descriptor. The caller receives the new allocation.
+static kmp_nested_nthreads_t *__kmp_override_nested_nth(kmp_info_t *thr,
+                                                        int level) {
+  kmp_nested_nthreads_t *result = (kmp_nested_nthreads_t *)KMP_INTERNAL_MALLOC(
+      sizeof(kmp_nested_nthreads_t));
+  const int total = level + thr->th.th_set_nested_nth_sz;
+  int *counts = (int *)KMP_INTERNAL_MALLOC(total * sizeof(int));
+  result->nth = counts;
+
+  // Levels up to and including the current one carry no override.
+  int dst = 0;
+  while (dst < level + 1) {
+    counts[dst] = 0;
+    ++dst;
+  }
+
+  // Deeper levels take the remaining list elements in order.
+  const int *pushed = thr->th.th_set_nested_nth;
+  int src = 1;
+  for (dst = level + 1; dst < total; ++dst) {
+    counts[dst] = pushed[src];
+    ++src;
+  }
+
+  result->used = total;
+  result->size = total;
+  result->strict = thr->th.th_nt_strict;
+  result->loc = thr->th.th_nt_loc;
+  result->sev = thr->th.th_nt_sev;
+  result->msg = thr->th.th_nt_msg;
+  return result;
+}
+
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
@@ -930,6 +949,12 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
__kmp_get_gtid(), new_nthreads, set_nthreads));
}
#endif // KMP_DEBUG
+
+ if ((this_thr->th.th_nt_strict || parent_team->t.t_nested_nth->strict) &&
+ new_nthreads < set_nthreads) {
+ __kmpc_error(this_thr->th.th_nt_loc, this_thr->th.th_nt_sev,
+ this_thr->th.th_nt_msg);
+ }
return new_nthreads;
}
@@ -1242,6 +1267,10 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
serial_team->t.t_serialized = 1;
serial_team->t.t_nproc = 1;
serial_team->t.t_parent = this_thr->th.th_team;
+ if (this_thr->th.th_team->t.t_nested_nth)
+ serial_team->t.t_nested_nth = this_thr->th.th_team->t.t_nested_nth;
+ else
+ serial_team->t.t_nested_nth = &__kmp_nested_nth;
serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
this_thr->th.th_team = serial_team;
serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
@@ -1261,9 +1290,11 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
// Thread value exists in the nested nthreads array for the next nested
// level
- if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
- this_thr->th.th_current_task->td_icvs.nproc =
- __kmp_nested_nth.nth[level + 1];
+ kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
+ if (this_thr->th.th_team->t.t_nested_nth)
+ nested_nth = this_thr->th.th_team->t.t_nested_nth;
+ if (nested_nth->used && (level + 1 < nested_nth->used)) {
+ this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
}
if (__kmp_nested_proc_bind.used &&
@@ -1312,10 +1343,14 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
int level = this_thr->th.th_team->t.t_level;
// Thread value exists in the nested nthreads array for the next nested
// level
- if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
- this_thr->th.th_current_task->td_icvs.nproc =
- __kmp_nested_nth.nth[level + 1];
+
+ kmp_nested_nthreads_t *nested_nth = &__kmp_nested_nth;
+ if (serial_team->t.t_nested_nth)
+ nested_nth = serial_team->t.t_nested_nth;
+ if (nested_nth->used && (level + 1 < nested_nth->used)) {
+ this_thr->th.th_current_task->td_icvs.nproc = nested_nth->nth[level + 1];
}
+
serial_team->t.t_level++;
KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
"of serial team %p to %d\n",
@@ -2074,9 +2109,18 @@ int __kmp_fork_call(ident_t *loc, int gtid,
// See if we need to make a copy of the ICVs.
int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
- if ((level + 1 < __kmp_nested_nth.used) &&
- (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
- nthreads_icv = __kmp_nested_nth.nth[level + 1];
+ kmp_nested_nthreads_t *nested_nth = NULL;
+ if (!master_th->th.th_set_nested_nth &&
+ (level + 1 < parent_team->t.t_nested_nth->used) &&
+ (parent_team->t.t_nested_nth->nth[level + 1] != nthreads_icv)) {
+ nthreads_icv = parent_team->t.t_nested_nth->nth[level + 1];
+ } else if (master_th->th.th_set_nested_nth) {
+ nested_nth = __kmp_override_nested_nth(master_th, level);
+ if ((level + 1 < nested_nth->used) &&
+ (nested_nth->nth[level + 1] != nthreads_icv))
+ nthreads_icv = nested_nth->nth[level + 1];
+ else
+ nthreads_icv = 0; // don't update
} else {
nthreads_icv = 0; // don't update
}
@@ -2185,6 +2229,24 @@ int __kmp_fork_call(ident_t *loc, int gtid,
KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
+ // Check if hot team has potentially outdated list, and if so, free it
+ if (team->t.t_nested_nth &&
+ team->t.t_nested_nth != parent_team->t.t_nested_nth) {
+ KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
+ KMP_INTERNAL_FREE(team->t.t_nested_nth);
+ team->t.t_nested_nth = NULL;
+ }
+ team->t.t_nested_nth = parent_team->t.t_nested_nth;
+ if (master_th->th.th_set_nested_nth) {
+ if (!nested_nth)
+ nested_nth = __kmp_override_nested_nth(master_th, level);
+ team->t.t_nested_nth = nested_nth;
+ KMP_INTERNAL_FREE(master_th->th.th_set_nested_nth);
+ master_th->th.th_set_nested_nth = NULL;
+ master_th->th.th_set_nested_nth_sz = 0;
+ master_th->th.th_nt_strict = false;
+ }
+
// Update the floating point rounding in the team if required.
propagateFPControl(team);
#if OMPD_SUPPORT
@@ -3390,6 +3452,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
root_team->t.t_serialized = 1;
// TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
root_team->t.t_sched.sched = r_sched.sched;
+ root_team->t.t_nested_nth = &__kmp_nested_nth;
KA_TRACE(
20,
("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
@@ -3427,6 +3490,7 @@ static void __kmp_initialize_root(kmp_root_t *root) {
// TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
hot_team->t.t_sched.sched = r_sched.sched;
hot_team->t.t_size_changed = 0;
+ hot_team->t.t_nested_nth = &__kmp_nested_nth;
}
#ifdef KMP_DEBUG
@@ -4293,6 +4357,7 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
else // no tasking --> always safe to reap
this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
this_thr->th.th_set_proc_bind = proc_bind_default;
+
#if KMP_AFFINITY_SUPPORTED
this_thr->th.th_new_place = this_thr->th.th_current_place;
#endif
@@ -4556,6 +4621,11 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
/* allocate space for it. */
new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
+ new_thr->th.th_nt_strict = false;
+ new_thr->th.th_nt_loc = NULL;
+ new_thr->th.th_nt_sev = severity_fatal;
+ new_thr->th.th_nt_msg = NULL;
+
TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
@@ -4666,6 +4736,9 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
new_thr->th.th_active_in_pool = FALSE;
TCW_4(new_thr->th.th_active, TRUE);
+ new_thr->th.th_set_nested_nth = NULL;
+ new_thr->th.th_set_nested_nth_sz = 0;
+
/* adjust the global counters */
__kmp_all_nth++;
__kmp_nth++;
@@ -5456,7 +5529,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
}
} // Check changes in number of threads
- kmp_info_t *master = team->t.t_threads[0];
if (master->th.th_teams_microtask) {
for (f = 1; f < new_nproc; ++f) {
// propagate teams construct specific info to workers
@@ -5562,6 +5634,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
__ompt_team_assign_id(team, ompt_parallel_data);
#endif
+ team->t.t_nested_nth = NULL;
+
KMP_MB();
return team;
@@ -5633,6 +5707,8 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
KMP_MB();
+ team->t.t_nested_nth = NULL;
+
KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
team->t.t_id));
@@ -5735,6 +5811,14 @@ void __kmp_free_team(kmp_root_t *root,
}
}
+ // Before clearing parent pointer, check if nested_nth list should be freed
+ if (team->t.t_nested_nth && team->t.t_nested_nth != &__kmp_nested_nth &&
+ team->t.t_nested_nth != team->t.t_parent->t.t_nested_nth) {
+ KMP_INTERNAL_FREE(team->t.t_nested_nth->nth);
+ KMP_INTERNAL_FREE(team->t.t_nested_nth);
+ }
+ team->t.t_nested_nth = NULL;
+
// Reset pointer to parent team only for non-hot teams.
team->t.t_parent = NULL;
team->t.t_level = 0;
@@ -7837,7 +7921,6 @@ int __kmp_invoke_teams_master(int gtid) {
encountered by this team. since this should be enclosed in the forkjoin
critical section it should avoid race conditions with asymmetrical nested
parallelism */
-
void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
kmp_info_t *thr = __kmp_threads[gtid];
@@ -7845,6 +7928,39 @@ void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
thr->th.th_set_nproc = num_threads;
}
+// Record a num_threads list on thread gtid for its next fork.
+// id is unused here. The list is copied, so the caller keeps ownership of
+// num_threads_list. A positive first element also becomes the thread's
+// th_set_nproc request.
+void __kmp_push_num_threads_list(ident_t *id, int gtid, kmp_uint32 list_length,
+                                 int *num_threads_list) {
+  kmp_info_t *thr = __kmp_threads[gtid];
+
+  KMP_DEBUG_ASSERT(list_length > 1);
+
+  // A list pushed earlier but never consumed by a fork would otherwise be
+  // overwritten below and leaked.
+  if (thr->th.th_set_nested_nth) {
+    KMP_INTERNAL_FREE(thr->th.th_set_nested_nth);
+    thr->th.th_set_nested_nth = NULL;
+    thr->th.th_set_nested_nth_sz = 0;
+  }
+
+  if (num_threads_list[0] > 0)
+    thr->th.th_set_nproc = num_threads_list[0];
+  thr->th.th_set_nested_nth =
+      (int *)KMP_INTERNAL_MALLOC(list_length * sizeof(int));
+  for (kmp_uint32 i = 0; i < list_length; ++i)
+    thr->th.th_set_nested_nth[i] = num_threads_list[i];
+  thr->th.th_set_nested_nth_sz = (int)list_length;
+}
+
+// Flag the pending num_threads request of thread gtid as strict and store the
+// location, severity and message that go with it.
+void __kmp_set_strict_num_threads(ident_t *loc, int gtid, int sev,
+                                  const char *msg) {
+  kmp_info_t *self = __kmp_threads[gtid];
+
+  self->th.th_nt_strict = true;
+  self->th.th_nt_loc = loc;
+
+  // Only an explicit warning severity is kept; anything else becomes fatal.
+  self->th.th_nt_sev = (sev == severity_warning) ? sev : severity_fatal;
+
+  // Fall back to the runtime's own text when no message was supplied.
+  if (!msg)
+    msg = "Cannot form team with number of threads specified by "
+          "strict num_threads clause.";
+  self->th.th_nt_msg = msg;
+}
+
static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams,
int num_threads) {
KMP_DEBUG_ASSERT(thr);
@@ -8301,6 +8417,10 @@ void __kmp_cleanup(void) {
__kmp_nested_nth.nth = NULL;
__kmp_nested_nth.size = 0;
__kmp_nested_nth.used = 0;
+ __kmp_nested_nth.strict = false;
+ __kmp_nested_nth.loc = NULL;
+ __kmp_nested_nth.sev = 0;
+ __kmp_nested_nth.msg = NULL;
KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
__kmp_nested_proc_bind.bind_types = NULL;
__kmp_nested_proc_bind.size = 0;
>From 47155e3a2b5ff3fab61309f0377b6ddef5e97819 Mon Sep 17 00:00:00 2001
From: Terry Wilmarth <terry.l.wilmarth at intel.com>
Date: Tue, 19 Mar 2024 10:15:02 -0500
Subject: [PATCH 2/4] Add dllexports.
---
openmp/runtime/src/dllexports | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index 0d49643709e0a0..747b8280931568 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -1268,6 +1268,11 @@ kmp_set_disp_num_buffers 890
__kmpc_atomic_val_8_cas_cpt 2158
%endif
+ # No longer need to put ordinal numbers
+ __kmpc_push_num_threads_list
+ __kmpc_push_num_threads_strict
+ __kmpc_push_num_threads_list_strict
+
%endif
__kmpc_set_thread_limit
>From e34c994d1b16d369dde5b210f44bd16416137929 Mon Sep 17 00:00:00 2001
From: Terry Wilmarth <terry.l.wilmarth at intel.com>
Date: Fri, 29 Mar 2024 21:04:35 -0500
Subject: [PATCH 3/4] Add tests.
---
.../parallel/omp_parallel_num_threads_list.c | 209 ++++++++++++++++++
.../omp_parallel_num_threads_strict.c | 99 +++++++++
2 files changed, 308 insertions(+)
create mode 100644 openmp/runtime/test/parallel/omp_parallel_num_threads_list.c
create mode 100644 openmp/runtime/test/parallel/omp_parallel_num_threads_strict.c
diff --git a/openmp/runtime/test/parallel/omp_parallel_num_threads_list.c b/openmp/runtime/test/parallel/omp_parallel_num_threads_list.c
new file mode 100644
index 00000000000000..1c1771c255b317
--- /dev/null
+++ b/openmp/runtime/test/parallel/omp_parallel_num_threads_list.c
@@ -0,0 +1,209 @@
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2,2,2,2,2 %libomp-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+// When compiler supports num_threads clause list format, remove the following
+// and use num_threads clause directly
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __kmpc_global_thread_num(void *loc);
+void __kmpc_push_num_threads_list(void *loc, int gtid, unsigned length, int *list);
+
+#if defined(__cplusplus)
+}
+#endif
+
+int test_omp_parallel_num_threads_list()
+{
+ int num_failed = 0; // must start at 0: every reduction(+) below folds into it
+
+ // Initially, 5 levels specified via OMP_NUM_THREADS with 2 threads per level
+ // Check top 3 levels
+#pragma omp parallel reduction(+:num_failed) // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+#pragma omp parallel reduction(+:num_failed) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+ } // end 1st level parallel
+
+ // Make sure that basic single element num_threads clause works
+#pragma omp parallel reduction(+:num_failed) num_threads(4) // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 4);
+#pragma omp parallel reduction(+:num_failed) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2); // Unaffected
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2); // Unaffected
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+ } // end 1st level parallel
+
+ // Check that basic single element num_threads clause works on second level
+#pragma omp parallel reduction(+:num_failed) // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2); // Unaffected
+#pragma omp parallel reduction(+:num_failed) num_threads(4) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 4);
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2); // Unaffected
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+ } // end 1st level parallel
+
+ // Try a short list. It should completely overwrite the old settings.
+ // We need to use the compiler interface for now.
+ int threads[2] = {3,3};
+ __kmpc_push_num_threads_list(NULL, __kmpc_global_thread_num(NULL), 2, threads);
+#pragma omp parallel reduction(+:num_failed) //num_threads(3,3) // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+#pragma omp parallel reduction(+:num_failed) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+ // NOTE: should just keep using last element in list, to nesting depth
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+ } // end 1st level parallel
+
+ // Similar, but at a lower level.
+#pragma omp parallel reduction(+:num_failed) // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2); // Unaffected
+ int threads[2] = {3,3};
+ __kmpc_push_num_threads_list(NULL, __kmpc_global_thread_num(NULL), 2, threads);
+#pragma omp parallel reduction(+:num_failed) // num_threads(3,3) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+ // NOTE: just keep using last element in list, to nesting depth
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+ // Make sure a second inner parallel is NOT affected by the clause
+#pragma omp parallel reduction(+:num_failed) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2); // Unaffected
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+#pragma omp single
+ // NOTE: the list pushed for the sibling region must not apply here
+ num_failed = num_failed + !(omp_get_num_threads() == 2); // Unaffected
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+ } // end 1st level parallel
+
+ // Test lists at multiple levels
+ int threads2[2] = {4,3};
+ __kmpc_push_num_threads_list(NULL, __kmpc_global_thread_num(NULL), 2, threads2);
+#pragma omp parallel reduction(+:num_failed) // num_threads(4,3) // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 4);
+#pragma omp parallel reduction(+:num_failed) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+ int threads3[2] = {2,5};
+ __kmpc_push_num_threads_list(NULL, __kmpc_global_thread_num(NULL), 2, threads3);
+#pragma omp parallel reduction(+:num_failed) //num_threads(2,5) // 4th level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+#pragma omp parallel reduction(+:num_failed) // 5th level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 5);
+#pragma omp parallel reduction(+:num_failed) // 6th level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 5);
+ } // end 6th level parallel
+ } // end 5th level parallel
+ } // end 4th level parallel
+#pragma omp parallel reduction(+:num_failed) // 4th level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+ } // end 4th level parallel
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+#pragma omp parallel reduction(+:num_failed) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 3);
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+ } // end 1st level parallel
+
+ // Now we should be back to the way we started.
+#pragma omp parallel reduction(+:num_failed) // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+#pragma omp parallel reduction(+:num_failed) // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+#pragma omp parallel reduction(+:num_failed) // 3rd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+ } // end 3rd level parallel
+ } // end 2nd level parallel
+ } // end 1st level parallel
+
+ return (!num_failed); // 1 when every check passed, 0 otherwise
+}
+
+// Run the list test REPETITIONS times; the exit status is the number of
+// runs that reported failure.
+int main()
+{
+  int failed_runs = 0;
+  int run;
+
+  for (run = 0; run < REPETITIONS; ++run)
+    failed_runs += !test_omp_parallel_num_threads_list();
+
+  return failed_runs;
+}
diff --git a/openmp/runtime/test/parallel/omp_parallel_num_threads_strict.c b/openmp/runtime/test/parallel/omp_parallel_num_threads_strict.c
new file mode 100644
index 00000000000000..358ac915c1713f
--- /dev/null
+++ b/openmp/runtime/test/parallel/omp_parallel_num_threads_strict.c
@@ -0,0 +1,99 @@
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2,2,2,2,2 OMP_THREAD_LIMIT=16 %libomp-run
+#include <stdio.h>
+#include "omp_testsuite.h"
+
+// When compiler supports num_threads clause list format and strict modifier,
+// remove the following and use num_threads clause directly
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int __kmpc_global_thread_num(void *loc);
+void __kmpc_push_num_threads_list(void *loc, int gtid, unsigned length, int *list);
+void __kmpc_push_num_threads_strict(void *loc, int gtid, int nth, int sev, const char *msg);
+void __kmpc_push_num_threads_list_strict(void *loc, int gtid, unsigned length, int *list, int sev, const char *msg);
+
+#if defined(__cplusplus)
+}
+#endif
+
+int test_omp_parallel_num_threads_strict()
+{
+ int num_failed = 0; // must start at 0: it is accumulated into and returned
+
+ // Test regular runtime warning about exceeding thread limit.
+ // Tolerate whatever value was given.
+#pragma omp parallel num_threads(22)
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() <= 22);
+
+ // Test with 4 threads and strict -- no problem, no warning.
+ __kmpc_push_num_threads_strict(NULL, __kmpc_global_thread_num(NULL), 4,
+ 1, "This warning shouldn't happen.");
+#pragma omp parallel //num_threads(strict:4)
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 4);
+
+ // Exceed limit, specify user warning message. Tolerate whatever was given.
+ __kmpc_push_num_threads_strict(NULL, __kmpc_global_thread_num(NULL), 20,
+ 1, "User-supplied warning for strict.");
+#pragma omp parallel //num_threads(strict:20) severity(warning) \
+ message("User-supplied warning for strict.")
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() <= 20);
+
+ // Exceed limit, no user message, use runtime default message for strict.
+ // Tolerate whatever value was given.
+ __kmpc_push_num_threads_strict(NULL, __kmpc_global_thread_num(NULL), 21,
+ 1, NULL);
+#pragma omp parallel //num_threads(strict:21)
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() <= 21);
+
+
+ // Exceed limit in nested level. Should see user warning message.
+ int threads3[2] = {2,24};
+ __kmpc_push_num_threads_list_strict(NULL, __kmpc_global_thread_num(NULL), 2,
+ threads3, 1,
+ "User-supplied warning on strict list.");
+#pragma omp parallel //num_threads(strict:2,24) severity(warning) \
+ message("User-supplied warning on strict. list") // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+#pragma omp parallel // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() <= 24);
+ }
+ }
+
+ // No strict limit in nested level. Regular runtime limiting applies.
+ __kmpc_push_num_threads_list(NULL, __kmpc_global_thread_num(NULL), 2,
+ threads3);
+#pragma omp parallel //num_threads(2,24) // 1st level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() == 2);
+#pragma omp parallel // 2nd level
+ {
+#pragma omp single
+ num_failed = num_failed + !(omp_get_num_threads() <= 24);
+ }
+ }
+
+ return (!num_failed); // 1 when every check passed, 0 otherwise
+}
+
+// Run the strict test REPETITIONS times; the exit status is the number of
+// runs that reported failure.
+int main()
+{
+  int failed_runs = 0;
+  int run;
+
+  for (run = 0; run < REPETITIONS; ++run)
+    failed_runs += !test_omp_parallel_num_threads_strict();
+
+  return failed_runs;
+}
>From 177f9eb335bcc5ffe8ac381381d0c3703d9548c2 Mon Sep 17 00:00:00 2001
From: Terry Wilmarth <terry.l.wilmarth at intel.com>
Date: Fri, 29 Mar 2024 21:18:18 -0500
Subject: [PATCH 4/4] Fix test.
---
.../parallel/omp_parallel_num_threads_strict.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/openmp/runtime/test/parallel/omp_parallel_num_threads_strict.c b/openmp/runtime/test/parallel/omp_parallel_num_threads_strict.c
index 358ac915c1713f..6fccfa7f4e7646 100644
--- a/openmp/runtime/test/parallel/omp_parallel_num_threads_strict.c
+++ b/openmp/runtime/test/parallel/omp_parallel_num_threads_strict.c
@@ -23,21 +23,21 @@ int test_omp_parallel_num_threads_strict()
// Test regular runtime warning about exceeding thread limit.
// Tolerate whatever value was given.
-#pragma omp parallel num_threads(22)
+#pragma omp parallel reduction(+:num_failed) num_threads(22)
#pragma omp single
num_failed = num_failed + !(omp_get_num_threads() <= 22);
// Test with 4 threads and strict -- no problem, no warning.
__kmpc_push_num_threads_strict(NULL, __kmpc_global_thread_num(NULL), 4,
1, "This warning shouldn't happen.");
-#pragma omp parallel //num_threads(strict:4)
+#pragma omp parallel reduction(+:num_failed) //num_threads(strict:4)
#pragma omp single
num_failed = num_failed + !(omp_get_num_threads() == 4);
// Exceed limit, specify user warning message. Tolerate whatever was given.
__kmpc_push_num_threads_strict(NULL, __kmpc_global_thread_num(NULL), 20,
1, "User-supplied warning for strict.");
-#pragma omp parallel //num_threads(strict:20) severity(warning) \
+#pragma omp parallel reduction(+:num_failed) //num_threads(strict:20) severity(warning) \
message("User-supplied warning for strict.")
#pragma omp single
num_failed = num_failed + !(omp_get_num_threads() <= 20);
@@ -46,7 +46,7 @@ int test_omp_parallel_num_threads_strict()
// Tolerate whatever value was given.
__kmpc_push_num_threads_strict(NULL, __kmpc_global_thread_num(NULL), 21,
1, NULL);
-#pragma omp parallel //num_threads(strict:21)
+#pragma omp parallel reduction(+:num_failed) //num_threads(strict:21)
#pragma omp single
num_failed = num_failed + !(omp_get_num_threads() <= 21);
@@ -56,12 +56,12 @@ int test_omp_parallel_num_threads_strict()
__kmpc_push_num_threads_list_strict(NULL, __kmpc_global_thread_num(NULL), 2,
threads3, 1,
"User-supplied warning on strict list.");
-#pragma omp parallel //num_threads(strict:2,24) severity(warning) \
+#pragma omp parallel reduction(+:num_failed) //num_threads(strict:2,24) severity(warning) \
message("User-supplied warning on strict. list") // 1st level
{
#pragma omp single
num_failed = num_failed + !(omp_get_num_threads() == 2);
-#pragma omp parallel // 2nd level
+#pragma omp parallel reduction(+:num_failed) // 2nd level
{
#pragma omp single
num_failed = num_failed + !(omp_get_num_threads() <= 24);
@@ -71,11 +71,11 @@ int test_omp_parallel_num_threads_strict()
// No strict limit in nested level. Regular runtime limiting applies.
__kmpc_push_num_threads_list(NULL, __kmpc_global_thread_num(NULL), 2,
threads3);
-#pragma omp parallel //num_threads(2,24) // 1st level
+#pragma omp parallel reduction(+:num_failed) //num_threads(2,24) // 1st level
{
#pragma omp single
num_failed = num_failed + !(omp_get_num_threads() == 2);
-#pragma omp parallel // 2nd level
+#pragma omp parallel reduction(+:num_failed) // 2nd level
{
#pragma omp single
num_failed = num_failed + !(omp_get_num_threads() <= 24);
More information about the Openmp-commits
mailing list