[Openmp-commits] [openmp] [OpenMP] Fix task state and taskteams for serial teams (PR #86859)

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Tue Apr 2 09:54:54 PDT 2024


https://github.com/jpeyton52 updated https://github.com/llvm/llvm-project/pull/86859

>From 3b956bd31ada92819c0f38a08567b5d92d22c063 Mon Sep 17 00:00:00 2001
From: Jonathan Peyton <jonathan.l.peyton at intel.com>
Date: Mon, 18 Mar 2024 16:22:14 -0500
Subject: [PATCH 1/6] [OpenMP] Fix task state manipulation

* Serial teams now use a stack (similar to dispatch buffers)
* Serial teams always use t_task_team[0] as the task team
  and the second pointer (t_task_team[1]) as the next pointer for the stack

For serial teams, t_task_team[2] is interpreted as a stack of task teams
where each entry corresponds to one nested serialized level

 inner serial team                   outer serial team
[ t_task_team[0] ] -> (task_team)    [ t_task_team[0] ] -> (task_team)
[ next           ] ----------------> [ next           ] -> ...

* Remove the task state memo stack from the thread structure.
* Instead of a thread-private stack, use the team structure
  (new field t_primary_task_state) to store th_task_state of the
  primary thread. When coming out of a parallel region, restore the
  primary thread's task state from that field.
  The new field in the team structure doesn't
  cause sizeof(team) to change and is in the cache line which is
  only read/written by the primary thread.

Fixes: #50602
Fixes: #69368
Fixes: #69733
Fixes: #79416
---
 openmp/runtime/src/kmp.h                      |  29 +-
 openmp/runtime/src/kmp_barrier.cpp            |  15 +-
 openmp/runtime/src/kmp_csupport.cpp           |  11 +
 openmp/runtime/src/kmp_runtime.cpp            | 179 ++++-------
 openmp/runtime/src/kmp_tasking.cpp            |  98 +++---
 openmp/runtime/test/tasking/issue-50602.c     |  28 ++
 openmp/runtime/test/tasking/issue-69368.c     |  27 ++
 openmp/runtime/test/tasking/issue-69733.c     |  33 ++
 openmp/runtime/test/tasking/issue-79416.c     |  17 +
 .../test/tasking/task_teams_stress_test.cpp   | 304 ++++++++++++++++++
 10 files changed, 555 insertions(+), 186 deletions(-)
 create mode 100644 openmp/runtime/test/tasking/issue-50602.c
 create mode 100644 openmp/runtime/test/tasking/issue-69368.c
 create mode 100644 openmp/runtime/test/tasking/issue-69733.c
 create mode 100644 openmp/runtime/test/tasking/issue-79416.c
 create mode 100644 openmp/runtime/test/tasking/task_teams_stress_test.cpp

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 18ccf10fe17d0f..7e3f0e8ef32aa2 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2871,6 +2871,11 @@ union KMP_ALIGN_CACHE kmp_task_team {
   char tt_pad[KMP_PAD(kmp_base_task_team_t, CACHE_LINE)];
 };
 
+typedef struct kmp_task_team_list_t {
+  kmp_task_team_t *task_team;
+  kmp_task_team_list_t *next;
+} kmp_task_team_list_t;
+
 #if (USE_FAST_MEMORY == 3) || (USE_FAST_MEMORY == 5)
 // Free lists keep same-size free memory slots for fast memory allocation
 // routines
@@ -3008,10 +3013,6 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   kmp_task_team_t *th_task_team; // Task team struct
   kmp_taskdata_t *th_current_task; // Innermost Task being executed
   kmp_uint8 th_task_state; // alternating 0/1 for task team identification
-  kmp_uint8 *th_task_state_memo_stack; // Stack holding memos of th_task_state
-  // at nested levels
-  kmp_uint32 th_task_state_top; // Top element of th_task_state_memo_stack
-  kmp_uint32 th_task_state_stack_sz; // Size of th_task_state_memo_stack
   kmp_uint32 th_reap_state; // Non-zero indicates thread is not
   // tasking, thus safe to reap
 
@@ -3133,6 +3134,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   kmp_disp_t *t_dispatch; // thread's dispatch data
   kmp_task_team_t *t_task_team[2]; // Task team struct; switch between 2
   kmp_proc_bind_t t_proc_bind; // bind type for par region
+  int t_primary_task_state; // primary thread's task state saved
 #if USE_ITT_BUILD
   kmp_uint64 t_region_time; // region begin timestamp
 #endif /* USE_ITT_BUILD */
@@ -3204,6 +3206,12 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
   distributedBarrier *b; // Distributed barrier data associated with team
 } kmp_base_team_t;
 
+// Assert that the list structure fits and aligns within
+// the double task team pointer
+KMP_BUILD_ASSERT(sizeof(kmp_task_team_t * [2]) == sizeof(kmp_task_team_list_t));
+KMP_BUILD_ASSERT(alignof(kmp_task_team_t * [2]) ==
+                 alignof(kmp_task_team_list_t));
+
 union KMP_ALIGN_CACHE kmp_team {
   kmp_base_team_t t;
   double t_align; /* use worst case alignment */
@@ -4114,9 +4122,10 @@ extern void __kmp_fulfill_event(kmp_event_t *event);
 extern void __kmp_free_task_team(kmp_info_t *thread,
                                  kmp_task_team_t *task_team);
 extern void __kmp_reap_task_teams(void);
+extern void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team);
+extern void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team);
 extern void __kmp_wait_to_unref_task_teams(void);
-extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team,
-                                  int always);
+extern void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team);
 extern void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team);
 extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
 #if USE_ITT_BUILD
@@ -4127,6 +4136,14 @@ extern void __kmp_task_team_wait(kmp_info_t *this_thr, kmp_team_t *team
                                  int wait = 1);
 extern void __kmp_tasking_barrier(kmp_team_t *team, kmp_info_t *thread,
                                   int gtid);
+#if KMP_DEBUG
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr)                         \
+  KMP_DEBUG_ASSERT(                                                            \
+      __kmp_tasking_mode != tskm_task_teams || team->t.t_nproc == 1 ||         \
+      thr->th.th_task_team == team->t.t_task_team[thr->th.th_task_state])
+#else
+#define KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, thr) /* Nothing */
+#endif
 
 extern int __kmp_is_address_mapped(void *addr);
 extern kmp_uint64 __kmp_hardware_timestamp(void);
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index e9ab15f1723b66..b381694c0953e2 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -1858,8 +1858,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
     }
 
     if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
-      // use 0 to only setup the current team if nthreads > 1
-      __kmp_task_team_setup(this_thr, team, 0);
+      __kmp_task_team_setup(this_thr, team);
 
     if (cancellable) {
       cancelled = __kmp_linear_barrier_gather_cancellable(
@@ -2042,7 +2041,7 @@ static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
             this_thr->th.th_task_team->tt.tt_hidden_helper_task_encountered ==
                 TRUE);
         __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
-        __kmp_task_team_setup(this_thr, team, 0);
+        __kmp_task_team_setup(this_thr, team);
 
 #if USE_ITT_BUILD
         if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
@@ -2243,9 +2242,7 @@ void __kmp_join_barrier(int gtid) {
                   __kmp_gtid_from_thread(this_thr), team_id,
                   team->t.t_task_team[this_thr->th.th_task_state],
                   this_thr->th.th_task_team));
-    if (this_thr->th.th_task_team)
-      KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
-                       team->t.t_task_team[this_thr->th.th_task_state]);
+    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, this_thr);
   }
 #endif /* KMP_DEBUG */
 
@@ -2440,10 +2437,8 @@ void __kmp_fork_barrier(int gtid, int tid) {
     }
 #endif
 
-    if (__kmp_tasking_mode != tskm_immediate_exec) {
-      // 0 indicates setup current task team if nthreads > 1
-      __kmp_task_team_setup(this_thr, team, 0);
-    }
+    if (__kmp_tasking_mode != tskm_immediate_exec)
+      __kmp_task_team_setup(this_thr, team);
 
     /* The primary thread may have changed its blocktime between join barrier
        and fork barrier. Copy the blocktime info to the thread, where
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
index 0268f692ff7fdc..f45fe646d1d9aa 100644
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -654,6 +654,12 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
         serial_team->t.t_dispatch->th_disp_buffer->next;
     __kmp_free(disp_buffer);
   }
+
+  /* pop the task team stack */
+  if (serial_team->t.t_serialized > 1) {
+    __kmp_pop_task_team_node(this_thr, serial_team);
+  }
+
   this_thr->th.th_def_allocator = serial_team->t.t_def_allocator; // restore
 
   --serial_team->t.t_serialized;
@@ -692,6 +698,11 @@ void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     this_thr->th.th_current_task->td_flags.executing = 1;
 
     if (__kmp_tasking_mode != tskm_immediate_exec) {
+      // Restore task state from serial team structure
+      KMP_DEBUG_ASSERT(serial_team->t.t_primary_task_state == 0 ||
+                       serial_team->t.t_primary_task_state == 1);
+      this_thr->th.th_task_state =
+          (kmp_uint8)serial_team->t.t_primary_task_state;
       // Copy the task team from the new child / old parent team to the thread.
       this_thr->th.th_task_team =
           this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp
index a60bdb968371e0..e426d235b49c91 100644
--- a/openmp/runtime/src/kmp_runtime.cpp
+++ b/openmp/runtime/src/kmp_runtime.cpp
@@ -1042,6 +1042,41 @@ static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
     }
   }
 
+  // Take care of primary thread's task state
+  if (__kmp_tasking_mode != tskm_immediate_exec) {
+    if (use_hot_team) {
+      KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team->t.t_parent, master_th);
+      KA_TRACE(
+          20,
+          ("__kmp_fork_team_threads: Primary T#%d pushing task_team %p / team "
+           "%p, new task_team %p / team %p\n",
+           __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
+           team->t.t_parent, team->t.t_task_team[master_th->th.th_task_state],
+           team));
+
+      // Store primary thread's current task state on new team
+      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+                       master_th->th.th_task_state);
+
+      // Restore primary thread's task state to hot team's state
+      // by using thread 1's task state
+      if (team->t.t_nproc > 1) {
+        KMP_DEBUG_ASSERT(team->t.t_threads[1]->th.th_task_state == 0 ||
+                         team->t.t_threads[1]->th.th_task_state == 1);
+        KMP_CHECK_UPDATE(master_th->th.th_task_state,
+                         team->t.t_threads[1]->th.th_task_state);
+      } else {
+        master_th->th.th_task_state = 0;
+      }
+    } else {
+      // Store primary thread's current task_state on new team
+      KMP_CHECK_UPDATE(team->t.t_primary_task_state,
+                       master_th->th.th_task_state);
+      // Are not using hot team, so set task state to 0.
+      master_th->th.th_task_state = 0;
+    }
+  }
+
   if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
     for (i = 0; i < team->t.t_nproc; i++) {
       kmp_info_t *thr = team->t.t_threads[i];
@@ -1145,18 +1180,6 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
   KMP_DEBUG_ASSERT(serial_team);
   KMP_MB();
 
-  if (__kmp_tasking_mode != tskm_immediate_exec) {
-    KMP_DEBUG_ASSERT(
-        this_thr->th.th_task_team ==
-        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
-    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
-                     NULL);
-    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
-                  "team %p, new task_team = NULL\n",
-                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
-    this_thr->th.th_task_team = NULL;
-  }
-
   kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
   if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
     proc_bind = proc_bind_false;
@@ -1242,6 +1265,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     serial_team->t.t_serialized = 1;
     serial_team->t.t_nproc = 1;
     serial_team->t.t_parent = this_thr->th.th_team;
+    // Save previous team's task state on serial team structure
+    serial_team->t.t_primary_task_state = this_thr->th.th_task_state;
     serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
     this_thr->th.th_team = serial_team;
     serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
@@ -1281,6 +1306,8 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     this_thr->th.th_team_nproc = 1;
     this_thr->th.th_team_master = this_thr;
     this_thr->th.th_team_serialized = 1;
+    this_thr->th.th_task_team = NULL;
+    this_thr->th.th_task_state = 0;
 
     serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
     serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
@@ -1332,6 +1359,9 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
     }
     this_thr->th.th_dispatch = serial_team->t.t_dispatch;
 
+    /* allocate/push task team stack */
+    __kmp_push_task_team_node(this_thr, serial_team);
+
     KMP_MB();
   }
   KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
@@ -1985,17 +2015,12 @@ int __kmp_fork_call(ident_t *loc, int gtid,
                                  ap);
     } // End parallel closely nested in teams construct
 
-#if KMP_DEBUG
-    if (__kmp_tasking_mode != tskm_immediate_exec) {
-      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
-                       parent_team->t.t_task_team[master_th->th.th_task_state]);
-    }
-#endif
-
     // Need this to happen before we determine the number of threads, not while
     // we are allocating the team
     //__kmp_push_current_task_to_thread(master_th, parent_team, 0);
 
+    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(parent_team, master_th);
+
     // Determine the number of threads
     int enter_teams =
         __kmp_is_entering_teams(active_level, level, teams_level, ap);
@@ -2186,64 +2211,6 @@ int __kmp_fork_call(ident_t *loc, int gtid,
       ompd_bp_parallel_begin();
 #endif
 
-    if (__kmp_tasking_mode != tskm_immediate_exec) {
-      // Set primary thread's task team to team's task team. Unless this is hot
-      // team, it should be NULL.
-      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
-                       parent_team->t.t_task_team[master_th->th.th_task_state]);
-      KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team "
-                    "%p, new task_team %p / team %p\n",
-                    __kmp_gtid_from_thread(master_th),
-                    master_th->th.th_task_team, parent_team,
-                    team->t.t_task_team[master_th->th.th_task_state], team));
-
-      if (active_level || master_th->th.th_task_team) {
-        // Take a memo of primary thread's task_state
-        KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
-        if (master_th->th.th_task_state_top >=
-            master_th->th.th_task_state_stack_sz) { // increase size
-          kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
-          kmp_uint8 *old_stack, *new_stack;
-          kmp_uint32 i;
-          new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
-          for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
-            new_stack[i] = master_th->th.th_task_state_memo_stack[i];
-          }
-          for (i = master_th->th.th_task_state_stack_sz; i < new_size;
-               ++i) { // zero-init rest of stack
-            new_stack[i] = 0;
-          }
-          old_stack = master_th->th.th_task_state_memo_stack;
-          master_th->th.th_task_state_memo_stack = new_stack;
-          master_th->th.th_task_state_stack_sz = new_size;
-          __kmp_free(old_stack);
-        }
-        // Store primary thread's task_state on stack
-        master_th->th
-            .th_task_state_memo_stack[master_th->th.th_task_state_top] =
-            master_th->th.th_task_state;
-        master_th->th.th_task_state_top++;
-#if KMP_NESTED_HOT_TEAMS
-        if (master_th->th.th_hot_teams &&
-            active_level < __kmp_hot_teams_max_level &&
-            team == master_th->th.th_hot_teams[active_level].hot_team) {
-          // Restore primary thread's nested state if nested hot team
-          master_th->th.th_task_state =
-              master_th->th
-                  .th_task_state_memo_stack[master_th->th.th_task_state_top];
-        } else {
-#endif
-          master_th->th.th_task_state = 0;
-#if KMP_NESTED_HOT_TEAMS
-        }
-#endif
-      }
-#if !KMP_NESTED_HOT_TEAMS
-      KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
-                       (team == root->r.r_hot_team));
-#endif
-    }
-
     KA_TRACE(
         20,
         ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
@@ -2451,8 +2418,7 @@ void __kmp_join_call(ident_t *loc, int gtid
                   __kmp_gtid_from_thread(master_th), team,
                   team->t.t_task_team[master_th->th.th_task_state],
                   master_th->th.th_task_team));
-    KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
-                     team->t.t_task_team[master_th->th.th_task_state]);
+    KMP_DEBUG_ASSERT_TASKTEAM_INVARIANT(team, master_th);
   }
 #endif
 
@@ -2690,24 +2656,11 @@ void __kmp_join_call(ident_t *loc, int gtid
   }
 
   if (__kmp_tasking_mode != tskm_immediate_exec) {
-    if (master_th->th.th_task_state_top >
-        0) { // Restore task state from memo stack
-      KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
-      // Remember primary thread's state if we re-use this nested hot team
-      master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
-          master_th->th.th_task_state;
-      --master_th->th.th_task_state_top; // pop
-      // Now restore state at this level
-      master_th->th.th_task_state =
-          master_th->th
-              .th_task_state_memo_stack[master_th->th.th_task_state_top];
-    } else if (team != root->r.r_hot_team) {
-      // Reset the task state of primary thread if we are not hot team because
-      // in this case all the worker threads will be free, and their task state
-      // will be reset. If not reset the primary's, the task state will be
-      // inconsistent.
-      master_th->th.th_task_state = 0;
-    }
+    // Restore primary thread's task state from team structure
+    KMP_DEBUG_ASSERT(team->t.t_primary_task_state == 0 ||
+                     team->t.t_primary_task_state == 1);
+    master_th->th.th_task_state = (kmp_uint8)team->t.t_primary_task_state;
+
     // Copy the task team from the parent team to the primary thread
     master_th->th.th_task_team =
         parent_team->t.t_task_team[master_th->th.th_task_state];
@@ -4396,17 +4349,6 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
 
   this_thr->th.th_next_pool = NULL;
 
-  if (!this_thr->th.th_task_state_memo_stack) {
-    size_t i;
-    this_thr->th.th_task_state_memo_stack =
-        (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
-    this_thr->th.th_task_state_top = 0;
-    this_thr->th.th_task_state_stack_sz = 4;
-    for (i = 0; i < this_thr->th.th_task_state_stack_sz;
-         ++i) // zero init the stack
-      this_thr->th.th_task_state_memo_stack[i] = 0;
-  }
-
   KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
   KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
 
@@ -4461,8 +4403,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
     TCW_4(__kmp_nth, __kmp_nth + 1);
 
     new_thr->th.th_task_state = 0;
-    new_thr->th.th_task_state_top = 0;
-    new_thr->th.th_task_state_stack_sz = 4;
 
     if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) {
       // Make sure pool thread has transitioned to waiting on own thread struct
@@ -5260,6 +5200,15 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
         // Activate team threads via th_used_in_team
         __kmp_add_threads_to_team(team, new_nproc);
       }
+      // When decreasing team size, threads no longer in the team should
+      // unref task team.
+      if (__kmp_tasking_mode != tskm_immediate_exec) {
+        for (f = new_nproc; f < team->t.t_nproc; f++) {
+          kmp_info_t *th = team->t.t_threads[f];
+          KMP_DEBUG_ASSERT(th);
+          th->th.th_task_team = NULL;
+        }
+      }
 #if KMP_NESTED_HOT_TEAMS
       if (__kmp_hot_teams_mode == 0) {
         // AC: saved number of threads should correspond to team's value in this
@@ -5270,11 +5219,6 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
         /* release the extra threads we don't need any more */
         for (f = new_nproc; f < team->t.t_nproc; f++) {
           KMP_DEBUG_ASSERT(team->t.t_threads[f]);
-          if (__kmp_tasking_mode != tskm_immediate_exec) {
-            // When decreasing team size, threads no longer in the team should
-            // unref task team.
-            team->t.t_threads[f]->th.th_task_team = NULL;
-          }
           __kmp_free_thread(team->t.t_threads[f]);
           team->t.t_threads[f] = NULL;
         }
@@ -6246,11 +6190,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
     thread->th.th_pri_common = NULL;
   }
 
-  if (thread->th.th_task_state_memo_stack != NULL) {
-    __kmp_free(thread->th.th_task_state_memo_stack);
-    thread->th.th_task_state_memo_stack = NULL;
-  }
-
 #if KMP_USE_BGET
   if (thread->th.th_local.bget_data != NULL) {
     __kmp_finalize_bget(thread);
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 155e17ba7ec874..4ab6b76306907a 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -1511,8 +1511,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
       KA_TRACE(30,
                ("T#%d creating task team in __kmp_task_alloc for proxy task\n",
                 gtid));
-      // 1 indicates setup the current team regardless of nthreads
-      __kmp_task_team_setup(thread, team, 1);
+      __kmp_task_team_setup(thread, team);
       thread->th.th_task_team = team->t.t_task_team[thread->th.th_task_state];
     }
     kmp_task_team_t *task_team = thread->th.th_task_team;
@@ -4050,6 +4049,40 @@ void __kmp_reap_task_teams(void) {
   }
 }
 
+// View the array of two task team pointers as a pair of pointers:
+//  1) a single task_team pointer
+//  2) next pointer for stack
+// Serial teams can create a stack of task teams for nested serial teams.
+void __kmp_push_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+  kmp_task_team_list_t *current =
+      (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+  kmp_task_team_list_t *node =
+      (kmp_task_team_list_t *)__kmp_allocate(sizeof(kmp_task_team_list_t));
+  node->task_team = current->task_team;
+  node->next = current->next;
+  thread->th.th_task_team = current->task_team = NULL;
+  current->next = node;
+}
+
+// Serial team pops a task team off the stack
+void __kmp_pop_task_team_node(kmp_info_t *thread, kmp_team_t *team) {
+  KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
+  kmp_task_team_list_t *current =
+      (kmp_task_team_list_t *)(&team->t.t_task_team[0]);
+  if (current->task_team) {
+    __kmp_free_task_team(thread, current->task_team);
+  }
+  kmp_task_team_list_t *next = current->next;
+  if (next) {
+    current->task_team = next->task_team;
+    current->next = next->next;
+    KMP_DEBUG_ASSERT(next != current);
+    __kmp_free(next);
+    thread->th.th_task_team = current->task_team;
+  }
+}
+
 // __kmp_wait_to_unref_task_teams:
 // Some threads could still be in the fork barrier release code, possibly
 // trying to steal tasks.  Wait for each thread to unreference its task team.
@@ -4114,55 +4147,28 @@ void __kmp_wait_to_unref_task_teams(void) {
   }
 }
 
-void __kmp_shift_task_state_stack(kmp_info_t *this_thr, kmp_uint8 value) {
-  // Shift values from th_task_state_top+1 to task_state_stack_sz
-  if (this_thr->th.th_task_state_top + 1 >=
-      this_thr->th.th_task_state_stack_sz) { // increase size
-    kmp_uint32 new_size = 2 * this_thr->th.th_task_state_stack_sz;
-    kmp_uint8 *old_stack, *new_stack;
-    kmp_uint32 i;
-    new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
-    for (i = 0; i <= this_thr->th.th_task_state_top; ++i) {
-      new_stack[i] = this_thr->th.th_task_state_memo_stack[i];
-    }
-    // If we need to reallocate do the shift at the same time.
-    for (; i < this_thr->th.th_task_state_stack_sz; ++i) {
-      new_stack[i + 1] = this_thr->th.th_task_state_memo_stack[i];
-    }
-    for (i = this_thr->th.th_task_state_stack_sz; i < new_size;
-         ++i) { // zero-init rest of stack
-      new_stack[i] = 0;
-    }
-    old_stack = this_thr->th.th_task_state_memo_stack;
-    this_thr->th.th_task_state_memo_stack = new_stack;
-    this_thr->th.th_task_state_stack_sz = new_size;
-    __kmp_free(old_stack);
-  } else {
-    kmp_uint8 *end;
-    kmp_uint32 i;
-
-    end = &this_thr->th
-               .th_task_state_memo_stack[this_thr->th.th_task_state_stack_sz];
-
-    for (i = this_thr->th.th_task_state_stack_sz - 1;
-         i > this_thr->th.th_task_state_top; i--, end--)
-      end[0] = end[-1];
-  }
-  this_thr->th.th_task_state_memo_stack[this_thr->th.th_task_state_top + 1] =
-      value;
-}
-
 // __kmp_task_team_setup:  Create a task_team for the current team, but use
 // an already created, unused one if it already exists.
-void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
+void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
 
+  // For serial teams, setup the first task team pointer to point to task team.
+  // The other pointer is a stack of task teams from previous serial levels.
+  if (team->t.t_task_team[0] == NULL && team->t.t_nproc == 1) {
+    team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
+    KA_TRACE(20,
+             ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
+              " for serial/root team %p\n",
+              __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
+
+    return;
+  }
   // If this task_team hasn't been created yet, allocate it. It will be used in
   // the region after the next.
   // If it exists, it is the current task team and shouldn't be touched yet as
   // it may still be in use.
   if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
-      (always || team->t.t_nproc > 1)) {
+      team->t.t_nproc > 1) {
     team->t.t_task_team[this_thr->th.th_task_state] =
         __kmp_allocate_task_team(this_thr, team);
     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
@@ -4171,14 +4177,6 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team, int always) {
                   team->t.t_task_team[this_thr->th.th_task_state], team->t.t_id,
                   this_thr->th.th_task_state));
   }
-  if (this_thr->th.th_task_state == 1 && always && team->t.t_nproc == 1) {
-    // fix task state stack to adjust for proxy and helper tasks
-    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d needs to shift stack"
-                  " for team %d at parity=%d\n",
-                  __kmp_gtid_from_thread(this_thr), team->t.t_id,
-                  this_thr->th.th_task_state));
-    __kmp_shift_task_state_stack(this_thr, this_thr->th.th_task_state);
-  }
 
   // After threads exit the release, they will call sync, and then point to this
   // other task_team; make sure it is allocated and properly initialized. As
diff --git a/openmp/runtime/test/tasking/issue-50602.c b/openmp/runtime/test/tasking/issue-50602.c
new file mode 100644
index 00000000000000..f97d754c9bc305
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-50602.c
@@ -0,0 +1,28 @@
+// RUN: %libomp-compile-and-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+#include <omp.h>
+
+int main(int argc, char *argv[]) {
+  int i;
+
+  omp_set_max_active_levels(1);
+  omp_set_dynamic(0);
+
+  for (i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+      omp_event_handle_t event;
+      int a = 0;
+
+#pragma omp task shared(a) detach(event)
+      { a = 1; }
+
+#pragma omp parallel
+      { a = 2; }
+
+      omp_fulfill_event(event);
+#pragma omp taskwait
+    }
+  }
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-69368.c b/openmp/runtime/test/tasking/issue-69368.c
new file mode 100644
index 00000000000000..57bd7412a51e92
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-69368.c
@@ -0,0 +1,27 @@
+// RUN: %libomp-compile-and-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+
+int main() {
+  int i;
+  int a[2];
+  volatile int attempt = 0;
+
+  for (i = 0; i < 10; ++i) {
+    a[0] = a[1] = 0;
+#pragma omp parallel for
+    for (int i = 0; i < 2; i++) {
+      a[i] = 2;
+    }
+    if (a[0] != 2 || a[1] != 2)
+      return 1;
+
+#pragma omp teams distribute parallel for if (attempt >= 2)
+    for (int i = 0; i < 2; i++) {
+      a[i] = 1;
+    }
+    if (a[0] != 1 || a[1] != 1)
+      return 1;
+  }
+
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-69733.c b/openmp/runtime/test/tasking/issue-69733.c
new file mode 100644
index 00000000000000..55764d76500a79
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-69733.c
@@ -0,0 +1,33 @@
+// RUN: %libomp-compile-and-run
+#include <omp.h>
+
+void nested_parallel(int nth1, int nth2) {
+#pragma omp parallel num_threads(nth1)
+  {
+#pragma omp parallel num_threads(nth2)
+    {
+      omp_event_handle_t ev;
+#pragma omp task detach(ev)
+      {}
+      omp_fulfill_event(ev);
+    }
+  }
+}
+
+int main() {
+  int i;
+
+  omp_set_max_active_levels(2);
+  omp_set_dynamic(0);
+
+  for (i = 0; i < 10; ++i)
+    nested_parallel(1, 1);
+  for (i = 0; i < 10; ++i)
+    nested_parallel(1, 2);
+  for (i = 0; i < 10; ++i)
+    nested_parallel(2, 1);
+  for (i = 0; i < 10; ++i)
+    nested_parallel(2, 2);
+
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/issue-79416.c b/openmp/runtime/test/tasking/issue-79416.c
new file mode 100644
index 00000000000000..6ca944a55225f5
--- /dev/null
+++ b/openmp/runtime/test/tasking/issue-79416.c
@@ -0,0 +1,17 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+
+void run(int teams, int th) {
+#pragma omp teams num_teams(teams)
+#pragma omp parallel num_threads(th)
+#pragma omp task
+  {}
+}
+
+int main() {
+  fprintf(stderr, "run(1, 2)\n");
+  run(1, 2);
+  fprintf(stderr, "run(1, 3)\n");
+  run(1, 3);
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/task_teams_stress_test.cpp b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
new file mode 100644
index 00000000000000..6ca08555922673
--- /dev/null
+++ b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
@@ -0,0 +1,304 @@
+// RUN: %libomp-cxx-compile
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
+
+// This test stresses the task team mechanism by running a simple
+// increment task over and over with varying number of threads and nesting.
+// The test covers nested serial teams and mixing serial teams with
+// normal active teams.
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+
+// The number of times to run each test
+#define NTIMES 5
+
+// Regular single increment task
+void task_inc_a(int* a) {
+  #pragma omp task
+  {
+    #pragma omp atomic
+    (*a)++;
+  }
+}
+
+// Splitting increment task that binary splits the incrementing task
+void task_inc_split_a(int *a, int low, int high) {
+  #pragma omp task firstprivate(low, high)
+  {
+    if (low == high) {
+      #pragma omp atomic
+      (*a)++;
+    } else if (low < high) {
+      int mid = (high - low) / 2 + low;
+      task_inc_split_a(a, low, mid);
+      task_inc_split_a(a, mid+1, high);
+    }
+  }
+}
+
+// Detached tasks force serial regions to create task teams
+void task_inc_a_detached(int *a, omp_event_handle_t handle) {
+  #pragma omp task detach(handle)
+  {
+    #pragma omp atomic
+    (*a)++;
+    omp_fulfill_event(handle);
+  }
+}
+
+void check_a(int *a, int expected) {
+  if (*a != expected) {
+    fprintf(stderr, "FAIL: a = %d instead of expected = %d. Compile with "
+                    "-DVERBOSE for more verbose output.\n",
+            *a, expected);
+    exit(EXIT_FAILURE);
+  }
+}
+
+// Every thread creates a single "increment" task
+void test_tasks(omp_event_handle_t *handles, int expected, int *a) {
+  int tid = omp_get_thread_num();
+
+  task_inc_a(a);
+
+  #pragma omp barrier
+  check_a(a, expected);
+  #pragma omp barrier
+  check_a(a, expected);
+  #pragma omp barrier
+
+  task_inc_a_detached(a, handles[tid]);
+
+  #pragma omp barrier
+  check_a(a, 2 * expected);
+  #pragma omp barrier
+  task_inc_a(a);
+  #pragma omp barrier
+  check_a(a, 3 * expected);
+}
+
+// Testing single level of parallelism with increment tasks
+void test_base(int nthreads) {
+#ifdef VERBOSE
+  #pragma omp master
+  printf("    test_base(%d)\n", nthreads);
+#endif
+  int a = 0;
+  omp_event_handle_t *handles;
+  handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
+  #pragma omp parallel num_threads(nthreads) shared(a)
+  {
+    test_tasks(handles, nthreads, &a);
+  }
+  free(handles);
+}
+
+// Testing nested parallel with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+void test_nest(int first, int second) {
+#ifdef VERBOSE
+  #pragma omp master
+  printf("   test_nest(%d, %d)\n", first, second);
+#endif
+  #pragma omp parallel num_threads(first)
+  {
+    test_base(second);
+  }
+}
+
+// Testing 2-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+void test_nest2(int first, int second, int third) {
+#ifdef VERBOSE
+  #pragma omp master
+  printf("  test_nest2(%d, %d, %d)\n", first, second, third);
+#endif
+  #pragma omp parallel num_threads(first)
+  {
+    test_nest(second, third);
+  }
+}
+
+// Testing 3-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+// fourth = nthreads of third nested parallel
+void test_nest3(int first, int second, int third, int fourth) {
+#ifdef VERBOSE
+  #pragma omp master
+  printf(" test_nest3(%d, %d, %d, %d)\n", first, second, third, fourth);
+#endif
+  #pragma omp parallel num_threads(first)
+  {
+    test_nest2(second, third, fourth);
+  }
+}
+
+// Testing 4-level nested parallels with increment tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+// fourth = nthreads of third nested parallel
+// fifth = nthreads of fourth nested parallel
+void test_nest4(int first, int second, int third, int fourth, int fifth) {
+#ifdef VERBOSE
+  #pragma omp master
+  printf("test_nest4(%d, %d, %d, %d, %d)\n", first, second, third, fourth, fifth);
+#endif
+  #pragma omp parallel num_threads(first)
+  {
+    test_nest3(second, third, fourth, fifth);
+  }
+}
+
+// Single thread starts a binary splitting "increment" task
+// Detached tasks are still single "increment" tasks
+void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) {
+  int tid = omp_get_thread_num();
+
+  #pragma omp single
+  task_inc_split_a(a, 1, expected); // task team A
+
+  #pragma omp barrier
+  check_a(a, expected);
+  #pragma omp barrier
+  check_a(a, expected);
+  #pragma omp barrier
+
+  task_inc_a_detached(a, handles[tid]); // task team B
+
+  #pragma omp barrier
+  check_a(a, 2 * expected);
+  #pragma omp barrier
+  #pragma omp single
+  task_inc_split_a(a, 1, expected); // task team B
+  #pragma omp barrier
+  check_a(a, 3 * expected);
+}
+
+// Testing single level of parallelism with splitting incrementing tasks
+void test_base_split(int nthreads) {
+#ifdef VERBOSE
+  #pragma omp master
+  printf("  test_base_split(%d)\n", nthreads);
+#endif
+  int a = 0;
+  omp_event_handle_t *handles;
+  handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
+  #pragma omp parallel num_threads(nthreads) shared(a)
+  {
+    test_tasks_split(handles, nthreads, &a);
+  }
+  free(handles);
+}
+
+// Testing nested parallels with splitting tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+void test_nest_split(int first, int second) {
+#ifdef VERBOSE
+  #pragma omp master
+  printf(" test_nest_split(%d, %d)\n", first, second);
+#endif
+  #pragma omp parallel num_threads(first)
+  {
+    test_base_split(second);
+  }
+}
+
+// Testing doubly nested parallels with splitting tasks
+// first = nthreads of outer parallel
+// second = nthreads of nested parallel
+// third = nthreads of second nested parallel
+void test_nest2_split(int first, int second, int third) {
+#ifdef VERBOSE
+  #pragma omp master
+  printf("test_nest2_split(%d, %d, %d)\n", first, second, third);
+#endif
+  #pragma omp parallel num_threads(first)
+  {
+    test_nest_split(second, third);
+  }
+}
+
+template <typename... Args>
+void run_ntimes(int n, void (*func)(Args...), Args... args) {
+  for (int i = 0; i < n; ++i) {
+    func(args...);
+  }
+}
+
+int main() {
+  omp_set_max_active_levels(5);
+
+  run_ntimes(NTIMES, test_base, 4);
+  run_ntimes(NTIMES, test_base, 1);
+  run_ntimes(NTIMES, test_base, 8);
+  run_ntimes(NTIMES, test_base, 2);
+  run_ntimes(NTIMES, test_base, 6);
+  run_ntimes(NTIMES, test_nest, 1, 1);
+  run_ntimes(NTIMES, test_nest, 1, 5);
+  run_ntimes(NTIMES, test_nest, 2, 6);
+  run_ntimes(NTIMES, test_nest, 1, 1);
+  run_ntimes(NTIMES, test_nest, 4, 3);
+  run_ntimes(NTIMES, test_nest, 3, 2);
+  run_ntimes(NTIMES, test_nest, 1, 1);
+  run_ntimes(NTIMES, test_nest2, 1, 1, 2);
+  run_ntimes(NTIMES, test_nest2, 1, 2, 1);
+  run_ntimes(NTIMES, test_nest2, 2, 2, 1);
+  run_ntimes(NTIMES, test_nest2, 2, 1, 1);
+  run_ntimes(NTIMES, test_nest2, 4, 2, 1);
+  run_ntimes(NTIMES, test_nest2, 4, 2, 2);
+  run_ntimes(NTIMES, test_nest2, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest2, 4, 2, 2);
+  run_ntimes(NTIMES, test_nest3, 1, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest3, 1, 2, 1, 1);
+  run_ntimes(NTIMES, test_nest3, 1, 1, 2, 1);
+  run_ntimes(NTIMES, test_nest3, 1, 1, 1, 2);
+  run_ntimes(NTIMES, test_nest3, 2, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 2, 1, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 2, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 2, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 2, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 2);
+  run_ntimes(NTIMES, test_nest4, 1, 1, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest4, 1, 2, 1, 2, 1);
+
+  run_ntimes(NTIMES, test_base_split, 4);
+  run_ntimes(NTIMES, test_base_split, 2);
+
+  run_ntimes(NTIMES, test_base_split, 7);
+
+  run_ntimes(NTIMES, test_base_split, 1);
+  run_ntimes(NTIMES, test_nest_split, 4, 2);
+  run_ntimes(NTIMES, test_nest_split, 2, 1);
+
+  run_ntimes(NTIMES, test_nest_split, 7, 2);
+  run_ntimes(NTIMES, test_nest_split, 1, 1);
+  run_ntimes(NTIMES, test_nest_split, 1, 4);
+
+  run_ntimes(NTIMES, test_nest2_split, 1, 1, 2);
+  run_ntimes(NTIMES, test_nest2_split, 1, 2, 1);
+  run_ntimes(NTIMES, test_nest2_split, 2, 2, 1);
+  run_ntimes(NTIMES, test_nest2_split, 2, 1, 1);
+  run_ntimes(NTIMES, test_nest2_split, 4, 2, 1);
+  run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
+  run_ntimes(NTIMES, test_nest2_split, 1, 1, 1);
+  run_ntimes(NTIMES, test_nest2_split, 4, 2, 2);
+
+  printf("PASS\n");
+  return EXIT_SUCCESS;
+}

>From e8b1c577cb4fa8248bec6d36678bd5ed5a45b5d8 Mon Sep 17 00:00:00 2001
From: Jonathan Peyton <jonathan.l.peyton at intel.com>
Date: Wed, 27 Mar 2024 14:33:14 -0500
Subject: [PATCH 2/6] fix clang-format issues

---
 openmp/runtime/src/kmp.h                      |   4 +-
 .../test/tasking/task_teams_stress_test.cpp   | 116 ++++++++----------
 2 files changed, 53 insertions(+), 67 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 7e3f0e8ef32aa2..64a3ea6d5be5db 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -3208,8 +3208,8 @@ typedef struct KMP_ALIGN_CACHE kmp_base_team {
 
 // Assert that the list structure fits and aligns within
 // the double task team pointer
-KMP_BUILD_ASSERT(sizeof(kmp_task_team_t * [2]) == sizeof(kmp_task_team_list_t));
-KMP_BUILD_ASSERT(alignof(kmp_task_team_t * [2]) ==
+KMP_BUILD_ASSERT(sizeof(kmp_task_team_t *[2]) == sizeof(kmp_task_team_list_t));
+KMP_BUILD_ASSERT(alignof(kmp_task_team_t *[2]) ==
                  alignof(kmp_task_team_list_t));
 
 union KMP_ALIGN_CACHE kmp_team {
diff --git a/openmp/runtime/test/tasking/task_teams_stress_test.cpp b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
index 6ca08555922673..ee7a3af11e8c2f 100644
--- a/openmp/runtime/test/tasking/task_teams_stress_test.cpp
+++ b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
@@ -21,34 +21,34 @@
 #define NTIMES 5
 
 // Regular single increment task
-void task_inc_a(int* a) {
-  #pragma omp task
+void task_inc_a(int *a) {
+#pragma omp task
   {
-    #pragma omp atomic
+#pragma omp atomic
     (*a)++;
   }
 }
 
 // Splitting increment task that binary splits the incrementing task
 void task_inc_split_a(int *a, int low, int high) {
-  #pragma omp task firstprivate(low, high)
+#pragma omp task firstprivate(low, high)
   {
     if (low == high) {
-      #pragma omp atomic
+#pragma omp atomic
       (*a)++;
     } else if (low < high) {
       int mid = (high - low) / 2 + low;
       task_inc_split_a(a, low, mid);
-      task_inc_split_a(a, mid+1, high);
+      task_inc_split_a(a, mid + 1, high);
     }
   }
 }
 
 // Detached tasks force serial regions to create task teams
 void task_inc_a_detached(int *a, omp_event_handle_t handle) {
-  #pragma omp task detach(handle)
+#pragma omp task detach(handle)
   {
-    #pragma omp atomic
+#pragma omp atomic
     (*a)++;
     omp_fulfill_event(handle);
   }
@@ -56,8 +56,9 @@ void task_inc_a_detached(int *a, omp_event_handle_t handle) {
 
 void check_a(int *a, int expected) {
   if (*a != expected) {
-    fprintf(stderr, "FAIL: a = %d instead of expected = %d. Compile with "
-                    "-DVERBOSE for more verbose output.\n",
+    fprintf(stderr,
+            "FAIL: a = %d instead of expected = %d. Compile with "
+            "-DVERBOSE for more verbose output.\n",
             *a, expected);
     exit(EXIT_FAILURE);
   }
@@ -69,35 +70,33 @@ void test_tasks(omp_event_handle_t *handles, int expected, int *a) {
 
   task_inc_a(a);
 
-  #pragma omp barrier
+#pragma omp barrier
   check_a(a, expected);
-  #pragma omp barrier
+#pragma omp barrier
   check_a(a, expected);
-  #pragma omp barrier
+#pragma omp barrier
 
   task_inc_a_detached(a, handles[tid]);
 
-  #pragma omp barrier
+#pragma omp barrier
   check_a(a, 2 * expected);
-  #pragma omp barrier
+#pragma omp barrier
   task_inc_a(a);
-  #pragma omp barrier
+#pragma omp barrier
   check_a(a, 3 * expected);
 }
 
 // Testing single level of parallelism with increment tasks
 void test_base(int nthreads) {
 #ifdef VERBOSE
-  #pragma omp master
+#pragma omp master
   printf("    test_base(%d)\n", nthreads);
 #endif
   int a = 0;
   omp_event_handle_t *handles;
   handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
-  #pragma omp parallel num_threads(nthreads) shared(a)
-  {
-    test_tasks(handles, nthreads, &a);
-  }
+#pragma omp parallel num_threads(nthreads) shared(a)
+  { test_tasks(handles, nthreads, &a); }
   free(handles);
 }
 
@@ -106,13 +105,11 @@ void test_base(int nthreads) {
 // second = nthreads of nested parallel
 void test_nest(int first, int second) {
 #ifdef VERBOSE
-  #pragma omp master
+#pragma omp master
   printf("   test_nest(%d, %d)\n", first, second);
 #endif
-  #pragma omp parallel num_threads(first)
-  {
-    test_base(second);
-  }
+#pragma omp parallel num_threads(first)
+  { test_base(second); }
 }
 
 // Testing 2-level nested parallels with increment tasks
@@ -121,13 +118,11 @@ void test_nest(int first, int second) {
 // third = nthreads of second nested parallel
 void test_nest2(int first, int second, int third) {
 #ifdef VERBOSE
-  #pragma omp master
+#pragma omp master
   printf("  test_nest2(%d, %d, %d)\n", first, second, third);
 #endif
-  #pragma omp parallel num_threads(first)
-  {
-    test_nest(second, third);
-  }
+#pragma omp parallel num_threads(first)
+  { test_nest(second, third); }
 }
 
 // Testing 3-level nested parallels with increment tasks
@@ -137,13 +132,11 @@ void test_nest2(int first, int second, int third) {
 // fourth = nthreads of third nested parallel
 void test_nest3(int first, int second, int third, int fourth) {
 #ifdef VERBOSE
-  #pragma omp master
+#pragma omp master
   printf(" test_nest3(%d, %d, %d, %d)\n", first, second, third, fourth);
 #endif
-  #pragma omp parallel num_threads(first)
-  {
-    test_nest2(second, third, fourth);
-  }
+#pragma omp parallel num_threads(first)
+  { test_nest2(second, third, fourth); }
 }
 
 // Testing 4-level nested parallels with increment tasks
@@ -154,13 +147,12 @@ void test_nest3(int first, int second, int third, int fourth) {
 // fifth = nthreads of fourth nested parallel
 void test_nest4(int first, int second, int third, int fourth, int fifth) {
 #ifdef VERBOSE
-  #pragma omp master
-  printf("test_nest4(%d, %d, %d, %d, %d)\n", first, second, third, fourth, fifth);
+#pragma omp master
+  printf("test_nest4(%d, %d, %d, %d, %d)\n", first, second, third, fourth,
+         fifth);
 #endif
-  #pragma omp parallel num_threads(first)
-  {
-    test_nest3(second, third, fourth, fifth);
-  }
+#pragma omp parallel num_threads(first)
+  { test_nest3(second, third, fourth, fifth); }
 }
 
 // Single thread starts a binary splitting "increment" task
@@ -168,39 +160,37 @@ void test_nest4(int first, int second, int third, int fourth, int fifth) {
 void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) {
   int tid = omp_get_thread_num();
 
-  #pragma omp single
+#pragma omp single
   task_inc_split_a(a, 1, expected); // task team A
 
-  #pragma omp barrier
+#pragma omp barrier
   check_a(a, expected);
-  #pragma omp barrier
+#pragma omp barrier
   check_a(a, expected);
-  #pragma omp barrier
+#pragma omp barrier
 
   task_inc_a_detached(a, handles[tid]); // task team B
 
-  #pragma omp barrier
+#pragma omp barrier
   check_a(a, 2 * expected);
-  #pragma omp barrier
-  #pragma omp single
+#pragma omp barrier
+#pragma omp single
   task_inc_split_a(a, 1, expected); // task team B
-  #pragma omp barrier
+#pragma omp barrier
   check_a(a, 3 * expected);
 }
 
 // Testing single level of parallelism with splitting incrementing tasks
 void test_base_split(int nthreads) {
 #ifdef VERBOSE
-  #pragma omp master
+#pragma omp master
   printf("  test_base_split(%d)\n", nthreads);
 #endif
   int a = 0;
   omp_event_handle_t *handles;
   handles = (omp_event_handle_t *)malloc(sizeof(omp_event_handle_t) * nthreads);
-  #pragma omp parallel num_threads(nthreads) shared(a)
-  {
-    test_tasks_split(handles, nthreads, &a);
-  }
+#pragma omp parallel num_threads(nthreads) shared(a)
+  { test_tasks_split(handles, nthreads, &a); }
   free(handles);
 }
 
@@ -209,13 +199,11 @@ void test_base_split(int nthreads) {
 // second = nthreads of nested parallel
 void test_nest_split(int first, int second) {
 #ifdef VERBOSE
-  #pragma omp master
+#pragma omp master
   printf(" test_nest_split(%d, %d)\n", first, second);
 #endif
-  #pragma omp parallel num_threads(first)
-  {
-    test_base_split(second);
-  }
+#pragma omp parallel num_threads(first)
+  { test_base_split(second); }
 }
 
 // Testing doubly nested parallels with splitting tasks
@@ -224,13 +212,11 @@ void test_nest_split(int first, int second) {
 // third = nthreads of second nested parallel
 void test_nest2_split(int first, int second, int third) {
 #ifdef VERBOSE
-  #pragma omp master
+#pragma omp master
   printf("test_nest2_split(%d, %d, %d)\n", first, second, third);
 #endif
-  #pragma omp parallel num_threads(first)
-  {
-    test_nest_split(second, third);
-  }
+#pragma omp parallel num_threads(first)
+  { test_nest_split(second, third); }
 }
 
 template <typename... Args>

>From c4fa08054856aede39bc458a1ffad4cde2ebf2cc Mon Sep 17 00:00:00 2001
From: Jonathan Peyton <jonathan.l.peyton at intel.com>
Date: Fri, 29 Mar 2024 16:21:18 -0500
Subject: [PATCH 3/6] [OpenMP] Centralize task team initialization

---
 openmp/runtime/src/kmp_tasking.cpp | 52 +++++++++++++++---------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 4ab6b76306907a..cb9beeae1b2ba4 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3939,6 +3939,20 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
   __kmp_release_bootstrap_lock(&task_team->tt.tt_task_pri_lock);
 }
 
+static inline void __kmp_task_team_init(kmp_task_team_t *task_team,
+                                        kmp_team_t *team) {
+  int team_nth = team->t.t_nproc;
+  // Only need to init if task team isn't active or team size changed
+  if (!task_team->tt.tt_active || team_nth != task_team->tt.tt_nproc) {
+    TCW_4(task_team->tt.tt_found_tasks, FALSE);
+    TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
+    TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
+    TCW_4(task_team->tt.tt_nproc, team_nth);
+    KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, team_nth);
+    TCW_4(task_team->tt.tt_active, TRUE);
+  }
+}
+
 // __kmp_allocate_task_team:
 // Allocates a task team associated with a specific team, taking it from
 // the global task team free list if possible.  Also initializes data
@@ -3946,7 +3960,6 @@ static void __kmp_free_task_pri_list(kmp_task_team_t *task_team) {
 static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
                                                  kmp_team_t *team) {
   kmp_task_team_t *task_team = NULL;
-  int nthreads;
 
   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d entering; team = %p\n",
                 (thread ? __kmp_gtid_from_thread(thread) : -1), team));
@@ -3988,14 +4001,7 @@ static kmp_task_team_t *__kmp_allocate_task_team(kmp_info_t *thread,
     // task_team->tt.tt_next = NULL;
   }
 
-  TCW_4(task_team->tt.tt_found_tasks, FALSE);
-  TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
-  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
-  task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
-
-  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
-  TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
-  TCW_4(task_team->tt.tt_active, TRUE);
+  __kmp_task_team_init(task_team, team);
 
   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
                 "unfinished_threads init'd to %d\n",
@@ -4154,14 +4160,17 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
 
   // For serial teams, setup the first task team pointer to point to task team.
   // The other pointer is a stack of task teams from previous serial levels.
-  if (team->t.t_task_team[0] == NULL && team->t.t_nproc == 1) {
-    team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
-    KA_TRACE(20,
-             ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
-              " for serial/root team %p\n",
-              __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
+  if (team->t.t_nproc == 1) {
+    if (team->t.t_task_team[0] == NULL) {
+      team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
+      KA_TRACE(20,
+               ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
+                " for serial/root team %p\n",
+                __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
 
-    return;
+      return;
+    } else
+      __kmp_task_team_init(team->t.t_task_team[0], team);
   }
   // If this task_team hasn't been created yet, allocate it. It will be used in
   // the region after the next.
@@ -4198,16 +4207,7 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
     } else { // Leave the old task team struct in place for the upcoming region;
       // adjust as needed
       kmp_task_team_t *task_team = team->t.t_task_team[other_team];
-      if (!task_team->tt.tt_active ||
-          team->t.t_nproc != task_team->tt.tt_nproc) {
-        TCW_4(task_team->tt.tt_nproc, team->t.t_nproc);
-        TCW_4(task_team->tt.tt_found_tasks, FALSE);
-        TCW_4(task_team->tt.tt_found_proxy_tasks, FALSE);
-        TCW_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
-        KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads,
-                          team->t.t_nproc);
-        TCW_4(task_team->tt.tt_active, TRUE);
-      }
+      __kmp_task_team_init(task_team, team);
       // if team size has changed, the first thread to enable tasking will
       // realloc threads_data if necessary
       KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "

>From c4ed5aa76027f9b790173916ddfe3e581c17d08a Mon Sep 17 00:00:00 2001
From: Jonathan Peyton <jonathan.l.peyton at intel.com>
Date: Thu, 28 Mar 2024 17:07:41 -0500
Subject: [PATCH 4/6] [OpenMP] add test from issue 81488

---
 openmp/runtime/test/target/issue-81488.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 openmp/runtime/test/target/issue-81488.c

diff --git a/openmp/runtime/test/target/issue-81488.c b/openmp/runtime/test/target/issue-81488.c
new file mode 100644
index 00000000000000..87ff0a829f9092
--- /dev/null
+++ b/openmp/runtime/test/target/issue-81488.c
@@ -0,0 +1,23 @@
+// RUN: %libomp-compile-and-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
+
+#include <omp.h>
+
+#define Nz 8
+#define DEVICE_ID 0
+
+int main(void) {
+  for (int n = 0; n < 10; ++n) {
+#pragma omp parallel
+    {
+#pragma omp single
+      {
+#pragma omp target teams distribute parallel for nowait device(DEVICE_ID)
+        for (int i = 0; i < Nz; ++i) {
+        }
+      }
+#pragma omp barrier
+    }
+  }
+  return 0;
+}

>From 27b2fa0cf63ac50c27616d8f3c7e7237d6be8efd Mon Sep 17 00:00:00 2001
From: Jonathan Peyton <jonathan.l.peyton at intel.com>
Date: Fri, 29 Mar 2024 16:24:59 -0500
Subject: [PATCH 5/6] [OpenMP] Update tests for hidden helpers

---
 openmp/runtime/test/target/issue-81488.c      | 23 +++++++++++----
 openmp/runtime/test/tasking/issue-50602.c     | 12 ++++++++
 openmp/runtime/test/tasking/issue-69733.c     | 29 +++++++++++++++----
 openmp/runtime/test/tasking/issue-79416.c     | 19 +++++++++---
 .../test/tasking/task_teams_stress_test.cpp   | 28 ++++++++++++++++--
 5 files changed, 94 insertions(+), 17 deletions(-)

diff --git a/openmp/runtime/test/target/issue-81488.c b/openmp/runtime/test/target/issue-81488.c
index 87ff0a829f9092..2f79599ba7b95d 100644
--- a/openmp/runtime/test/target/issue-81488.c
+++ b/openmp/runtime/test/target/issue-81488.c
@@ -1,22 +1,33 @@
-// RUN: %libomp-compile-and-run
-// RUN: env OMP_NUM_THREADS=1 %libomp-run
+// RUN: %libomp-compile
+// RUN: env OMP_NUM_THREADS=1 LIBOMP_USE_HIDDEN_HELPER_TASK=1 LIBOMP_NUM_HIDDEN_HELPER_THREADS=8 %libomp-run
 
+#include <stdio.h>
+#include <stdlib.h>
 #include <omp.h>
 
 #define Nz 8
 #define DEVICE_ID 0
 
+int a[Nz];
+
 int main(void) {
   for (int n = 0; n < 10; ++n) {
-#pragma omp parallel
+    for (int k = 0; k < Nz; ++k) {
+      a[k] = -1;
+    }
+    #pragma omp parallel shared(a)
     {
-#pragma omp single
+      #pragma omp single
       {
-#pragma omp target teams distribute parallel for nowait device(DEVICE_ID)
+        #pragma omp target teams distribute parallel for nowait device(DEVICE_ID) map(tofrom: a[0:8])
         for (int i = 0; i < Nz; ++i) {
+          a[i] = i;
         }
       }
-#pragma omp barrier
+      #pragma omp barrier
+    }
+    for (int k = 0; k < Nz; ++k) {
+      printf("a[%d] = %d\n", k, a[k]);
     }
   }
   return 0;
diff --git a/openmp/runtime/test/tasking/issue-50602.c b/openmp/runtime/test/tasking/issue-50602.c
index f97d754c9bc305..ceada58fca297c 100644
--- a/openmp/runtime/test/tasking/issue-50602.c
+++ b/openmp/runtime/test/tasking/issue-50602.c
@@ -1,5 +1,8 @@
 // RUN: %libomp-compile-and-run
 // RUN: env OMP_NUM_THREADS=1 %libomp-run
+// RUN: %libomp-compile -DUSE_HIDDEN_HELPERS=1
+// RUN: %libomp-run
+// RUN: env OMP_NUM_THREADS=1 %libomp-run
 #include <omp.h>
 
 int main(int argc, char *argv[]) {
@@ -11,16 +14,25 @@ int main(int argc, char *argv[]) {
   for (i = 0; i < 10; ++i) {
 #pragma omp parallel
     {
+#ifndef USE_HIDDEN_HELPERS
       omp_event_handle_t event;
+#endif
       int a = 0;
 
+#ifdef USE_HIDDEN_HELPERS
+#pragma omp target map(tofrom: a) nowait
+#else
 #pragma omp task shared(a) detach(event)
+#endif
       { a = 1; }
 
 #pragma omp parallel
       { a = 2; }
 
+#ifndef USE_HIDDEN_HELPERS
       omp_fulfill_event(event);
+#endif
+
 #pragma omp taskwait
     }
   }
diff --git a/openmp/runtime/test/tasking/issue-69733.c b/openmp/runtime/test/tasking/issue-69733.c
index 55764d76500a79..b1577770dfbaaf 100644
--- a/openmp/runtime/test/tasking/issue-69733.c
+++ b/openmp/runtime/test/tasking/issue-69733.c
@@ -1,7 +1,7 @@
 // RUN: %libomp-compile-and-run
 #include <omp.h>
 
-void nested_parallel(int nth1, int nth2) {
+void nested_parallel_detached(int nth1, int nth2) {
 #pragma omp parallel num_threads(nth1)
   {
 #pragma omp parallel num_threads(nth2)
@@ -14,6 +14,16 @@ void nested_parallel(int nth1, int nth2) {
   }
 }
 
+void nested_parallel_hidden_helpers(int nth1, int nth2) {
+#pragma omp parallel num_threads(nth1)
+  {
+#pragma omp parallel num_threads(nth2)
+    {
+#pragma omp target nowait
+      {}
+    }
+  }
+}
 int main() {
   int i;
 
@@ -21,13 +31,22 @@ int main() {
   omp_set_dynamic(0);
 
   for (i = 0; i < 10; ++i)
-    nested_parallel(1, 1);
+    nested_parallel_detached(1, 1);
+  for (i = 0; i < 10; ++i)
+    nested_parallel_detached(1, 2);
+  for (i = 0; i < 10; ++i)
+    nested_parallel_detached(2, 1);
+  for (i = 0; i < 10; ++i)
+    nested_parallel_detached(2, 2);
+
+  for (i = 0; i < 10; ++i)
+    nested_parallel_hidden_helpers(1, 1);
   for (i = 0; i < 10; ++i)
-    nested_parallel(1, 2);
+    nested_parallel_hidden_helpers(1, 2);
   for (i = 0; i < 10; ++i)
-    nested_parallel(2, 1);
+    nested_parallel_hidden_helpers(2, 1);
   for (i = 0; i < 10; ++i)
-    nested_parallel(2, 2);
+    nested_parallel_hidden_helpers(2, 2);
 
   return 0;
 }
diff --git a/openmp/runtime/test/tasking/issue-79416.c b/openmp/runtime/test/tasking/issue-79416.c
index 6ca944a55225f5..03e348c02c311b 100644
--- a/openmp/runtime/test/tasking/issue-79416.c
+++ b/openmp/runtime/test/tasking/issue-79416.c
@@ -9,9 +9,20 @@ void run(int teams, int th) {
 }
 
 int main() {
-  fprintf(stderr, "run(1, 2)\n");
-  run(1, 2);
-  fprintf(stderr, "run(1, 3)\n");
-  run(1, 3);
+  int i;
+  for (i = 0; i < 10; ++i) {
+    printf("run(1, 1)\n");
+    run(1, 1);
+    printf("run(1, 2)\n");
+    run(1, 2);
+    printf("run(1, 3)\n");
+    run(1, 3);
+    printf("run(2, 1)\n");
+    run(2, 1);
+    printf("run(2, 2)\n");
+    run(2, 2);
+    printf("run(2, 3)\n");
+    run(2, 3);
+  }
   return 0;
 }
diff --git a/openmp/runtime/test/tasking/task_teams_stress_test.cpp b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
index ee7a3af11e8c2f..ae240781a059e0 100644
--- a/openmp/runtime/test/tasking/task_teams_stress_test.cpp
+++ b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
@@ -6,6 +6,15 @@
 // RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
 // RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
 // RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
+//
+// RUN: %libomp-cxx-compile -DUSE_HIDDEN_HELPERS=1
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=0 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=1 KMP_HOT_TEAMS_MODE=1 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=2 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=3 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=4 %libomp-run
+// RUN: env KMP_HOT_TEAMS_MAX_LEVEL=5 %libomp-run
 
 // This test stresses the task team mechanism by running a simple
 // increment task over and over with varying number of threads and nesting.
@@ -44,6 +53,16 @@ void task_inc_split_a(int *a, int low, int high) {
   }
 }
 
+#ifdef USE_HIDDEN_HELPERS
+// Hidden helper tasks force serial regions to create task teams
+void task_inc_a_hidden_helper(int *a) {
+#pragma omp target map(tofrom: a[0]) nowait
+  {
+#pragma omp atomic
+    (*a)++;
+  }
+}
+#else
 // Detached tasks force serial regions to create task teams
 void task_inc_a_detached(int *a, omp_event_handle_t handle) {
 #pragma omp task detach(handle)
@@ -53,6 +72,7 @@ void task_inc_a_detached(int *a, omp_event_handle_t handle) {
     omp_fulfill_event(handle);
   }
 }
+#endif
 
 void check_a(int *a, int expected) {
   if (*a != expected) {
@@ -76,7 +96,11 @@ void test_tasks(omp_event_handle_t *handles, int expected, int *a) {
   check_a(a, expected);
 #pragma omp barrier
 
-  task_inc_a_detached(a, handles[tid]);
+#ifdef USE_HIDDEN_HELPERS
+  task_inc_a_hidden_helper(a);
+#else
+  task_inc_a_detached(a, handles);
+#endif
 
 #pragma omp barrier
   check_a(a, 2 * expected);
@@ -169,7 +193,7 @@ void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) {
   check_a(a, expected);
 #pragma omp barrier
 
-  task_inc_a_detached(a, handles[tid]); // task team B
+  task_inc_a_for_serial(a, handles[tid]); // task team B
 
 #pragma omp barrier
   check_a(a, 2 * expected);

>From 5da12c265af119013b75185ef02d333f620d2fd5 Mon Sep 17 00:00:00 2001
From: Jonathan Peyton <jonathan.l.peyton at intel.com>
Date: Mon, 1 Apr 2024 12:17:54 -0500
Subject: [PATCH 6/6] [OpenMP] update tests further for testing task teams

---
 openmp/runtime/src/kmp_tasking.cpp            |  70 +++++-----
 openmp/runtime/test/tasking/issue-69733.c     | 127 +++++++++++++++---
 openmp/runtime/test/tasking/issue-79416.c     |  45 ++++---
 .../test/tasking/task_teams_stress_test.cpp   |   8 +-
 4 files changed, 174 insertions(+), 76 deletions(-)

diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index cb9beeae1b2ba4..290869620a4260 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3386,8 +3386,6 @@ static inline int __kmp_execute_tasks_template(
 
   nthreads = task_team->tt.tt_nproc;
   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
-  KMP_DEBUG_ASSERT(nthreads > 1 || task_team->tt.tt_found_proxy_tasks ||
-                   task_team->tt.tt_hidden_helper_task_encountered);
   KMP_DEBUG_ASSERT(*unfinished_threads >= 0);
 
   while (1) { // Outer loop keeps trying to find tasks in case of single thread
@@ -4158,9 +4156,12 @@ void __kmp_wait_to_unref_task_teams(void) {
 void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
 
-  // For serial teams, setup the first task team pointer to point to task team.
-  // The other pointer is a stack of task teams from previous serial levels.
-  if (team->t.t_nproc == 1) {
+  // For the serial and root teams, setup the first task team pointer to point
+  // to task team. The other pointer is a stack of task teams from previous
+  // serial levels.
+  if (team == this_thr->th.th_serial_team ||
+      team == this_thr->th.th_root->r.r_root_team) {
+    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
     if (team->t.t_task_team[0] == NULL) {
       team->t.t_task_team[0] = __kmp_allocate_task_team(this_thr, team);
       KA_TRACE(20,
@@ -4168,16 +4169,16 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
                 " for serial/root team %p\n",
                 __kmp_gtid_from_thread(this_thr), team->t.t_task_team[0], team));
 
-      return;
     } else
       __kmp_task_team_init(team->t.t_task_team[0], team);
+    return;
   }
+
   // If this task_team hasn't been created yet, allocate it. It will be used in
   // the region after the next.
   // If it exists, it is the current task team and shouldn't be touched yet as
   // it may still be in use.
-  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL &&
-      team->t.t_nproc > 1) {
+  if (team->t.t_task_team[this_thr->th.th_task_state] == NULL) {
     team->t.t_task_team[this_thr->th.th_task_state] =
         __kmp_allocate_task_team(this_thr, team);
     KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created new task_team %p"
@@ -4192,29 +4193,26 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
   // threads spin in the barrier release phase, they will continue to use the
   // previous task_team struct(above), until they receive the signal to stop
   // checking for tasks (they can't safely reference the kmp_team_t struct,
-  // which could be reallocated by the primary thread). No task teams are formed
-  // for serialized teams.
-  if (team->t.t_nproc > 1) {
-    int other_team = 1 - this_thr->th.th_task_state;
-    KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
-    if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
-      team->t.t_task_team[other_team] =
-          __kmp_allocate_task_team(this_thr, team);
-      KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
-                    "task_team %p for team %d at parity=%d\n",
-                    __kmp_gtid_from_thread(this_thr),
-                    team->t.t_task_team[other_team], team->t.t_id, other_team));
-    } else { // Leave the old task team struct in place for the upcoming region;
-      // adjust as needed
-      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
-      __kmp_task_team_init(task_team, team);
-      // if team size has changed, the first thread to enable tasking will
-      // realloc threads_data if necessary
-      KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
-                    "%p for team %d at parity=%d\n",
-                    __kmp_gtid_from_thread(this_thr),
-                    team->t.t_task_team[other_team], team->t.t_id, other_team));
-    }
+  // which could be reallocated by the primary thread).
+  int other_team = 1 - this_thr->th.th_task_state;
+  KMP_DEBUG_ASSERT(other_team >= 0 && other_team < 2);
+  if (team->t.t_task_team[other_team] == NULL) { // setup other team as well
+    team->t.t_task_team[other_team] =
+        __kmp_allocate_task_team(this_thr, team);
+    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d created second new "
+                  "task_team %p for team %d at parity=%d\n",
+                  __kmp_gtid_from_thread(this_thr),
+                  team->t.t_task_team[other_team], team->t.t_id, other_team));
+  } else { // Leave the old task team struct in place for the upcoming region;
+    // adjust as needed
+    kmp_task_team_t *task_team = team->t.t_task_team[other_team];
+    __kmp_task_team_init(task_team, team);
+    // if team size has changed, the first thread to enable tasking will
+    // realloc threads_data if necessary
+    KA_TRACE(20, ("__kmp_task_team_setup: Primary T#%d reset next task_team "
+                  "%p for team %d at parity=%d\n",
+                  __kmp_gtid_from_thread(this_thr),
+                  team->t.t_task_team[other_team], team->t.t_id, other_team));
   }
 
   // For regular thread, task enabling should be called when the task is going
@@ -4240,9 +4238,11 @@ void __kmp_task_team_setup(kmp_info_t *this_thr, kmp_team_t *team) {
 
 // __kmp_task_team_sync: Propagation of task team data from team to threads
 // which happens just after the release phase of a team barrier.  This may be
-// called by any thread, but only for teams with # threads > 1.
+// called by any thread. This is not called for serial or root teams.
 void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
+  KMP_DEBUG_ASSERT(team != this_thr->th.th_serial_team);
+  KMP_DEBUG_ASSERT(team != this_thr->th.th_root->r.r_root_team);
 
   // Toggle the th_task_state field, to switch which task_team this thread
   // refers to
@@ -4260,8 +4260,7 @@ void __kmp_task_team_sync(kmp_info_t *this_thr, kmp_team_t *team) {
 }
 
 // __kmp_task_team_wait: Primary thread waits for outstanding tasks after the
-// barrier gather phase. Only called by primary thread if #threads in team > 1
-// or if proxy tasks were created.
+// barrier gather phase. Only called by the primary thread.
 //
 // wait is a flag that defaults to 1 (see kmp.h), but waiting can be turned off
 // by passing in 0 optionally as the last argument. When wait is zero, primary
@@ -4295,9 +4294,6 @@ void __kmp_task_team_wait(
         ("__kmp_task_team_wait: Primary T#%d deactivating task_team %p: "
          "setting active to false, setting local and team's pointer to NULL\n",
          __kmp_gtid_from_thread(this_thr), task_team));
-    KMP_DEBUG_ASSERT(task_team->tt.tt_nproc > 1 ||
-                     task_team->tt.tt_found_proxy_tasks == TRUE ||
-                     task_team->tt.tt_hidden_helper_task_encountered == TRUE);
     TCW_SYNC_4(task_team->tt.tt_found_proxy_tasks, FALSE);
     TCW_SYNC_4(task_team->tt.tt_hidden_helper_task_encountered, FALSE);
     KMP_CHECK_UPDATE(task_team->tt.tt_untied_task_encountered, 0);
diff --git a/openmp/runtime/test/tasking/issue-69733.c b/openmp/runtime/test/tasking/issue-69733.c
index b1577770dfbaaf..172056ed2eb006 100644
--- a/openmp/runtime/test/tasking/issue-69733.c
+++ b/openmp/runtime/test/tasking/issue-69733.c
@@ -1,52 +1,145 @@
 // RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <stdlib.h>
 #include <omp.h>
 
+int a;
+
+void inc_a() {
+#pragma omp atomic
+  a++;
+}
+
+void root_team_detached() {
+  a = 0;
+  omp_event_handle_t ev;
+#pragma omp task detach(ev)
+  inc_a();
+  omp_fulfill_event(ev);
+  if (a != 1) {
+    fprintf(stderr, "error: root_team_detached(): a != 1\n");
+    exit(EXIT_FAILURE);
+  }
+}
+
+void root_team_hidden_helpers() {
+  a = 0;
+#pragma omp target nowait
+  inc_a();
+
+#pragma omp taskwait
+
+  if (a != 1) {
+    fprintf(stderr, "error: root_team_hidden_helpers(): a != 1\n");
+    exit(EXIT_FAILURE);
+  }
+}
+
+void parallel_detached(int nth1) {
+  a = 0;
+  omp_event_handle_t *evs = (omp_event_handle_t*)malloc(sizeof(omp_event_handle_t) * nth1);
+#pragma omp parallel num_threads(nth1)
+  {
+    int tid = omp_get_thread_num();
+    omp_event_handle_t e = evs[tid];
+#pragma omp task detach(e)
+    inc_a();
+    omp_fulfill_event(e);
+  }
+  free(evs);
+  if (a != nth1) {
+    fprintf(stderr, "error: parallel_detached(): a (%d) != %d\n", a, nth1);
+    exit(EXIT_FAILURE);
+  }
+}
+
+void parallel_hidden_helpers(int nth1) {
+  a = 0;
+#pragma omp parallel num_threads(nth1)
+  {
+#pragma omp target nowait
+    inc_a();
+  }
+  if (a != nth1) {
+    fprintf(stderr,
+            "error: parallel_hidden_helpers(): a (%d) != %d\n", a, nth1);
+    exit(EXIT_FAILURE);
+  }
+}
+
 void nested_parallel_detached(int nth1, int nth2) {
+  a = 0;
+  omp_event_handle_t **evs = (omp_event_handle_t**)malloc(sizeof(omp_event_handle_t*) * nth1);
 #pragma omp parallel num_threads(nth1)
   {
-#pragma omp parallel num_threads(nth2)
+    int tid = omp_get_thread_num();
+    evs[tid] = (omp_event_handle_t*)malloc(sizeof(omp_event_handle_t) * nth2);
+#pragma omp parallel num_threads(nth2) shared(tid)
     {
-      omp_event_handle_t ev;
-#pragma omp task detach(ev)
-      {}
-      omp_fulfill_event(ev);
+      int tid2 = omp_get_thread_num();
+      omp_event_handle_t e = evs[tid][tid2];
+#pragma omp task detach(e)
+      inc_a();
+      omp_fulfill_event(e);
     }
+    free(evs[tid]);
+  }
+  free(evs);
+  if (a != nth1 * nth2) {
+    fprintf(stderr, "error: nested_parallel_detached(): a (%d) != %d * %d\n", a,
+            nth1, nth2);
+    exit(EXIT_FAILURE);
   }
 }
 
 void nested_parallel_hidden_helpers(int nth1, int nth2) {
+  a = 0;
 #pragma omp parallel num_threads(nth1)
   {
 #pragma omp parallel num_threads(nth2)
     {
 #pragma omp target nowait
-      {}
+      inc_a();
     }
   }
+  if (a != nth1 * nth2) {
+    fprintf(stderr,
+            "error: nested_parallel_hidden_helpers(): a (%d) != %d * %d\n", a,
+            nth1, nth2);
+    exit(EXIT_FAILURE);
+  }
 }
+
 int main() {
-  int i;
+  int i, nth1, nth2;
 
   omp_set_max_active_levels(2);
   omp_set_dynamic(0);
 
   for (i = 0; i < 10; ++i)
-    nested_parallel_detached(1, 1);
-  for (i = 0; i < 10; ++i)
-    nested_parallel_detached(1, 2);
-  for (i = 0; i < 10; ++i)
-    nested_parallel_detached(2, 1);
+    root_team_detached();
+
   for (i = 0; i < 10; ++i)
-    nested_parallel_detached(2, 2);
+    root_team_hidden_helpers();
 
   for (i = 0; i < 10; ++i)
-    nested_parallel_hidden_helpers(1, 1);
+    for (nth1 = 1; nth1 <= 4; ++nth1)
+      parallel_detached(nth1);
+
   for (i = 0; i < 10; ++i)
-    nested_parallel_hidden_helpers(1, 2);
+    for (nth1 = 1; nth1 <= 4; ++nth1)
+      parallel_hidden_helpers(nth1);
+
   for (i = 0; i < 10; ++i)
-    nested_parallel_hidden_helpers(2, 1);
+    for (nth1 = 1; nth1 <= 4; ++nth1)
+      for (nth2 = 1; nth2 <= 4; ++nth2)
+      nested_parallel_detached(nth1, nth2);
+
   for (i = 0; i < 10; ++i)
-    nested_parallel_hidden_helpers(2, 2);
+    for (nth1 = 1; nth1 <= 4; ++nth1)
+      for (nth2 = 1; nth2 <= 4; ++nth2)
+        nested_parallel_hidden_helpers(nth1, nth2);
 
   return 0;
 }
diff --git a/openmp/runtime/test/tasking/issue-79416.c b/openmp/runtime/test/tasking/issue-79416.c
index 03e348c02c311b..ee96fce809745e 100644
--- a/openmp/runtime/test/tasking/issue-79416.c
+++ b/openmp/runtime/test/tasking/issue-79416.c
@@ -1,28 +1,33 @@
 // RUN: %libomp-compile-and-run
 #include <stdio.h>
+#include <stdlib.h>
 
-void run(int teams, int th) {
-#pragma omp teams num_teams(teams)
-#pragma omp parallel num_threads(th)
+int a;
+
+void run(int nteams, int nth) {
+  a = 0;
+#pragma omp teams num_teams(nteams)
+  {
+#pragma omp parallel num_threads(nth)
+    {
 #pragma omp task
-  {}
+      {
+#pragma omp atomic
+        a++;
+      }
+    }
+  }
+  if (a == 0)
+    exit(EXIT_FAILURE);
 }
 
 int main() {
-  int i;
-  for (i = 0; i < 10; ++i) {
-    printf("run(1, 1)\n");
-    run(1, 1);
-    printf("run(1, 2)\n");
-    run(1, 2);
-    printf("run(1, 3)\n");
-    run(1, 3);
-    printf("run(2, 1)\n");
-    run(2, 1);
-    printf("run(2, 2)\n");
-    run(2, 2);
-    printf("run(2, 3)\n");
-    run(2, 3);
-  }
-  return 0;
+  int i, nteams, nth;
+  for (nteams = 1; nteams <= 2; ++nteams)
+    for (nth = 1; nth <= 3; ++nth)
+      for (i = 0; i < 10; ++i) {
+        printf("run(%d, %d)\n", nteams, nth);
+        run(nteams, nth);
+      }
+  return EXIT_SUCCESS;
 }
diff --git a/openmp/runtime/test/tasking/task_teams_stress_test.cpp b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
index ae240781a059e0..d124af67910140 100644
--- a/openmp/runtime/test/tasking/task_teams_stress_test.cpp
+++ b/openmp/runtime/test/tasking/task_teams_stress_test.cpp
@@ -99,7 +99,7 @@ void test_tasks(omp_event_handle_t *handles, int expected, int *a) {
 #ifdef USE_HIDDEN_HELPERS
   task_inc_a_hidden_helper(a);
 #else
-  task_inc_a_detached(a, handles);
+  task_inc_a_detached(a, handles[tid]);
 #endif
 
 #pragma omp barrier
@@ -193,7 +193,11 @@ void test_tasks_split(omp_event_handle_t *handles, int expected, int *a) {
   check_a(a, expected);
 #pragma omp barrier
 
-  task_inc_a_for_serial(a, handles[tid]); // task team B
+#ifdef USE_HIDDEN_HELPERS
+  task_inc_a_hidden_helper(a);
+#else
+  task_inc_a_detached(a, handles[tid]);
+#endif
 
 #pragma omp barrier
   check_a(a, 2 * expected);



More information about the Openmp-commits mailing list