[Openmp-commits] [openmp] [OpenMP] Adding a throttling threshold to bound dependent tasking mem… (PR #82274)

PEREIRA Romain via Openmp-commits openmp-commits at lists.llvm.org
Sat Mar 9 17:11:21 PST 2024


https://github.com/rpereira-dev updated https://github.com/llvm/llvm-project/pull/82274

>From 459dfd35d47fbb0a1a7f2f0408febd18eb745f1b Mon Sep 17 00:00:00 2001
From: Romain Pereira <romain.pereira at inria.fr>
Date: Mon, 19 Feb 2024 20:21:51 +0100
Subject: [PATCH 1/8] [OpenMP] Adding a throttling threshold to bound dependent
 tasking memory footprint

---
 openmp/runtime/src/kmp.h                      |  4 ++
 openmp/runtime/src/kmp_global.cpp             | 13 +++-
 openmp/runtime/src/kmp_settings.cpp           | 37 ++++++++++-
 openmp/runtime/src/kmp_tasking.cpp            | 53 ++++++++++------
 .../runtime/test/tasking/omp_throttling_max.c | 62 +++++++++++++++++++
 .../omp_throttling_max_ready_per_thread.c     | 62 +++++++++++++++++++
 6 files changed, 211 insertions(+), 20 deletions(-)
 create mode 100644 openmp/runtime/test/tasking/omp_throttling_max.c
 create mode 100644 openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 259c57b5afbca5..5409004a7e9d53 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2422,7 +2422,11 @@ typedef enum kmp_tasking_mode {
 extern kmp_tasking_mode_t
     __kmp_tasking_mode; /* determines how/when to execute tasks */
 extern int __kmp_task_stealing_constraint;
+extern std::atomic<kmp_int32> __kmp_n_tasks_in_flight;
 extern int __kmp_enable_task_throttling;
+extern kmp_int32 __kmp_task_maximum;
+extern kmp_int32 __kmp_task_maximum_ready_per_thread;
+
 extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if
 // specified, defaults to 0 otherwise
 // Set via OMP_MAX_TASK_PRIORITY if specified, defaults to 0 otherwise
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 5017cd3de4be57..6dc9ac2d175246 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -353,8 +353,19 @@ omp_memspace_handle_t const llvm_omp_target_device_mem_space =
 KMP_BUILD_ASSERT(sizeof(kmp_tasking_flags_t) == 4);
 
 int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */
-int __kmp_enable_task_throttling = 1;
 
+std::atomic<kmp_int32> __kmp_n_tasks_in_flight = 0; /* n° of tasks in flight */
+
+kmp_int32 __kmp_enable_task_throttling = 1; /* Serialize tasks once a threshold
+                                            is reached, such as the number of
+                                            ready tasks or the total number of
+                                            tasks */
+
+kmp_int32 __kmp_task_maximum = 65536; /* number of tasks threshold before
+                                         serializing */
+
+kmp_int32 __kmp_task_maximum_ready_per_thread = 256; /* number of ready tasks
+                                                        before serializing */
 #ifdef DEBUG_SUSPEND
 int __kmp_suspend_count = 0;
 #endif
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index ec86ee07472c1e..8491da4a3371f2 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -5360,6 +5360,33 @@ static void __kmp_stg_print_task_throttling(kmp_str_buf_t *buffer,
   __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling);
 } // __kmp_stg_print_task_throttling
 
+// -----------------------------------------------------------------------------
+// KMP_TASK_MAXIMUM
+static void __kmp_stg_parse_task_maximum(char const *name, char const *value,
+                                         void *data) {
+  __kmp_stg_parse_int(name, value, 1, INT_MAX, &__kmp_task_maximum);
+} // __kmp_stg_parse_task_maximum
+
+static void __kmp_stg_print_task_maximum(kmp_str_buf_t *buffer,
+                                         char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_task_maximum);
+} // __kmp_stg_print_task_maximum
+
+// -----------------------------------------------------------------------------
+// KMP_TASK_MAXIMUM_READY_PER_THREAD
+static void __kmp_stg_parse_task_maximum_ready_per_thread(char const *name,
+                                                          char const *value,
+                                                          void *data) {
+  __kmp_stg_parse_int(name, value, 1, INT_MAX,
+                      &__kmp_task_maximum_ready_per_thread);
+} // __kmp_stg_parse_task_maximum_ready_per_thread
+
+static void __kmp_stg_print_task_maximum_ready_per_thread(kmp_str_buf_t *buffer,
+                                                          char const *name,
+                                                          void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_task_maximum_ready_per_thread);
+} // __kmp_stg_print_task_maximum_ready_per_thread
+
 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
 // -----------------------------------------------------------------------------
 // KMP_USER_LEVEL_MWAIT
@@ -5750,6 +5777,13 @@ static kmp_setting_t __kmp_stg_table[] = {
     {"KMP_ENABLE_TASK_THROTTLING", __kmp_stg_parse_task_throttling,
      __kmp_stg_print_task_throttling, NULL, 0, 0},
 
+    {"KMP_TASK_MAXIMUM", __kmp_stg_parse_task_maximum,
+     __kmp_stg_print_task_maximum, NULL, 0, 0},
+
+    {"KMP_TASK_MAXIMUM_READY_PER_THREAD",
+     __kmp_stg_parse_task_maximum_ready_per_thread,
+     __kmp_stg_print_task_maximum_ready_per_thread, NULL, 0, 0},
+
     {"OMP_DISPLAY_ENV", __kmp_stg_parse_omp_display_env,
      __kmp_stg_print_omp_display_env, NULL, 0, 0},
     {"OMP_CANCELLATION", __kmp_stg_parse_omp_cancellation,
@@ -5764,7 +5798,8 @@ static kmp_setting_t __kmp_stg_table[] = {
 #if OMPX_TASKGRAPH
     {"KMP_MAX_TDGS", __kmp_stg_parse_max_tdgs, __kmp_std_print_max_tdgs, NULL,
      0, 0},
-    {"KMP_TDG_DOT", __kmp_stg_parse_tdg_dot, __kmp_stg_print_tdg_dot, NULL, 0, 0},
+    {"KMP_TDG_DOT", __kmp_stg_parse_tdg_dot, __kmp_stg_print_tdg_dot, NULL, 0,
+     0},
 #endif
 
 #if OMPT_SUPPORT
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 6e8b948efa064f..9cfb0486fc71da 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -438,10 +438,9 @@ static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
 
   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
   // Check if deque is full
-  if (TCR_4(thread_data->td.td_deque_ntasks) >=
-      TASK_DEQUE_SIZE(thread_data->td)) {
-    if (__kmp_enable_task_throttling &&
-        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
+  if (__kmp_enable_task_throttling && TCR_4(thread_data->td.td_deque_ntasks) >=
+                                          __kmp_task_maximum_ready_per_thread) {
+    if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                               thread->th.th_current_task)) {
       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
       KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
@@ -543,40 +542,51 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
 
   int locked = 0;
   // Check if deque is full
-  if (TCR_4(thread_data->td.td_deque_ntasks) >=
-      TASK_DEQUE_SIZE(thread_data->td)) {
-    if (__kmp_enable_task_throttling &&
+  int requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >=
+                        TASK_DEQUE_SIZE(thread_data->td);
+  int requires_throttling =
+      __kmp_enable_task_throttling && TCR_4(thread_data->td.td_deque_ntasks) >=
+                                          __kmp_task_maximum_ready_per_thread;
+  int thread_can_execute;
+  if (requires_resize || requires_throttling) {
+    thread_can_execute =
         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
-                              thread->th.th_current_task)) {
+                              thread->th.th_current_task);
+    if (requires_throttling && thread_can_execute) {
       KA_TRACE(20, ("__kmp_push_task: T#%d deque is full; returning "
                     "TASK_NOT_PUSHED for task %p\n",
                     gtid, taskdata));
       return TASK_NOT_PUSHED;
-    } else {
+    } else { /* maybe requires_resize */
       __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
       locked = 1;
-      if (TCR_4(thread_data->td.td_deque_ntasks) >=
-          TASK_DEQUE_SIZE(thread_data->td)) {
-        // expand deque to push the task which is not allowed to execute
+      requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >=
+                        TASK_DEQUE_SIZE(thread_data->td);
+      // expand deque to push the task which is not allowed to execute
+      if (requires_resize)
         __kmp_realloc_task_deque(thread, thread_data);
-      }
     }
   }
   // Lock the deque for the task push operation
   if (!locked) {
     __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
     // Need to recheck as we can get a proxy task from thread outside of OpenMP
-    if (TCR_4(thread_data->td.td_deque_ntasks) >=
-        TASK_DEQUE_SIZE(thread_data->td)) {
-      if (__kmp_enable_task_throttling &&
+    requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >=
+                      TASK_DEQUE_SIZE(thread_data->td);
+    requires_throttling = __kmp_enable_task_throttling &&
+                          TCR_4(thread_data->td.td_deque_ntasks) >=
+                              __kmp_task_maximum_ready_per_thread;
+    if (requires_resize || requires_throttling) {
+      thread_can_execute =
           __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
-                                thread->th.th_current_task)) {
+                                thread->th.th_current_task);
+      if (requires_throttling && thread_can_execute) {
         __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
         KA_TRACE(20, ("__kmp_push_task: T#%d deque is full on 2nd check; "
                       "returning TASK_NOT_PUSHED for task %p\n",
                       gtid, taskdata));
         return TASK_NOT_PUSHED;
-      } else {
+      } else { /* requires_resize */
         // expand deque to push the task which is not allowed to execute
         __kmp_realloc_task_deque(thread, thread_data);
       }
@@ -914,6 +924,7 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
 #else /* ! USE_FAST_MEMORY */
   __kmp_thread_free(thread, taskdata);
 #endif
+  --__kmp_n_tasks_in_flight;
 #if OMPX_TASKGRAPH
   } else {
     taskdata->td_flags.complete = 0;
@@ -1464,6 +1475,11 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
     __kmp_middle_initialize();
 
+  // task throttling: to many tasks co-existing, emptying queue now
+  if (__kmp_enable_task_throttling)
+    while (TCR_4(__kmp_n_tasks_in_flight.load()) >= __kmp_task_maximum)
+      __kmpc_omp_taskyield(NULL, gtid, 0);
+
   if (flags->hidden_helper) {
     if (__kmp_enable_hidden_helper) {
       if (!TCR_4(__kmp_init_hidden_helper))
@@ -1558,6 +1574,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                                sizeof_shareds);
 #endif /* USE_FAST_MEMORY */
+  ++__kmp_n_tasks_in_flight;
 
   task = KMP_TASKDATA_TO_TASK(taskdata);
 
diff --git a/openmp/runtime/test/tasking/omp_throttling_max.c b/openmp/runtime/test/tasking/omp_throttling_max.c
new file mode 100644
index 00000000000000..582927c713fd34
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_throttling_max.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=0      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=1      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=256    %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=65536  %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=100000 %libomp-run
+
+/**
+ *  This test ensures that task throttling on the maximum number of tasks
+ *  threshold works properly.
+ *
+ *  It creates 2 threads (1 producer, 1 consumer)
+ *  The producer creates tasks 'T_i' indefinitely until one is executed
+ *  The consumer is blocked until the producer starts throttling
+ *  Executing any 'T_i' unblocks the consumer and stops the producer
+ *
+ *  The assertion ensures that the producer does not create more than the
+ *  total number of tasks provided by the programmer
+ */
+
+#include <assert.h>
+#include <omp.h>
+#include <stdlib.h>
+
+/* default value */
+#define MAX_TASKS_DEFAULT (65536)
+
+int main(void) {
+  /* maximum number of tasks in-flight */
+  char *max_tasks_str = getenv("KMP_TASK_MAXIMUM");
+  int max_tasks = max_tasks_str ? atoi(max_tasks_str) : MAX_TASKS_DEFAULT;
+  if (max_tasks <= 0)
+    max_tasks = 1;
+
+  /* check if throttling is enabled (it is by default) */
+  char *throttling_str = getenv("KMP_ENABLE_TASK_THROTTLING");
+  int throttling = throttling_str ? *throttling_str == '1' : 1;
+  assert(throttling);
+
+  volatile int done = 0;
+
+/* testing KMP_TASK_MAXIMUM */
+#pragma omp parallel num_threads(2) default(none)                              \
+    shared(max_tasks, throttling, done)
+  {
+    if (omp_get_thread_num() == 1)
+      while (!done)
+        ;
+
+#pragma omp master
+    {
+      int ntasks = 0;
+      while (!done) {
+#pragma omp task default(none) shared(done) depend(out : max_tasks, throttling)
+        done = 1;
+
+        assert(++ntasks <= max_tasks + 1);
+      }
+    }
+  }
+
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
new file mode 100644
index 00000000000000..6d801971d7af19
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
@@ -0,0 +1,62 @@
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=0      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=1      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=256    %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=65536  %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=100000 %libomp-run
+
+/**
+ *  This test ensures that task throttling on the maximum number of ready tasks
+ *  per thread threshold works properly.
+ *
+ *  It creates 2 threads (1 producer, 1 consumer)
+ *  The producer creates tasks 'T_i' indefinitely until one is executed
+ *  The consumer is blocked until the producer starts throttling
+ *  Executing any 'T_i' unblocks the consumer and stops the producer
+ *
+ *  The assertion ensures that the producer does not create more than the
+ *  total number of tasks provided by the programmer
+ */
+
+#include <assert.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define MAX_TASKS_READY_DEFAULT (1 << 8)
+
+int main(void) {
+  /* maximum number of ready tasks in-flight */
+  char *max_tasks_ready_str = getenv("KMP_TASK_MAXIMUM_READY_PER_THREAD");
+  int max_tasks_ready =
+      max_tasks_ready_str ? atoi(max_tasks_ready_str) : MAX_TASKS_READY_DEFAULT;
+  if (max_tasks_ready <= 0)
+    max_tasks_ready = 1;
+
+  /* check if throttling is enabled (it is by default) */
+  char *throttling_str = getenv("KMP_ENABLE_TASK_THROTTLING");
+  int throttling = throttling_str ? *throttling_str == '1' : 1;
+
+  volatile int done = 0;
+
+/* testing KMP_TASK_MAXIMUM_READY */
+#pragma omp parallel num_threads(2) default(none)                              \
+    shared(max_tasks_ready, throttling, done)
+  {
+    if (omp_get_thread_num() == 1)
+      while (!done)
+        ;
+
+#pragma omp master
+    {
+      int ntasks = 0;
+      while (!done) {
+#pragma omp task default(none) shared(done)
+        done = 1;
+
+        assert(++ntasks <= max_tasks_ready + 1);
+      }
+    }
+  }
+
+  return 0;
+}

>From ed327625cc1de7f0f71c63f3ae59b6ac621e2c76 Mon Sep 17 00:00:00 2001
From: Romain Pereira <romain.pereira at inria.fr>
Date: Tue, 20 Feb 2024 09:46:05 +0100
Subject: [PATCH 2/8] Fix comment typos, disabled clang-format on unit tests
 header, replaced deprecated 'master' by 'single' in unit tests

---
 openmp/runtime/src/kmp_global.cpp                             | 2 +-
 openmp/runtime/src/kmp_tasking.cpp                            | 2 +-
 openmp/runtime/test/tasking/omp_throttling_max.c              | 4 +++-
 .../test/tasking/omp_throttling_max_ready_per_thread.c        | 4 +++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 6dc9ac2d175246..38bef05ffb433f 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -354,7 +354,7 @@ KMP_BUILD_ASSERT(sizeof(kmp_tasking_flags_t) == 4);
 
 int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */
 
-std::atomic<kmp_int32> __kmp_n_tasks_in_flight = 0; /* n° of tasks in flight */
+std::atomic<kmp_int32> __kmp_n_tasks_in_flight = 0; /* n° of tasks in flight */
 
 kmp_int32 __kmp_enable_task_throttling = 1; /* Serialize tasks once a threshold
                                             is reached, such as the number of
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 9cfb0486fc71da..571b6eef69926d 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -1475,7 +1475,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
     __kmp_middle_initialize();
 
-  // task throttling: to many tasks co-existing, emptying queue now
+  // task throttling: too many tasks co-existing, emptying queue now
   if (__kmp_enable_task_throttling)
     while (TCR_4(__kmp_n_tasks_in_flight.load()) >= __kmp_task_maximum)
       __kmpc_omp_taskyield(NULL, gtid, 0);
diff --git a/openmp/runtime/test/tasking/omp_throttling_max.c b/openmp/runtime/test/tasking/omp_throttling_max.c
index 582927c713fd34..a050f3c2a4302e 100644
--- a/openmp/runtime/test/tasking/omp_throttling_max.c
+++ b/openmp/runtime/test/tasking/omp_throttling_max.c
@@ -1,8 +1,10 @@
+// clang-format off
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=0      %libomp-run
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=1      %libomp-run
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=256    %libomp-run
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=65536  %libomp-run
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=100000 %libomp-run
+// clang-format on
 
 /**
  *  This test ensures that task throttling on the maximum number of tasks
@@ -46,7 +48,7 @@ int main(void) {
       while (!done)
         ;
 
-#pragma omp master
+#pragma omp single
     {
       int ntasks = 0;
       while (!done) {
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
index 6d801971d7af19..b38936c9c51047 100644
--- a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
+++ b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
@@ -1,8 +1,10 @@
+// clang-format off
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=0      %libomp-run
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=1      %libomp-run
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=256    %libomp-run
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=65536  %libomp-run
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=100000 %libomp-run
+// clang-format on
 
 /**
  *  This test ensures that task throttling on the maximum number of ready tasks
@@ -46,7 +48,7 @@ int main(void) {
       while (!done)
         ;
 
-#pragma omp master
+#pragma omp single
     {
       int ntasks = 0;
       while (!done) {

>From 4ea80bcecc31640585193377860ab52b8a055536 Mon Sep 17 00:00:00 2001
From: Romain Pereira <romain.pereira at inria.fr>
Date: Fri, 23 Feb 2024 17:49:11 +0100
Subject: [PATCH 3/8] Updates, see
 https://github.com/llvm/llvm-project/pull/82274

---
 openmp/runtime/src/kmp.h                      | 11 +++++++-
 openmp/runtime/src/kmp_global.cpp             | 21 ++++++++-------
 openmp/runtime/src/kmp_settings.cpp           |  8 ++++++
 openmp/runtime/src/kmp_tasking.cpp            | 27 ++++++++++++-------
 .../runtime/test/tasking/omp_throttling_max.c | 10 +++----
 .../omp_throttling_max_ready_per_thread.c     | 11 ++++----
 6 files changed, 58 insertions(+), 30 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 5409004a7e9d53..807965b52faaa8 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -117,6 +117,12 @@ class kmp_stats_list;
 #include <xmmintrin.h>
 #endif
 
+// Enable a global task counter to enable throttling if more than
+// 'KMP_TASK_MAXIMUM' (env var.) are in flight
+#ifndef KMP_COMPILE_GLOBAL_TASK_THROTTLING
+# define KMP_COMPILE_GLOBAL_TASK_THROTTLING 0
+#endif
+
 // The below has to be defined before including "kmp_barrier.h".
 #define KMP_INTERNAL_MALLOC(sz) malloc(sz)
 #define KMP_INTERNAL_FREE(p) free(p)
@@ -2422,9 +2428,12 @@ typedef enum kmp_tasking_mode {
 extern kmp_tasking_mode_t
     __kmp_tasking_mode; /* determines how/when to execute tasks */
 extern int __kmp_task_stealing_constraint;
+
+#if KMP_COMPILE_GLOBAL_TASK_THROTTLING
 extern std::atomic<kmp_int32> __kmp_n_tasks_in_flight;
-extern int __kmp_enable_task_throttling;
 extern kmp_int32 __kmp_task_maximum;
+#endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
+extern int __kmp_enable_task_throttling;
 extern kmp_int32 __kmp_task_maximum_ready_per_thread;
 
 extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 38bef05ffb433f..6c2659d9b8c4a1 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -354,18 +354,21 @@ KMP_BUILD_ASSERT(sizeof(kmp_tasking_flags_t) == 4);
 
 int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */
 
-std::atomic<kmp_int32> __kmp_n_tasks_in_flight = 0; /* n° of tasks in flight */
+/* Serialize tasks once a threshold is reached, such as the number of ready
+ * tasks or the total number of tasks in flight */
+kmp_int32 __kmp_enable_task_throttling = 1;
 
-kmp_int32 __kmp_enable_task_throttling = 1; /* Serialize tasks once a threshold
-                                            is reached, such as the number of
-                                            ready tasks or the total number of
-                                            tasks */
+/* number of ready tasks in a thread queue before it starts serializing */
+kmp_int32 __kmp_task_maximum_ready_per_thread = INITIAL_TASK_DEQUE_SIZE;
 
-kmp_int32 __kmp_task_maximum = 65536; /* number of tasks threshold before
-                                         serializing */
+#if KMP_COMPILE_GLOBAL_TASK_THROTTLING
+/* n of tasks in flight */
+std::atomic<kmp_int32> __kmp_n_tasks_in_flight = 0;
+
+/* maximum number of tasks in flight before serializing */
+kmp_int32 __kmp_task_maximum = 65536;
+#endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
 
-kmp_int32 __kmp_task_maximum_ready_per_thread = 256; /* number of ready tasks
-                                                        before serializing */
 #ifdef DEBUG_SUSPEND
 int __kmp_suspend_count = 0;
 #endif
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index 8491da4a3371f2..c475c48944e592 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -5364,12 +5364,20 @@ static void __kmp_stg_print_task_throttling(kmp_str_buf_t *buffer,
 // KMP_TASK_MAXIMUM
 static void __kmp_stg_parse_task_maximum(char const *name, char const *value,
                                          void *data) {
+#if KMP_USE_GLOBAL_TASK_THROTTLING
   __kmp_stg_parse_int(name, value, 1, INT_MAX, &__kmp_task_maximum);
+#else /* KMP_USE_GLOBAL_TASK_THROTTLING */
+  // TODO : warn user, runtime not compiled with this variable support
+#endif /* KMP_USE_GLOBAL_TASK_THROTTLING */
 } // __kmp_stg_parse_task_maximum
 
 static void __kmp_stg_print_task_maximum(kmp_str_buf_t *buffer,
                                          char const *name, void *data) {
+#if KMP_USE_GLOBAL_TASK_THROTTLING
   __kmp_stg_print_int(buffer, name, __kmp_task_maximum);
+#else /* KMP_USE_GLOBAL_TASK_THROTTLING */
+  __kmp_stg_print_int(buffer, name, -1);
+#endif /* KMP_USE_GLOBAL_TASK_THROTTLING */
 } // __kmp_stg_print_task_maximum
 
 // -----------------------------------------------------------------------------
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 571b6eef69926d..5901d1e89a2a66 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -438,9 +438,10 @@ static kmp_int32 __kmp_push_priority_task(kmp_int32 gtid, kmp_info_t *thread,
 
   __kmp_acquire_bootstrap_lock(&thread_data->td.td_deque_lock);
   // Check if deque is full
-  if (__kmp_enable_task_throttling && TCR_4(thread_data->td.td_deque_ntasks) >=
-                                          __kmp_task_maximum_ready_per_thread) {
-    if (__kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
+  if (TCR_4(thread_data->td.td_deque_ntasks) >=
+      TASK_DEQUE_SIZE(thread_data->td)) {
+    if (__kmp_enable_task_throttling &&
+        __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                               thread->th.th_current_task)) {
       __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
       KA_TRACE(20, ("__kmp_push_priority_task: T#%d deque is full; returning "
@@ -541,15 +542,15 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
   }
 
   int locked = 0;
-  // Check if deque is full
+  // Check if deque is full and needs to be expanded
   int requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >=
                         TASK_DEQUE_SIZE(thread_data->td);
+  // Check if deque has too many tasks and needs throttling
   int requires_throttling =
       __kmp_enable_task_throttling && TCR_4(thread_data->td.td_deque_ntasks) >=
                                           __kmp_task_maximum_ready_per_thread;
-  int thread_can_execute;
   if (requires_resize || requires_throttling) {
-    thread_can_execute =
+    int thread_can_execute =
         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                               thread->th.th_current_task);
     if (requires_throttling && thread_can_execute) {
@@ -577,7 +578,7 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
                           TCR_4(thread_data->td.td_deque_ntasks) >=
                               __kmp_task_maximum_ready_per_thread;
     if (requires_resize || requires_throttling) {
-      thread_can_execute =
+      int thread_can_execute =
           __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
                                 thread->th.th_current_task);
       if (requires_throttling && thread_can_execute) {
@@ -592,6 +593,7 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
       }
     }
   }
+
   // Must have room since no thread can add tasks but calling thread
   KMP_DEBUG_ASSERT(TCR_4(thread_data->td.td_deque_ntasks) <
                    TASK_DEQUE_SIZE(thread_data->td));
@@ -924,7 +926,10 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
 #else /* ! USE_FAST_MEMORY */
   __kmp_thread_free(thread, taskdata);
 #endif
-  --__kmp_n_tasks_in_flight;
+#if KMP_COMPILE_GLOBAL_TASK_THROTTLING
+  if (__kmp_enable_task_throttling)
+      --__kmp_n_tasks_in_flight;
+#endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
 #if OMPX_TASKGRAPH
   } else {
     taskdata->td_flags.complete = 0;
@@ -1475,10 +1480,15 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
     __kmp_middle_initialize();
 
+#if KMP_COMPILE_GLOBAL_TASK_THROTTLING
   // task throttling: too many tasks co-existing, emptying queue now
   if (__kmp_enable_task_throttling)
+  {
     while (TCR_4(__kmp_n_tasks_in_flight.load()) >= __kmp_task_maximum)
       __kmpc_omp_taskyield(NULL, gtid, 0);
+    ++__kmp_n_tasks_in_flight;
+  }
+#endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
 
   if (flags->hidden_helper) {
     if (__kmp_enable_hidden_helper) {
@@ -1574,7 +1584,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                                sizeof_shareds);
 #endif /* USE_FAST_MEMORY */
-  ++__kmp_n_tasks_in_flight;
 
   task = KMP_TASKDATA_TO_TASK(taskdata);
 
diff --git a/openmp/runtime/test/tasking/omp_throttling_max.c b/openmp/runtime/test/tasking/omp_throttling_max.c
index a050f3c2a4302e..4a9171b0fa3ace 100644
--- a/openmp/runtime/test/tasking/omp_throttling_max.c
+++ b/openmp/runtime/test/tasking/omp_throttling_max.c
@@ -1,9 +1,9 @@
 // clang-format off
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=0      %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=1      %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=256    %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=65536  %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=100000 %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=0      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=1      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=256    %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=65536  %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=100000 %libomp-run
 // clang-format on
 
 /**
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
index b38936c9c51047..3860b7cda00fbb 100644
--- a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
+++ b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
@@ -1,9 +1,9 @@
 // clang-format off
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=0      %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=1      %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=256    %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=65536  %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=100000 %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=0      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=1      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=256    %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=65536  %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=100000 %libomp-run
 // clang-format on
 
 /**
@@ -22,7 +22,6 @@
 #include <assert.h>
 #include <omp.h>
 #include <stdlib.h>
-#include <stdio.h>
 
 #define MAX_TASKS_READY_DEFAULT (1 << 8)
 

>From 21f4e45a7ea3c833229a3f360a8221e594e3760a Mon Sep 17 00:00:00 2001
From: Romain Pereira <romain.pereira at inria.fr>
Date: Fri, 23 Feb 2024 18:12:24 +0100
Subject: [PATCH 4/8] git clang-format

---
 openmp/runtime/src/kmp.h           | 2 +-
 openmp/runtime/src/kmp_tasking.cpp | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 807965b52faaa8..8bbb0546f202aa 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -120,7 +120,7 @@ class kmp_stats_list;
 // Enable a global task counter to enable throttling if more than
 // 'KMP_TASK_MAXIMUM' (env var.) are in flight
 #ifndef KMP_COMPILE_GLOBAL_TASK_THROTTLING
-# define KMP_COMPILE_GLOBAL_TASK_THROTTLING 0
+#define KMP_COMPILE_GLOBAL_TASK_THROTTLING 0
 #endif
 
 // The below has to be defined before including "kmp_barrier.h".
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 5901d1e89a2a66..a87d98ca26b385 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -928,7 +928,7 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
 #endif
 #if KMP_COMPILE_GLOBAL_TASK_THROTTLING
   if (__kmp_enable_task_throttling)
-      --__kmp_n_tasks_in_flight;
+    --__kmp_n_tasks_in_flight;
 #endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
 #if OMPX_TASKGRAPH
   } else {
@@ -1482,8 +1482,7 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
 
 #if KMP_COMPILE_GLOBAL_TASK_THROTTLING
   // task throttling: too many tasks co-existing, emptying queue now
-  if (__kmp_enable_task_throttling)
-  {
+  if (__kmp_enable_task_throttling) {
     while (TCR_4(__kmp_n_tasks_in_flight.load()) >= __kmp_task_maximum)
       __kmpc_omp_taskyield(NULL, gtid, 0);
     ++__kmp_n_tasks_in_flight;

>From 09da1dd276db3051e835195bb978e58fd146e14f Mon Sep 17 00:00:00 2001
From: Romain Pereira <romain.pereira at inria.fr>
Date: Mon, 4 Mar 2024 16:53:25 +0100
Subject: [PATCH 5/8] [WIP] Throttling, 'max_child' isn't implemented yet

---
 openmp/runtime/CMakeLists.txt                 |   4 +
 openmp/runtime/src/CMakeLists.txt             |   1 +
 openmp/runtime/src/kmp.h                      |  21 ++--
 openmp/runtime/src/kmp_global.cpp             |  29 +++--
 openmp/runtime/src/kmp_settings.cpp           | 103 ++++++++++++++----
 openmp/runtime/src/kmp_tasking.cpp            |  49 ++++++---
 openmp/runtime/test/CMakeLists.txt            |   1 +
 openmp/runtime/test/lit.site.cfg.in           |   1 +
 openmp/runtime/test/tasking/omp_throttling.h  |  66 +++++++++++
 .../runtime/test/tasking/omp_throttling_max.c |  64 -----------
 .../tasking/omp_throttling_max_children.c     |  12 ++
 .../test/tasking/omp_throttling_max_global.c  |  13 +++
 .../omp_throttling_max_ready_per_thread.c     |  67 ++----------
 13 files changed, 249 insertions(+), 182 deletions(-)
 create mode 100644 openmp/runtime/test/tasking/omp_throttling.h
 delete mode 100644 openmp/runtime/test/tasking/omp_throttling_max.c
 create mode 100644 openmp/runtime/test/tasking/omp_throttling_max_children.c
 create mode 100644 openmp/runtime/test/tasking/omp_throttling_max_global.c

diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 041b60efac5cee..d4abfd54062517 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -386,6 +386,9 @@ endif()
 set(LIBOMP_USE_HIER_SCHED FALSE CACHE BOOL
   "Hierarchical scheduling support?")
 
+# Enable compilation of global task throttling
+set(LIBOMP_KMP_TASK_THROTTLING_GLOBAL FALSE CACHE BOOL "A global task counter to bound tasking memory usage")
+
 # Setting final library name
 set(LIBOMP_DEFAULT_LIB_NAME libomp)
 if(${PROFILE_LIBRARY})
@@ -447,6 +450,7 @@ if(${OPENMP_STANDALONE_BUILD})
   libomp_say("Use quad precision   -- ${LIBOMP_USE_QUAD_PRECISION}")
   libomp_say("Use Hwloc library    -- ${LIBOMP_USE_HWLOC}")
   libomp_say("Use OMPX-taskgraph   -- ${LIBOMP_OMPX_TASKGRAPH}")
+  libomp_say("Use Task Throttling Global -- ${LIBOMP_KMP_TASK_THROTTLING_GLOBAL}")
 endif()
 
 add_subdirectory(src)
diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index ff129feb5b6fa0..e0954a271ab68b 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -249,6 +249,7 @@ if(WIN32)
   libomp_append(LIBOMP_MASM_DEFINITIONS "-D_M_AMD64" IF_TRUE INTEL64)
   libomp_append(LIBOMP_MASM_DEFINITIONS "-DOMPT_SUPPORT" IF_TRUE_1_0 LIBOMP_OMPT_SUPPORT)
   libomp_append(LIBOMP_MASM_DEFINITIONS "-DOMPD_SUPPORT" IF_TRUE_1_0 LIBOMP_OMPD_SUPPORT)
+  libomp_append(LIBOMP_MASM_DEFINITIONS "-DKMP_TASK_THROTTLING_GLOBAL" IF_TRUE_1_0 LIBOMP_KMP_TASK_THROTTLING_GLOBAL)
   libomp_list_to_string("${LIBOMP_MASM_DEFINITIONS}" LIBOMP_MASM_DEFINITIONS)
   set_property(SOURCE z_Windows_NT-586_asm.asm APPEND_STRING PROPERTY COMPILE_FLAGS " ${LIBOMP_MASM_DEFINITIONS}")
   set_source_files_properties(thirdparty/ittnotify/ittnotify_static.cpp PROPERTIES COMPILE_DEFINITIONS "UNICODE")
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 8bbb0546f202aa..f075987aead5d0 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -117,10 +117,8 @@ class kmp_stats_list;
 #include <xmmintrin.h>
 #endif
 
-// Enable a global task counter to enable throttling if more than
-// 'KMP_TASK_MAXIMUM' (env var.) are in flight
-#ifndef KMP_COMPILE_GLOBAL_TASK_THROTTLING
-#define KMP_COMPILE_GLOBAL_TASK_THROTTLING 0
+#ifdef KMP_TASK_THROTTLING_GLOBAL
+# error "KMP_TASK_THROTTLING_GLOBAL SET !!"
 #endif
 
 // The below has to be defined before including "kmp_barrier.h".
@@ -2429,12 +2427,19 @@ extern kmp_tasking_mode_t
     __kmp_tasking_mode; /* determines how/when to execute tasks */
 extern int __kmp_task_stealing_constraint;
 
-#if KMP_COMPILE_GLOBAL_TASK_THROTTLING
+extern kmp_int32 __kmp_enable_task_throttling;
+#if KMP_TASK_THROTTLING_GLOBAL
+extern kmp_int32 __kmp_enable_task_throttling_global;
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
+extern kmp_int32 __kmp_enable_task_throttling_ready_per_thread;
+extern kmp_int32 __kmp_enable_task_throttling_children;
+
+#if KMP_TASK_THROTTLING_GLOBAL
 extern std::atomic<kmp_int32> __kmp_n_tasks_in_flight;
-extern kmp_int32 __kmp_task_maximum;
-#endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
-extern int __kmp_enable_task_throttling;
+extern kmp_int32 __kmp_task_maximum_global;
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
 extern kmp_int32 __kmp_task_maximum_ready_per_thread;
+extern kmp_int32 __kmp_task_maximum_children;
 
 extern kmp_int32 __kmp_default_device; // Set via OMP_DEFAULT_DEVICE if
 // specified, defaults to 0 otherwise
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 6c2659d9b8c4a1..da5925bee46036 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -354,20 +354,27 @@ KMP_BUILD_ASSERT(sizeof(kmp_tasking_flags_t) == 4);
 
 int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */
 
-/* Serialize tasks once a threshold is reached, such as the number of ready
- * tasks or the total number of tasks in flight */
-kmp_int32 __kmp_enable_task_throttling = 1;
+/**
+ * Throttling parameters: the executing thread schedules tasks as long as
+ * any of the following thresholds is reached:
+ *  - number of tasks allocated in the runtime ('global')
+ *  - number of ready tasks in the current thread queue ('per-thread')
+ *  - number of child tasks of a parent task ('children')
+ */
+kmp_int32 __kmp_enable_task_throttling                  = 1;
+#if KMP_TASK_THROTTLING_GLOBAL
+kmp_int32 __kmp_enable_task_throttling_global           = 0;
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
+kmp_int32 __kmp_enable_task_throttling_ready_per_thread = 1;
+kmp_int32 __kmp_enable_task_throttling_children         = 0;
 
-/* number of ready tasks in a thread queue before it starts serializing */
-kmp_int32 __kmp_task_maximum_ready_per_thread = INITIAL_TASK_DEQUE_SIZE;
 
-#if KMP_COMPILE_GLOBAL_TASK_THROTTLING
-/* n of tasks in flight */
+#if KMP_TASK_THROTTLING_GLOBAL
 std::atomic<kmp_int32> __kmp_n_tasks_in_flight = 0;
-
-/* maximum number of tasks in flight before serializing */
-kmp_int32 __kmp_task_maximum = 65536;
-#endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
+kmp_int32 __kmp_task_maximum_global = 65536;
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
+kmp_int32 __kmp_task_maximum_ready_per_thread = INITIAL_TASK_DEQUE_SIZE;
+kmp_int32 __kmp_task_maximum_children = 16384;
 
 #ifdef DEBUG_SUSPEND
 int __kmp_suspend_count = 0;
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index c475c48944e592..d533cc47d82ec3 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -5361,24 +5361,59 @@ static void __kmp_stg_print_task_throttling(kmp_str_buf_t *buffer,
 } // __kmp_stg_print_task_throttling
 
 // -----------------------------------------------------------------------------
-// KMP_TASK_MAXIMUM
-static void __kmp_stg_parse_task_maximum(char const *name, char const *value,
+// KMP_ENABLE_TASK_THROTTLING_GLOBAL
+
+# if KMP_TASK_THROTTLING_GLOBAL
+static void __kmp_stg_parse_task_throttling_global(char const *name, char const *value,
+                                            void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling_global);
+} // __kmp_stg_parse_task_throttling_global
+
+static void __kmp_stg_print_task_throttling_global(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling_global);
+} // __kmp_stg_print_task_throttling_global
+# endif /* KMP_TASK_THROTTLING_GLOBAL */
+
+// -----------------------------------------------------------------------------
+// KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD
+
+static void __kmp_stg_parse_task_throttling_ready_per_thread(char const *name, char const *value,
+                                            void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling_ready_per_thread);
+} // __kmp_stg_parse_task_throttling_ready_per_thread
+
+static void __kmp_stg_print_task_throttling_ready_per_thread(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling_ready_per_thread);
+} // __kmp_stg_print_task_throttling_ready_per_thread
+
+// -----------------------------------------------------------------------------
+// KMP_ENABLE_TASK_THROTTLING_CHILDREN
+
+static void __kmp_stg_parse_task_throttling_children(char const *name, char const *value,
+                                            void *data) {
+  __kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling_children);
+} // __kmp_stg_parse_task_throttling_children
+
+static void __kmp_stg_print_task_throttling_children(kmp_str_buf_t *buffer,
+                                            char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling_children);
+} // __kmp_stg_print_task_throttling_children
+
+// -----------------------------------------------------------------------------
+// KMP_TASK_MAXIMUM_GLOBAL
+# if KMP_TASK_THROTTLING_GLOBAL
+static void __kmp_stg_parse_task_maximum_global(char const *name, char const *value,
                                          void *data) {
-#if KMP_USE_GLOBAL_TASK_THROTTLING
-  __kmp_stg_parse_int(name, value, 1, INT_MAX, &__kmp_task_maximum);
-#else /* KMP_USE_GLOBAL_TASK_THROTTLING */
-  // TODO : warn user, runtime not compiled with this variable support
-#endif /* KMP_USE_GLOBAL_TASK_THROTTLING */
-} // __kmp_stg_parse_task_maximum
-
-static void __kmp_stg_print_task_maximum(kmp_str_buf_t *buffer,
+  __kmp_stg_parse_int(name, value, 1, INT_MAX, &__kmp_task_maximum_global);
+} // __kmp_stg_parse_task_maximum_global
+
+static void __kmp_stg_print_task_maximum_global(kmp_str_buf_t *buffer,
                                          char const *name, void *data) {
-#if KMP_USE_GLOBAL_TASK_THROTTLING
-  __kmp_stg_print_int(buffer, name, __kmp_task_maximum);
-#else /* KMP_USE_GLOBAL_TASK_THROTTLING */
-  __kmp_stg_print_int(buffer, name, -1);
-#endif /* KMP_USE_GLOBAL_TASK_THROTTLING */
-} // __kmp_stg_print_task_maximum
+  __kmp_stg_print_int(buffer, name, __kmp_task_maximum_global);
+} // __kmp_stg_print_task_maximum_global
+# endif /* KMP_TASK_THROTTLING_GLOBAL */
 
 // -----------------------------------------------------------------------------
 // KMP_TASK_MAXIMUM_READY_PER_THREAD
@@ -5395,6 +5430,19 @@ static void __kmp_stg_print_task_maximum_ready_per_thread(kmp_str_buf_t *buffer,
   __kmp_stg_print_int(buffer, name, __kmp_task_maximum_ready_per_thread);
 } // __kmp_stg_print_task_maximum_ready_per_thread
 
+// -----------------------------------------------------------------------------
+// KMP_TASK_MAXIMUM_CHILDREN
+static void __kmp_stg_parse_task_maximum_children(char const *name, char const *value,
+                                         void *data) {
+  __kmp_stg_parse_int(name, value, 1, INT_MAX, &__kmp_task_maximum_children);
+} // __kmp_stg_parse_task_maximum_children
+
+static void __kmp_stg_print_task_maximum_children(kmp_str_buf_t *buffer,
+                                         char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_task_maximum_children);
+} // __kmp_stg_print_task_maximum_children
+
+
 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
 // -----------------------------------------------------------------------------
 // KMP_USER_LEVEL_MWAIT
@@ -5782,15 +5830,26 @@ static kmp_setting_t __kmp_stg_table[] = {
     {"KMP_FORKJOIN_FRAMES_MODE", __kmp_stg_parse_forkjoin_frames_mode,
      __kmp_stg_print_forkjoin_frames_mode, NULL, 0, 0},
 #endif
+
     {"KMP_ENABLE_TASK_THROTTLING", __kmp_stg_parse_task_throttling,
      __kmp_stg_print_task_throttling, NULL, 0, 0},
-
-    {"KMP_TASK_MAXIMUM", __kmp_stg_parse_task_maximum,
-     __kmp_stg_print_task_maximum, NULL, 0, 0},
-
-    {"KMP_TASK_MAXIMUM_READY_PER_THREAD",
-     __kmp_stg_parse_task_maximum_ready_per_thread,
+    {"KMP_ENABLE_TASK_THROTTLING_CHILDREN", __kmp_stg_parse_task_throttling_children,
+     __kmp_stg_print_task_throttling_children, NULL, 0, 0},
+    {"KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD", __kmp_stg_parse_task_throttling_ready_per_thread,
+     __kmp_stg_print_task_throttling_ready_per_thread, NULL, 0, 0},
+# if KMP_TASK_THROTTLING_GLOBAL
+    {"KMP_ENABLE_TASK_THROTTLING_GLOBAL", __kmp_stg_parse_task_throttling_global,
+     __kmp_stg_print_task_throttling_global, NULL, 0, 0},
+# endif /* KMP_TASK_THROTTLING_GLOBAL */
+
+    {"KMP_TASK_MAXIMUM_CHILDREN", __kmp_stg_parse_task_maximum_children,
+     __kmp_stg_print_task_maximum_children, NULL, 0, 0},
+    {"KMP_TASK_MAXIMUM_READY_PER_THREAD", __kmp_stg_parse_task_maximum_ready_per_thread,
      __kmp_stg_print_task_maximum_ready_per_thread, NULL, 0, 0},
+# if KMP_TASK_THROTTLING_GLOBAL
+    {"KMP_TASK_MAXIMUM_GLOBAL", __kmp_stg_parse_task_maximum_global,
+     __kmp_stg_print_task_maximum_global, NULL, 0, 0},
+# endif /* KMP_TASK_THROTTLING_GLOBAL */
 
     {"OMP_DISPLAY_ENV", __kmp_stg_parse_omp_display_env,
      __kmp_stg_print_omp_display_env, NULL, 0, 0},
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index a87d98ca26b385..0afa62d046b478 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -543,12 +543,9 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
 
   int locked = 0;
   // Check if deque is full and needs to be expanded
-  int requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >=
-                        TASK_DEQUE_SIZE(thread_data->td);
+  int requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td);
   // Check if dequeue has too many tasks and needs throttling
-  int requires_throttling =
-      __kmp_enable_task_throttling && TCR_4(thread_data->td.td_deque_ntasks) >=
-                                          __kmp_task_maximum_ready_per_thread;
+  int requires_throttling = __kmp_enable_task_throttling && __kmp_enable_task_throttling_ready_per_thread && TCR_4(thread_data->td.td_deque_ntasks) >= __kmp_task_maximum_ready_per_thread;
   if (requires_resize || requires_throttling) {
     int thread_can_execute =
         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
@@ -574,7 +571,7 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
     // Need to recheck as we can get a proxy task from thread outside of OpenMP
     requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >=
                       TASK_DEQUE_SIZE(thread_data->td);
-    requires_throttling = __kmp_enable_task_throttling &&
+    requires_throttling = __kmp_enable_task_throttling && __kmp_enable_task_throttling_ready_per_thread &&
                           TCR_4(thread_data->td.td_deque_ntasks) >=
                               __kmp_task_maximum_ready_per_thread;
     if (requires_resize || requires_throttling) {
@@ -926,10 +923,10 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
 #else /* ! USE_FAST_MEMORY */
   __kmp_thread_free(thread, taskdata);
 #endif
-#if KMP_COMPILE_GLOBAL_TASK_THROTTLING
-  if (__kmp_enable_task_throttling)
+#if KMP_TASK_THROTTLING_GLOBAL
+  if (__kmp_enable_task_throttling && __kmp_enable_task_throttling_global)
     --__kmp_n_tasks_in_flight;
-#endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
 #if OMPX_TASKGRAPH
   } else {
     taskdata->td_flags.complete = 0;
@@ -1480,14 +1477,30 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
     __kmp_middle_initialize();
 
-#if KMP_COMPILE_GLOBAL_TASK_THROTTLING
-  // task throttling: too many tasks co-existing, emptying queue now
   if (__kmp_enable_task_throttling) {
-    while (TCR_4(__kmp_n_tasks_in_flight.load()) >= __kmp_task_maximum)
-      __kmpc_omp_taskyield(NULL, gtid, 0);
-    ++__kmp_n_tasks_in_flight;
-  }
-#endif /* KMP_COMPILE_GLOBAL_TASK_THROTTLING */
+#if KMP_TASK_THROTTLING_GLOBAL
+  // task throttling: too many tasks existing, empty queues now
+      if (__kmp_enable_task_throttling_global) {
+         while (TCR_4(__kmp_n_tasks_in_flight.load()) >= __kmp_task_maximum_global)
+               __kmpc_omp_taskyield(NULL, gtid, 0);
+         ++__kmp_n_tasks_in_flight;
+      }
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
+
+      if (__kmp_enable_task_throttling_children) {
+         int thread_finished = FALSE;
+#if USE_ITT_BUILD
+         void *itt_sync_obj = NULL;
+#endif /* USE_ITT_BUILD */
+         kmp_flag_32<false, false> flag( RCAST(std::atomic<kmp_uint32> *,
+                     &(taskdata->td_incomplete_child_tasks)), 0U);
+         while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) >= __kmp_task_maximum_children) {
+             flag.execute_tasks(thread, gtid, FALSE,
+                     &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
+                     __kmp_task_stealing_constraint);
+         }
+      }
+  } /* __kmp_enable_task_throttling */
 
   if (flags->hidden_helper) {
     if (__kmp_enable_hidden_helper) {
@@ -1678,8 +1691,8 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
     if (parent_task->td_taskgroup)
       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
     // Only need to keep track of allocated child tasks for explicit tasks since
-    // implicit not deallocated
-    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
+    // implicit not deallocated; or if children throttling is enabled
+    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT || (__kmp_enable_task_throttling && __kmp_enable_task_throttling_children)) {
       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
     }
     if (flags->hidden_helper) {
diff --git a/openmp/runtime/test/CMakeLists.txt b/openmp/runtime/test/CMakeLists.txt
index a7790804542b7e..88b5ad82abbf31 100644
--- a/openmp/runtime/test/CMakeLists.txt
+++ b/openmp/runtime/test/CMakeLists.txt
@@ -31,6 +31,7 @@ pythonize_bool(LIBOMP_USE_HWLOC)
 pythonize_bool(LIBOMP_OMPT_SUPPORT)
 pythonize_bool(LIBOMP_OMPT_OPTIONAL)
 pythonize_bool(LIBOMP_OMPX_TASKGRAPH)
+pythonize_bool(LIBOMP_KMP_TASK_THROTTLING_GLOBAL)
 pythonize_bool(LIBOMP_HAVE_LIBM)
 pythonize_bool(LIBOMP_HAVE_LIBATOMIC)
 pythonize_bool(OPENMP_STANDALONE_BUILD)
diff --git a/openmp/runtime/test/lit.site.cfg.in b/openmp/runtime/test/lit.site.cfg.in
index d6c259280619be..361ada0e850213 100644
--- a/openmp/runtime/test/lit.site.cfg.in
+++ b/openmp/runtime/test/lit.site.cfg.in
@@ -16,6 +16,7 @@ config.hwloc_library_dir = "@LIBOMP_HWLOC_LIBRARY_DIR@"
 config.using_hwloc = @LIBOMP_USE_HWLOC@
 config.has_ompt = @LIBOMP_OMPT_SUPPORT@ and @LIBOMP_OMPT_OPTIONAL@
 config.has_ompx_taskgraph = @LIBOMP_OMPX_TASKGRAPH@
+config.has_kmp_task_throttling_global = @LIBOMP_KMP_TASK_THROTTLING_GLOBAL@
 config.has_libm = @LIBOMP_HAVE_LIBM@
 config.has_libatomic = @LIBOMP_HAVE_LIBATOMIC@
 config.is_standalone_build = @OPENMP_STANDALONE_BUILD@
diff --git a/openmp/runtime/test/tasking/omp_throttling.h b/openmp/runtime/test/tasking/omp_throttling.h
new file mode 100644
index 00000000000000..4b91a26e2f5330
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_throttling.h
@@ -0,0 +1,66 @@
+/**
+ *  This test ensures that task throttling works properly
+ *
+ *  It creates 2 threads (1 producer, 1 consumer).
+ *  The producer keeps creating tasks 'T_i' until one of them is executed.
+ *  The consumer is blocked until the producer starts throttling.
+ *  Executing any 'T_i' unblocks the consumer and stops the producer.
+ *
+ *  The assertions ensure that the producer does not create more tasks than
+ *  the limit configured by the programmer (read from MAX_ENV_VAR).
+ */
+
+#include <assert.h>
+#include <omp.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+int main(void) {
+
+  /* check that throttling is enabled (enabled by default) */
+  char *throttling_str = getenv("KMP_ENABLE_TASK_THROTTLING");
+  assert(throttling_str);
+  assert(*throttling_str == '1');
+
+  /* maximum number of tasks in-flight */
+  char *max_tasks_str = getenv(MAX_ENV_VAR);
+  assert(max_tasks_str);
+  int max_tasks = atoi(max_tasks_str);
+  if (max_tasks <= 0)
+    max_tasks = 1;
+
+  /* check that the throttling mode under test is explicitly enabled */
+  throttling_str = getenv(ENABLE_ENV_VAR);
+  assert(throttling_str);
+  assert(*throttling_str == '1');
+
+  volatile int done = 0;
+  int ntasks = 0;
+
+/* testing the threshold named by MAX_ENV_VAR */
+#pragma omp parallel num_threads(2) default(none) shared(max_tasks, done, ntasks)
+  {
+    if (omp_get_thread_num() == 1)
+      while (!done)
+        ;
+
+#pragma omp single
+    {
+      while (!done) {
+# if USE_DEPS
+            # pragma omp task default(none) shared(done) depend(out : max_tasks)
+# else
+            # pragma omp task default(none) shared(done)
+# endif
+            {
+                done = 1;
+            }
+
+        assert(++ntasks <= max_tasks + 1);
+      }
+    }
+  }
+  assert(ntasks == max_tasks + 1);
+
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/omp_throttling_max.c b/openmp/runtime/test/tasking/omp_throttling_max.c
deleted file mode 100644
index 4a9171b0fa3ace..00000000000000
--- a/openmp/runtime/test/tasking/omp_throttling_max.c
+++ /dev/null
@@ -1,64 +0,0 @@
-// clang-format off
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=0      %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=1      %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=256    %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=65536  %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM_READY_PER_THREAD=999999 KMP_TASK_MAXIMUM=100000 %libomp-run
-// clang-format on
-
-/**
- *  This test ensures that task throttling on the maximum number of tasks
- *  threshold works properly.
- *
- *  It creates 2 threads (1 producer, 1 consummer)
- *  The producer infinitely create tasks 'T_i' until one executed
- *  The consumer is blocked until the producer starts throttling
- *  Executing any 'T_i' unblocks the consumer and stop the producer
- *
- *  The assertion tests ensures that the producer does not create more than the
- *  total number of tasks provided by the programmer
- */
-
-#include <assert.h>
-#include <omp.h>
-#include <stdlib.h>
-
-/* default value */
-#define MAX_TASKS_DEFAULT (65536)
-
-int main(void) {
-  /* maximum number of tasks in-flight */
-  char *max_tasks_str = getenv("KMP_TASK_MAXIMUM");
-  int max_tasks = max_tasks_str ? atoi(max_tasks_str) : MAX_TASKS_DEFAULT;
-  if (max_tasks <= 0)
-    max_tasks = 1;
-
-  /* check if throttling is enabled (it is by default) */
-  char *throttling_str = getenv("KMP_ENABLE_TASK_THROTTLING");
-  int throttling = throttling_str ? *throttling_str == '1' : 1;
-  assert(throttling);
-
-  volatile int done = 0;
-
-/* testing KMP_TASK_MAXIMUM */
-#pragma omp parallel num_threads(2) default(none)                              \
-    shared(max_tasks, throttling, done)
-  {
-    if (omp_get_thread_num() == 1)
-      while (!done)
-        ;
-
-#pragma omp single
-    {
-      int ntasks = 0;
-      while (!done) {
-#pragma omp task default(none) shared(done) depend(out : max_tasks, throttling)
-        done = 1;
-
-        assert(++ntasks <= max_tasks + 1);
-      }
-    }
-  }
-
-  return 0;
-}
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_children.c b/openmp/runtime/test/tasking/omp_throttling_max_children.c
new file mode 100644
index 00000000000000..fc7131e691d11d
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_throttling_max_children.c
@@ -0,0 +1,12 @@
+// clang-format off
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=1 KMP_TASK_MAXIMUM_CHILDREN=1      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=1 KMP_TASK_MAXIMUM_CHILDREN=256    %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=1 KMP_TASK_MAXIMUM_CHILDREN=65536  %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=1 KMP_TASK_MAXIMUM_CHILDREN=100000 %libomp-run
+// clang-format on
+
+# define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_CHILDREN")
+# define MAX_ENV_VAR    ("KMP_TASK_MAXIMUM_CHILDREN")
+# define USE_DEPS       (1)
+
+# include "omp_throttling.h"
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_global.c b/openmp/runtime/test/tasking/omp_throttling_max_global.c
new file mode 100644
index 00000000000000..a553906a35417e
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_throttling_max_global.c
@@ -0,0 +1,13 @@
+// clang-format off
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=1 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_GLOBAL=1      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=1 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_GLOBAL=256    %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=1 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_GLOBAL=65536  %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=1 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_GLOBAL=100000 %libomp-run
+// REQUIRES: kmp_task_throttling_global
+// clang-format on
+
+# define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_GLOBAL")
+# define MAX_ENV_VAR    ("KMP_TASK_MAXIMUM_GLOBAL")
+# define USE_DEPS       (1)
+
+# include "omp_throttling.h"
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
index 3860b7cda00fbb..0d0bd16672a583 100644
--- a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
+++ b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
@@ -1,63 +1,12 @@
 // clang-format off
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=0      %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=1      %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=256    %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=65536  %libomp-run
-// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_TASK_MAXIMUM=999999 KMP_TASK_MAXIMUM_READY_PER_THREAD=100000 %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=1 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_READY_PER_THREAD=1      %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=1 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_READY_PER_THREAD=256    %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=1 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_READY_PER_THREAD=65536  %libomp-run
+// RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=1 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_READY_PER_THREAD=100000 %libomp-run
 // clang-format on
 
-/**
- *  This test ensures that task throttling on the maximum number of ready tasks
- *  per thread threshold works properly.
- *
- *  It creates 2 threads (1 producer, 1 consummer)
- *  The producer infinitely create tasks 'T_i' until one executed
- *  The consumer is blocked until the producer starts throttling
- *  Executing any 'T_i' unblocks the consumer and stop the producer
- *
- *  The assertion tests ensures that the producer does not create more than the
- *  total number of tasks provided by the programmer
- */
+# define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD")
+# define MAX_ENV_VAR    ("KMP_TASK_MAXIMUM_READY_PER_THREAD")
+# define USE_DEPS       (0)
 
-#include <assert.h>
-#include <omp.h>
-#include <stdlib.h>
-
-#define MAX_TASKS_READY_DEFAULT (1 << 8)
-
-int main(void) {
-  /* maximum number of ready tasks in-flight */
-  char *max_tasks_ready_str = getenv("KMP_TASK_MAXIMUM_READY_PER_THREAD");
-  int max_tasks_ready =
-      max_tasks_ready_str ? atoi(max_tasks_ready_str) : MAX_TASKS_READY_DEFAULT;
-  if (max_tasks_ready <= 0)
-    max_tasks_ready = 1;
-
-  /* check if throttling is enabled (it is by default) */
-  char *throttling_str = getenv("KMP_ENABLE_TASK_THROTTLING");
-  int throttling = throttling_str ? *throttling_str == '1' : 1;
-
-  volatile int done = 0;
-
-/* testing KMP_TASK_MAXIMUM_READY */
-#pragma omp parallel num_threads(2) default(none)                              \
-    shared(max_tasks_ready, throttling, done)
-  {
-    if (omp_get_thread_num() == 1)
-      while (!done)
-        ;
-
-#pragma omp single
-    {
-      int ntasks = 0;
-      while (!done) {
-#pragma omp task default(none) shared(done)
-        done = 1;
-
-        assert(++ntasks <= max_tasks_ready + 1);
-      }
-    }
-  }
-
-  return 0;
-}
+# include "omp_throttling.h"

>From a80ee348a2e7aa34bfb44ffdebda2f4a3dcf8b21 Mon Sep 17 00:00:00 2001
From: Romain PEREIRA <romain.pereira at inria.fr>
Date: Sun, 10 Mar 2024 00:16:57 +0100
Subject: [PATCH 6/8] [WIP] max_child first draft

---
 .../plugins-nextgen/amdgpu/src/rtl.cpp        |  4 +--
 openmp/runtime/src/kmp.h                      | 10 ++++++
 openmp/runtime/src/kmp_tasking.cpp            | 35 +++++++++++++------
 openmp/runtime/src/kmp_wait_release.h         | 22 ++++++++++++
 4 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 81634ae1edc490..2a1bfdc9fbad38 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1899,8 +1899,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Get the frequency of the steady clock. If the attribute is missing
     // assume running on an older libhsa and default to 0, omp_get_wtime
     // will be inaccurate but otherwise programs can still run.
-    if (auto Err = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
-                                    ClockFrequency))
+//    if (auto Err = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
+//                                    ClockFrequency))
       ClockFrequency = 0;
 
     // Load the grid values dependending on the wavefront.
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index f075987aead5d0..cbd4d63a3476b4 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -282,6 +282,7 @@ template <bool C = false, bool S = true> class kmp_flag_32;
 template <bool C = false, bool S = true> class kmp_flag_64;
 template <bool C = false, bool S = true> class kmp_atomic_flag_64;
 class kmp_flag_oncore;
+class kmp_flag_i32_lt;
 
 #ifdef __cplusplus
 extern "C" {
@@ -4673,6 +4674,15 @@ int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
 #endif /* USE_ITT_BUILD */
                                kmp_int32 is_constrained);
 
+int __kmp_execute_tasks_i32_lt(kmp_info_t *thread, kmp_int32 gtid,
+                               kmp_flag_i32_lt * flag, int final_spin,
+                               int *thread_finished,
+#if USE_ITT_BUILD
+                               void *itt_sync_obj,
+#endif /* USE_ITT_BUILD */
+                               kmp_int32 is_constrained);
+
+
 extern int __kmp_nesting_mode;
 extern int __kmp_nesting_mode_nlevels;
 extern int *__kmp_nesting_nth_level;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 0afa62d046b478..7b9c7d2569184a 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -1477,13 +1477,19 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   if (UNLIKELY(!TCR_4(__kmp_init_middle)))
     __kmp_middle_initialize();
 
+  // check if too many tasks are allocated already, in such case, schedule a
+  // few before allocating a new one
   if (__kmp_enable_task_throttling) {
 #if KMP_TASK_THROTTLING_GLOBAL
-  // task throttling: too many tasks existing, empty queues now
       if (__kmp_enable_task_throttling_global) {
-         while (TCR_4(__kmp_n_tasks_in_flight.load()) >= __kmp_task_maximum_global)
-               __kmpc_omp_taskyield(NULL, gtid, 0);
-         ++__kmp_n_tasks_in_flight;
+          // empty queues
+         kmp_flag_i32_lt flag(&__kmp_n_tasks_in_flight, __kmp_task_maximum_global);
+         while (KMP_ATOMIC_LD_ACQ(&__kmp_n_tasks_in_flight) >= __kmp_task_maximum_global) {
+             flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 0);
+         }
+
+          // we can now allocate a new task, increase counter
+          ++__kmp_n_tasks_in_flight;
       }
 #endif /* KMP_TASK_THROTTLING_GLOBAL */
 
@@ -1492,12 +1498,10 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
 #if USE_ITT_BUILD
          void *itt_sync_obj = NULL;
 #endif /* USE_ITT_BUILD */
-         kmp_flag_32<false, false> flag( RCAST(std::atomic<kmp_uint32> *,
-                     &(taskdata->td_incomplete_child_tasks)), 0U);
-         while (KMP_ATOMIC_LD_ACQ(&taskdata->td_incomplete_child_tasks) >= __kmp_task_maximum_children) {
-             flag.execute_tasks(thread, gtid, FALSE,
-                     &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj),
-                     __kmp_task_stealing_constraint);
+
+         kmp_flag_i32_lt flag(&parent_task->td_incomplete_child_tasks, __kmp_task_maximum_children);
+         while (KMP_ATOMIC_LD_ACQ(&parent_task->td_incomplete_child_tasks) >= __kmp_task_maximum_children) {
+             flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 0);
          }
       }
   } /* __kmp_enable_task_throttling */
@@ -3395,7 +3399,7 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
 // spinner == NULL means only execute a single task and return.
 // checker is the value to check to terminate the spin.
 template <class C>
-static inline int __kmp_execute_tasks_template(
+static int __kmp_execute_tasks_template(
     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
     kmp_int32 is_constrained) {
@@ -3658,6 +3662,18 @@ int __kmp_execute_tasks_oncore(
       thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
 }
 
+// Non-template entry point for kmp_flag_i32_lt: forwards every argument
+// unchanged to __kmp_execute_tasks_template and returns its result. It is
+// called from kmp_flag_i32_lt::execute_tasks() in kmp_wait_release.h.
+int __kmp_execute_tasks_i32_lt(
+    kmp_info_t *thread, kmp_int32 gtid, kmp_flag_i32_lt *flag, int final_spin,
+    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+    kmp_int32 is_constrained) {
+  return __kmp_execute_tasks_template(
+      thread, gtid, flag, final_spin,
+      thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+}
+
 template int
 __kmp_execute_tasks_32<false, false>(kmp_info_t *, kmp_int32,
                                      kmp_flag_32<false, false> *, int,
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
index 12d5d0677a90a2..ea8779ccf06b0e 100644
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -852,6 +852,28 @@ class kmp_flag_32 : public kmp_flag_atomic<kmp_uint32, flag32, Sleepable> {
   flag_type get_ptr_type() { return flag32; }
 };
 
+class kmp_flag_i32_lt {
+    protected:
+        std::atomic<kmp_int32> * loc;
+        kmp_int32 checker;
+
+    public:
+        kmp_flag_i32_lt(std::atomic<kmp_int32> *p, kmp_int32 c)
+            : loc(p), checker(c) {}
+
+        int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
+                int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+                kmp_int32 is_constrained) {
+            return __kmp_execute_tasks_i32_lt(
+                    this_thr, gtid, this, final_spin,
+                    thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+        }
+
+        bool done_check() {
+            return loc->load() <= checker;
+        }
+};
+
 template <bool Cancellable, bool Sleepable>
 class kmp_flag_64 : public kmp_flag_native<kmp_uint64, flag64, Sleepable> {
 public:

>From e695ac38d4e552c5e22c3f3bfaee5728016ac0ba Mon Sep 17 00:00:00 2001
From: Romain PEREIRA <romain.pereira at inria.fr>
Date: Sun, 10 Mar 2024 01:33:52 +0100
Subject: [PATCH 7/8] Throttling, global and per-task parameters implemented,
 tested under unit tests and epcc+fib

---
 .../plugins-nextgen/amdgpu/src/rtl.cpp        |  4 +-
 openmp/runtime/src/kmp.h                      |  7 +-
 openmp/runtime/src/kmp_config.h.cmake         |  2 +
 openmp/runtime/src/kmp_global.cpp             |  7 +-
 openmp/runtime/src/kmp_settings.cpp           | 75 +++++++++++--------
 openmp/runtime/src/kmp_tasking.cpp            | 61 ++++++++-------
 openmp/runtime/src/kmp_wait_release.h         | 34 ++++-----
 openmp/runtime/test/tasking/omp_throttling.h  | 27 ++++---
 .../tasking/omp_throttling_max_children.c     |  8 +-
 .../test/tasking/omp_throttling_max_global.c  |  8 +-
 .../omp_throttling_max_ready_per_thread.c     |  8 +-
 11 files changed, 129 insertions(+), 112 deletions(-)

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 2a1bfdc9fbad38..81634ae1edc490 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -1899,8 +1899,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // Get the frequency of the steady clock. If the attribute is missing
     // assume running on an older libhsa and default to 0, omp_get_wtime
     // will be inaccurate but otherwise programs can still run.
-//    if (auto Err = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
-//                                    ClockFrequency))
+    if (auto Err = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY,
+                                    ClockFrequency))
       ClockFrequency = 0;
 
     // Load the grid values dependending on the wavefront.
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index cbd4d63a3476b4..096146ce43cebf 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -117,10 +117,6 @@ class kmp_stats_list;
 #include <xmmintrin.h>
 #endif
 
-#ifdef KMP_TASK_THROTTLING_GLOBAL
-# error "KMP_TASK_THROTTLING_GLOBAL SET !!"
-#endif
-
 // The below has to be defined before including "kmp_barrier.h".
 #define KMP_INTERNAL_MALLOC(sz) malloc(sz)
 #define KMP_INTERNAL_FREE(p) free(p)
@@ -4675,14 +4671,13 @@ int __kmp_execute_tasks_oncore(kmp_info_t *thread, kmp_int32 gtid,
                                kmp_int32 is_constrained);
 
 int __kmp_execute_tasks_i32_lt(kmp_info_t *thread, kmp_int32 gtid,
-                               kmp_flag_i32_lt * flag, int final_spin,
+                               kmp_flag_i32_lt *flag, int final_spin,
                                int *thread_finished,
 #if USE_ITT_BUILD
                                void *itt_sync_obj,
 #endif /* USE_ITT_BUILD */
                                kmp_int32 is_constrained);
 
-
 extern int __kmp_nesting_mode;
 extern int __kmp_nesting_mode_nlevels;
 extern int *__kmp_nesting_nth_level;
diff --git a/openmp/runtime/src/kmp_config.h.cmake b/openmp/runtime/src/kmp_config.h.cmake
index b0cd0ed296e7f6..82782cfee1e293 100644
--- a/openmp/runtime/src/kmp_config.h.cmake
+++ b/openmp/runtime/src/kmp_config.h.cmake
@@ -48,6 +48,8 @@
 #define OMPD_SUPPORT LIBOMP_OMPD_SUPPORT
 #cmakedefine01 LIBOMP_OMPX_TASKGRAPH
 #define OMPX_TASKGRAPH LIBOMP_OMPX_TASKGRAPH
+#cmakedefine01 LIBOMP_KMP_TASK_THROTTLING_GLOBAL
+#define KMP_TASK_THROTTLING_GLOBAL LIBOMP_KMP_TASK_THROTTLING_GLOBAL
 #cmakedefine01 LIBOMP_PROFILING_SUPPORT
 #define OMP_PROFILING_SUPPORT LIBOMP_PROFILING_SUPPORT
 #cmakedefine01 LIBOMP_OMPT_OPTIONAL
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index da5925bee46036..25e6c8a3746d89 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -361,13 +361,12 @@ int __kmp_task_stealing_constraint = 1; /* Constrain task stealing by default */
  *  - n of ready-tasks in the current thread queue ('per-thread')
  *  - n of children tasks of for a parent task ('children')
  */
-kmp_int32 __kmp_enable_task_throttling                  = 1;
+kmp_int32 __kmp_enable_task_throttling = 1;
 #if KMP_TASK_THROTTLING_GLOBAL
-kmp_int32 __kmp_enable_task_throttling_global           = 0;
+kmp_int32 __kmp_enable_task_throttling_global = 0;
 #endif /* KMP_TASK_THROTTLING_GLOBAL */
 kmp_int32 __kmp_enable_task_throttling_ready_per_thread = 1;
-kmp_int32 __kmp_enable_task_throttling_children         = 0;
-
+kmp_int32 __kmp_enable_task_throttling_children = 0;
 
 #if KMP_TASK_THROTTLING_GLOBAL
 std::atomic<kmp_int32> __kmp_n_tasks_in_flight = 0;
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index d533cc47d82ec3..4fb30795d8f81c 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -5363,57 +5363,65 @@ static void __kmp_stg_print_task_throttling(kmp_str_buf_t *buffer,
 // -----------------------------------------------------------------------------
 // KMP_ENABLE_TASK_THROTTLING_GLOBAL
 
-# if KMP_TASK_THROTTLING_GLOBAL
-static void __kmp_stg_parse_task_throttling_global(char const *name, char const *value,
-                                            void *data) {
+#if KMP_TASK_THROTTLING_GLOBAL
+static void __kmp_stg_parse_task_throttling_global(char const *name,
+                                                   char const *value,
+                                                   void *data) {
   __kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling_global);
 } // __kmp_stg_parse_task_throttling_global
 
 static void __kmp_stg_print_task_throttling_global(kmp_str_buf_t *buffer,
-                                            char const *name, void *data) {
+                                                   char const *name,
+                                                   void *data) {
   __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling_global);
 } // __kmp_stg_print_task_throttling_global
-# endif /* KMP_TASK_THROTTLING_GLOBAL */
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
 
 // -----------------------------------------------------------------------------
 // KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD
 
-static void __kmp_stg_parse_task_throttling_ready_per_thread(char const *name, char const *value,
-                                            void *data) {
-  __kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling_ready_per_thread);
+static void __kmp_stg_parse_task_throttling_ready_per_thread(char const *name,
+                                                             char const *value,
+                                                             void *data) {
+  __kmp_stg_parse_bool(name, value,
+                       &__kmp_enable_task_throttling_ready_per_thread);
 } // __kmp_stg_parse_task_throttling_ready_per_thread
 
-static void __kmp_stg_print_task_throttling_ready_per_thread(kmp_str_buf_t *buffer,
-                                            char const *name, void *data) {
-  __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling_ready_per_thread);
+static void
+__kmp_stg_print_task_throttling_ready_per_thread(kmp_str_buf_t *buffer,
+                                                 char const *name, void *data) {
+  __kmp_stg_print_bool(buffer, name,
+                       __kmp_enable_task_throttling_ready_per_thread);
 } // __kmp_stg_print_task_throttling_ready_per_thread
 
 // -----------------------------------------------------------------------------
 // KMP_ENABLE_TASK_THROTTLING_CHILDREN
 
-static void __kmp_stg_parse_task_throttling_children(char const *name, char const *value,
-                                            void *data) {
+static void __kmp_stg_parse_task_throttling_children(char const *name,
+                                                     char const *value,
+                                                     void *data) {
   __kmp_stg_parse_bool(name, value, &__kmp_enable_task_throttling_children);
 } // __kmp_stg_parse_task_throttling_children
 
 static void __kmp_stg_print_task_throttling_children(kmp_str_buf_t *buffer,
-                                            char const *name, void *data) {
+                                                     char const *name,
+                                                     void *data) {
   __kmp_stg_print_bool(buffer, name, __kmp_enable_task_throttling_children);
 } // __kmp_stg_print_task_throttling_children
 
 // -----------------------------------------------------------------------------
 // KMP_TASK_MAXIMUM_GLOBAL
-# if KMP_TASK_THROTTLING_GLOBAL
-static void __kmp_stg_parse_task_maximum_global(char const *name, char const *value,
-                                         void *data) {
+#if KMP_TASK_THROTTLING_GLOBAL
+static void __kmp_stg_parse_task_maximum_global(char const *name,
+                                                char const *value, void *data) {
   __kmp_stg_parse_int(name, value, 1, INT_MAX, &__kmp_task_maximum_global);
 } // __kmp_stg_parse_task_maximum_global
 
 static void __kmp_stg_print_task_maximum_global(kmp_str_buf_t *buffer,
-                                         char const *name, void *data) {
+                                                char const *name, void *data) {
   __kmp_stg_print_int(buffer, name, __kmp_task_maximum_global);
 } // __kmp_stg_print_task_maximum_global
-# endif /* KMP_TASK_THROTTLING_GLOBAL */
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
 
 // -----------------------------------------------------------------------------
 // KMP_TASK_MAXIMUM_READY_PER_THREAD
@@ -5432,17 +5440,18 @@ static void __kmp_stg_print_task_maximum_ready_per_thread(kmp_str_buf_t *buffer,
 
 // -----------------------------------------------------------------------------
 // KMP_TASK_MAXIMUM_CHILDREN
-static void __kmp_stg_parse_task_maximum_children(char const *name, char const *value,
-                                         void *data) {
+static void __kmp_stg_parse_task_maximum_children(char const *name,
+                                                  char const *value,
+                                                  void *data) {
   __kmp_stg_parse_int(name, value, 1, INT_MAX, &__kmp_task_maximum_children);
 } // __kmp_stg_parse_task_maximum_children
 
 static void __kmp_stg_print_task_maximum_children(kmp_str_buf_t *buffer,
-                                         char const *name, void *data) {
+                                                  char const *name,
+                                                  void *data) {
   __kmp_stg_print_int(buffer, name, __kmp_task_maximum_children);
 } // __kmp_stg_print_task_maximum_children
 
-
 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
 // -----------------------------------------------------------------------------
 // KMP_USER_LEVEL_MWAIT
@@ -5833,23 +5842,27 @@ static kmp_setting_t __kmp_stg_table[] = {
 
     {"KMP_ENABLE_TASK_THROTTLING", __kmp_stg_parse_task_throttling,
      __kmp_stg_print_task_throttling, NULL, 0, 0},
-    {"KMP_ENABLE_TASK_THROTTLING_CHILDREN", __kmp_stg_parse_task_throttling_children,
+    {"KMP_ENABLE_TASK_THROTTLING_CHILDREN",
+     __kmp_stg_parse_task_throttling_children,
      __kmp_stg_print_task_throttling_children, NULL, 0, 0},
-    {"KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD", __kmp_stg_parse_task_throttling_ready_per_thread,
+    {"KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD",
+     __kmp_stg_parse_task_throttling_ready_per_thread,
      __kmp_stg_print_task_throttling_ready_per_thread, NULL, 0, 0},
-# if KMP_TASK_THROTTLING_GLOBAL
-    {"KMP_ENABLE_TASK_THROTTLING_GLOBAL", __kmp_stg_parse_task_throttling_global,
+#if KMP_TASK_THROTTLING_GLOBAL
+    {"KMP_ENABLE_TASK_THROTTLING_GLOBAL",
+     __kmp_stg_parse_task_throttling_global,
      __kmp_stg_print_task_throttling_global, NULL, 0, 0},
-# endif /* KMP_TASK_THROTTLING_GLOBAL */
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
 
     {"KMP_TASK_MAXIMUM_CHILDREN", __kmp_stg_parse_task_maximum_children,
      __kmp_stg_print_task_maximum_children, NULL, 0, 0},
-    {"KMP_TASK_MAXIMUM_READY_PER_THREAD", __kmp_stg_parse_task_maximum_ready_per_thread,
+    {"KMP_TASK_MAXIMUM_READY_PER_THREAD",
+     __kmp_stg_parse_task_maximum_ready_per_thread,
      __kmp_stg_print_task_maximum_ready_per_thread, NULL, 0, 0},
-# if KMP_TASK_THROTTLING_GLOBAL
+#if KMP_TASK_THROTTLING_GLOBAL
     {"KMP_TASK_MAXIMUM_GLOBAL", __kmp_stg_parse_task_maximum_global,
      __kmp_stg_print_task_maximum_global, NULL, 0, 0},
-# endif /* KMP_TASK_THROTTLING_GLOBAL */
+#endif /* KMP_TASK_THROTTLING_GLOBAL */
 
     {"OMP_DISPLAY_ENV", __kmp_stg_parse_omp_display_env,
      __kmp_stg_print_omp_display_env, NULL, 0, 0},
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 7b9c7d2569184a..8631ae32f8ebf7 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -543,9 +543,13 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
 
   int locked = 0;
   // Check if deque is full and needs to be expanded
-  int requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td);
+  int requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >=
+                        TASK_DEQUE_SIZE(thread_data->td);
   // Check if dequeue has too many tasks and needs throttling
-  int requires_throttling = __kmp_enable_task_throttling && __kmp_enable_task_throttling_ready_per_thread && TCR_4(thread_data->td.td_deque_ntasks) >= __kmp_task_maximum_ready_per_thread;
+  int requires_throttling = __kmp_enable_task_throttling &&
+                            __kmp_enable_task_throttling_ready_per_thread &&
+                            TCR_4(thread_data->td.td_deque_ntasks) >=
+                                __kmp_task_maximum_ready_per_thread;
   if (requires_resize || requires_throttling) {
     int thread_can_execute =
         __kmp_task_is_allowed(gtid, __kmp_task_stealing_constraint, taskdata,
@@ -571,7 +575,8 @@ static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
     // Need to recheck as we can get a proxy task from thread outside of OpenMP
     requires_resize = TCR_4(thread_data->td.td_deque_ntasks) >=
                       TASK_DEQUE_SIZE(thread_data->td);
-    requires_throttling = __kmp_enable_task_throttling && __kmp_enable_task_throttling_ready_per_thread &&
+    requires_throttling = __kmp_enable_task_throttling &&
+                          __kmp_enable_task_throttling_ready_per_thread &&
                           TCR_4(thread_data->td.td_deque_ntasks) >=
                               __kmp_task_maximum_ready_per_thread;
     if (requires_resize || requires_throttling) {
@@ -1451,6 +1456,33 @@ static size_t __kmp_round_up_to_val(size_t size, size_t val) {
   return size;
 } // __kmp_round_up_to_va
 
+// __kmp_empty_queues: throttling helper that repeatedly calls
+// kmp_flag_i32_lt::execute_tasks() for as long as *counter is greater than or
+// equal to threshold.
+//
+// thread, gtid: forwarded unchanged to execute_tasks()
+// counter: atomic counter to watch; re-read via KMP_ATOMIC_LD_ACQ on each pass
+// threshold: the loop ends once *counter is observed strictly below this value
+//
+// NOTE(review): nothing in this loop yields or sleeps by itself; presumably
+// execute_tasks() makes progress by running queued tasks -- confirm.
+static inline void __kmp_empty_queues(kmp_info_t *thread, kmp_int32 gtid,
+                                      std::atomic<kmp_int32> *counter,
+                                      kmp_int32 threshold) {
+  // Out-parameter required by execute_tasks(); its value is never read here.
+  int thread_finished = FALSE;
+#if USE_ITT_BUILD
+  void *itt_sync_obj = NULL;
+#endif /* USE_ITT_BUILD */
+
+  kmp_flag_i32_lt flag(counter, threshold);
+  while (KMP_ATOMIC_LD_ACQ(counter) >= threshold) {
+    // final_spin = FALSE, is_constrained = 0
+    flag.execute_tasks(thread, gtid, FALSE,
+                       &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 0);
+  }
+}
+
 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
 //
 // loc_ref: source location information
@@ -1481,29 +1501,18 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   // few before allocating a new one
   if (__kmp_enable_task_throttling) {
 #if KMP_TASK_THROTTLING_GLOBAL
-      if (__kmp_enable_task_throttling_global) {
-          // empty queues
-         kmp_flag_i32_lt flag(&__kmp_n_tasks_in_flight, __kmp_task_maximum_global);
-         while (KMP_ATOMIC_LD_ACQ(&__kmp_n_tasks_in_flight) >= __kmp_task_maximum_global) {
-             flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 0);
-         }
-
-          // we can now allocate a new task, increase counter
-          ++__kmp_n_tasks_in_flight;
-      }
+    if (__kmp_enable_task_throttling_global) {
+      __kmp_empty_queues(thread, gtid, &__kmp_n_tasks_in_flight,
+                         __kmp_task_maximum_global);
+      // allocating a new task now, increase the global counter
+      ++__kmp_n_tasks_in_flight;
+    }
 #endif /* KMP_TASK_THROTTLING_GLOBAL */
 
-      if (__kmp_enable_task_throttling_children) {
-         int thread_finished = FALSE;
-#if USE_ITT_BUILD
-         void *itt_sync_obj = NULL;
-#endif /* USE_ITT_BUILD */
-
-         kmp_flag_i32_lt flag(&parent_task->td_incomplete_child_tasks, __kmp_task_maximum_children);
-         while (KMP_ATOMIC_LD_ACQ(&parent_task->td_incomplete_child_tasks) >= __kmp_task_maximum_children) {
-             flag.execute_tasks(thread, gtid, FALSE, &thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), 0);
-         }
-      }
+    if (__kmp_enable_task_throttling_children) {
+      __kmp_empty_queues(thread, gtid, &parent_task->td_incomplete_child_tasks,
+                         __kmp_task_maximum_children);
+    }
   } /* __kmp_enable_task_throttling */
 
   if (flags->hidden_helper) {
@@ -1696,7 +1705,9 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
     // Only need to keep track of allocated child tasks for explicit tasks since
     // implicit not deallocated; or if throttling is enabled
-    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT || (__kmp_enable_task_throttling && __kmp_enable_task_throttling_children)) {
+    if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT ||
+        (__kmp_enable_task_throttling &&
+         __kmp_enable_task_throttling_children)) {
       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
     }
     if (flags->hidden_helper) {
diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h
index ea8779ccf06b0e..ad9b082cfe4185 100644
--- a/openmp/runtime/src/kmp_wait_release.h
+++ b/openmp/runtime/src/kmp_wait_release.h
@@ -853,25 +853,30 @@ class kmp_flag_32 : public kmp_flag_atomic<kmp_uint32, flag32, Sleepable> {
 };
 
+// Flag over a signed 32-bit atomic counter, used by the task throttling loops.
+// It reports "done" once the counter is strictly less than ("lt") the checker
+// value, mirroring the `counter >= threshold` wait condition of its callers.
 class kmp_flag_i32_lt {
-    protected:
-        std::atomic<kmp_int32> * loc;
-        kmp_int32 checker;
-
-    public:
-        kmp_flag_i32_lt(std::atomic<kmp_int32> *p, kmp_int32 c)
-            : loc(p), checker(c) {}
-
-        int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
-                int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
-                kmp_int32 is_constrained) {
-            return __kmp_execute_tasks_i32_lt(
-                    this_thr, gtid, this, final_spin,
-                    thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
-        }
+protected:
+  std::atomic<kmp_int32> *loc; // counter being watched
+  kmp_int32 checker; // threshold the counter is compared against
 
-        bool done_check() {
-            return loc->load() <= checker;
-        }
+public:
+  kmp_flag_i32_lt(std::atomic<kmp_int32> *p, kmp_int32 c)
+      : loc(p), checker(c) {}
+
+  // Forwards to __kmp_execute_tasks_i32_lt with this object as the flag.
+  int execute_tasks(kmp_info_t *this_thr, kmp_int32 gtid, int final_spin,
+                    int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
+                    kmp_int32 is_constrained) {
+    return __kmp_execute_tasks_i32_lt(
+        this_thr, gtid, this, final_spin,
+        thread_finished USE_ITT_BUILD_ARG(itt_sync_obj), is_constrained);
+  }
+
+  // True once the counter has dropped strictly below the threshold. Using
+  // `<=` here would report completion while callers still observe
+  // `counter >= threshold` and keep waiting.
+  bool done_check() { return loc->load() < checker; }
 };
 
 template <bool Cancellable, bool Sleepable>
diff --git a/openmp/runtime/test/tasking/omp_throttling.h b/openmp/runtime/test/tasking/omp_throttling.h
index 4b91a26e2f5330..be4c27161a381f 100644
--- a/openmp/runtime/test/tasking/omp_throttling.h
+++ b/openmp/runtime/test/tasking/omp_throttling.h
@@ -1,9 +1,9 @@
 /**
- *  This test ensures that task throttling works properly
+ *  This test ensures that a given task throttling mode works
  *
  *  It creates 2 threads (1 producer, 1 consummer)
  *  The producer infinitely create tasks 'T_i' until one executed
- *  The consumer is blocked until the producer starts throttling
+ *  The consumer is blocked until any 'T_i' has executed
  *  Executing any 'T_i' unblocks the consumer and stop the producer
  *
  *  The assertion tests ensures that the producer does not create more than the
@@ -22,14 +22,15 @@ int main(void) {
   assert(throttling_str);
   assert(*throttling_str == '1');
 
-  /* maximum number of tasks in-flight */
+  /* retrieve the maximum number of tasks for this specific throttling parameter
+   */
   char *max_tasks_str = getenv(MAX_ENV_VAR);
   assert(max_tasks_str);
   int max_tasks = atoi(max_tasks_str);
   if (max_tasks <= 0)
     max_tasks = 1;
 
-  /* check that throttling is enabled (disabled by default) */
+  /* check that the specific throttling mode under test is enabled */
   throttling_str = getenv(ENABLE_ENV_VAR);
   assert(throttling_str);
   assert(*throttling_str == '1');
@@ -37,8 +38,8 @@ int main(void) {
   volatile int done = 0;
   int ntasks = 0;
 
-/* testing KMP_TASK_MAXIMUM */
-#pragma omp parallel num_threads(2) default(none) shared(max_tasks, done, ntasks)
+#pragma omp parallel num_threads(2) default(none)                              \
+    shared(max_tasks, done, ntasks)
   {
     if (omp_get_thread_num() == 1)
       while (!done)
@@ -47,14 +48,12 @@ int main(void) {
 #pragma omp single
     {
       while (!done) {
-# if USE_DEPS
-            # pragma omp task default(none) shared(done) depend(out : max_tasks)
-# else
-            # pragma omp task default(none) shared(done)
-# endif
-            {
-                done = 1;
-            }
+#if USE_DEPS
+#pragma omp task default(none) shared(done) depend(out : max_tasks)
+#else
+#pragma omp task default(none) shared(done)
+#endif
+        { done = 1; }
 
         assert(++ntasks <= max_tasks + 1);
       }
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_children.c b/openmp/runtime/test/tasking/omp_throttling_max_children.c
index fc7131e691d11d..f7ef3405c44031 100644
--- a/openmp/runtime/test/tasking/omp_throttling_max_children.c
+++ b/openmp/runtime/test/tasking/omp_throttling_max_children.c
@@ -5,8 +5,8 @@
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=0 KMP_ENABLE_TASK_THROTTLING_CHILDREN=1 KMP_TASK_MAXIMUM_CHILDREN=100000 %libomp-run
 // clang-format on
 
-# define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_CHILDREN")
-# define MAX_ENV_VAR    ("KMP_TASK_MAXIMUM_CHILDREN")
-# define USE_DEPS       (1)
+#define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_CHILDREN")
+#define MAX_ENV_VAR ("KMP_TASK_MAXIMUM_CHILDREN")
+#define USE_DEPS (1)
 
-# include "omp_throttling.h"
+#include "omp_throttling.h"
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_global.c b/openmp/runtime/test/tasking/omp_throttling_max_global.c
index a553906a35417e..600c78a1ee9ab7 100644
--- a/openmp/runtime/test/tasking/omp_throttling_max_global.c
+++ b/openmp/runtime/test/tasking/omp_throttling_max_global.c
@@ -6,8 +6,8 @@
 // REQUIRES: kmp_task_throttling_global
 // clang-format on
 
-# define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_GLOBAL")
-# define MAX_ENV_VAR    ("KMP_TASK_MAXIMUM_GLOBAL")
-# define USE_DEPS       (1)
+#define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_GLOBAL")
+#define MAX_ENV_VAR ("KMP_TASK_MAXIMUM_GLOBAL")
+#define USE_DEPS (1)
 
-# include "omp_throttling.h"
+#include "omp_throttling.h"
diff --git a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
index 0d0bd16672a583..7e5d94a7e36f72 100644
--- a/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
+++ b/openmp/runtime/test/tasking/omp_throttling_max_ready_per_thread.c
@@ -5,8 +5,8 @@
 // RUN: %libomp-compile && env OMP_NUM_THREADS=2 KMP_ENABLE_TASK_THROTTLING=1 KMP_ENABLE_TASK_THROTTLING_GLOBAL=0 KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD=1 KMP_ENABLE_TASK_THROTTLING_CHILDREN=0 KMP_TASK_MAXIMUM_READY_PER_THREAD=100000 %libomp-run
 // clang-format on
 
-# define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD")
-# define MAX_ENV_VAR    ("KMP_TASK_MAXIMUM_READY_PER_THREAD")
-# define USE_DEPS       (0)
+#define ENABLE_ENV_VAR ("KMP_ENABLE_TASK_THROTTLING_READY_PER_THREAD")
+#define MAX_ENV_VAR ("KMP_TASK_MAXIMUM_READY_PER_THREAD")
+#define USE_DEPS (0)
 
-# include "omp_throttling.h"
+#include "omp_throttling.h"

>From 4fe15804b992f359cfb7bc3263ed643e6ef369d4 Mon Sep 17 00:00:00 2001
From: Romain PEREIRA <romain.pereira at inria.fr>
Date: Sun, 10 Mar 2024 02:10:06 +0100
Subject: [PATCH 8/8] Reverted 'inline' removal from
 __kmp_execute_tasks_template mistakenly pushed

---
 openmp/runtime/src/kmp_tasking.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 8631ae32f8ebf7..2306e21b7ea95d 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3410,7 +3410,7 @@ static kmp_task_t *__kmp_steal_task(kmp_info_t *victim_thr, kmp_int32 gtid,
 // spinner == NULL means only execute a single task and return.
 // checker is the value to check to terminate the spin.
 template <class C>
-static int __kmp_execute_tasks_template(
+static inline int __kmp_execute_tasks_template(
     kmp_info_t *thread, kmp_int32 gtid, C *flag, int final_spin,
     int *thread_finished USE_ITT_BUILD_ARG(void *itt_sync_obj),
     kmp_int32 is_constrained) {



More information about the Openmp-commits mailing list