[Openmp-commits] [clang] [llvm] [openmp] [OpenMP] OpenMP 6.0 taskgraph support (WIP) (PR #188765)

Julian Brown via Openmp-commits openmp-commits at lists.llvm.org
Tue Jun 23 03:35:58 PDT 2026


https://github.com/jtb20 updated https://github.com/llvm/llvm-project/pull/188765

>From 4d8a14c7189fbd0a1a5c2d6bcc5105e248bed679 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Fri, 27 Mar 2026 11:23:09 -0500
Subject: [PATCH 01/24] [OpenMP] Mostly remove experimental taskgraph support
 from runtime

Because the changes to runtime support for taskgraph record/replay
are quite extensive, they will likely be more comprehensible if the
existing parts are removed before the new parts are added.  This is the
removal part.

commit-id:ebbd0706

Pull Request: https://github.com/llvm/llvm-project/pull/194046
---
 openmp/runtime/src/kmp.h                      |  72 ---
 openmp/runtime/src/kmp_global.cpp             |  12 -
 openmp/runtime/src/kmp_settings.cpp           |  28 --
 openmp/runtime/src/kmp_taskdeps.cpp           | 128 +----
 openmp/runtime/src/kmp_taskdeps.h             |  25 +-
 openmp/runtime/src/kmp_tasking.cpp            | 451 +-----------------
 .../test/tasking/omp_record_replay.cpp        |  48 --
 .../test/tasking/omp_record_replay_deps.cpp   |  63 ---
 .../omp_record_replay_deps_multi_succ.cpp     |  56 ---
 .../tasking/omp_record_replay_multiTDGs.cpp   |  76 ---
 .../tasking/omp_record_replay_print_dot.cpp   |  80 ----
 .../tasking/omp_record_replay_taskloop.cpp    |  50 --
 12 files changed, 6 insertions(+), 1083 deletions(-)
 delete mode 100644 openmp/runtime/test/tasking/omp_record_replay.cpp
 delete mode 100644 openmp/runtime/test/tasking/omp_record_replay_deps.cpp
 delete mode 100644 openmp/runtime/test/tasking/omp_record_replay_deps_multi_succ.cpp
 delete mode 100644 openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp
 delete mode 100644 openmp/runtime/test/tasking/omp_record_replay_print_dot.cpp
 delete mode 100644 openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index fbeb58fab1d16..df98f03c7dcdc 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2632,26 +2632,6 @@ typedef struct {
 } kmp_event_t;
 
 #if OMP_TASKGRAPH_EXPERIMENTAL
-// Initial number of allocated nodes while recording
-#define INIT_MAPSIZE 50
-
-typedef struct kmp_taskgraph_flags { /*This needs to be exactly 32 bits */
-  unsigned nowait : 1;
-  unsigned re_record : 1;
-  unsigned reserved : 30;
-} kmp_taskgraph_flags_t;
-
-/// Represents a TDG node
-typedef struct kmp_node_info {
-  kmp_task_t *task; // Pointer to the actual task
-  kmp_int32 *successors; // Array of the succesors ids
-  kmp_int32 nsuccessors; // Number of succesors of the node
-  std::atomic<kmp_int32>
-      npredecessors_counter; // Number of predessors on the fly
-  kmp_int32 npredecessors; // Total number of predecessors
-  kmp_int32 successors_size; // Number of allocated succesors ids
-  kmp_taskdata_t *parent_task; // Parent implicit task
-} kmp_node_info_t;
 
 /// Represent a TDG's current status
 typedef enum kmp_tdg_status {
@@ -2660,43 +2640,12 @@ typedef enum kmp_tdg_status {
   KMP_TDG_READY = 2
 } kmp_tdg_status_t;
 
-/// Structure that contains a TDG
-typedef struct kmp_tdg_info {
-  kmp_int32 tdg_id; // Unique idenfifier of the TDG
-  kmp_taskgraph_flags_t tdg_flags; // Flags related to a TDG
-  kmp_int32 map_size; // Number of allocated TDG nodes
-  kmp_int32 num_roots; // Number of roots tasks int the TDG
-  kmp_int32 *root_tasks; // Array of tasks identifiers that are roots
-  kmp_node_info_t *record_map; // Array of TDG nodes
-  kmp_tdg_status_t tdg_status =
-      KMP_TDG_NONE; // Status of the TDG (recording, ready...)
-  std::atomic<kmp_int32> num_tasks; // Number of TDG nodes
-  kmp_bootstrap_lock_t
-      graph_lock; // Protect graph attributes when updated via taskloop_recur
-  // Taskloop reduction related
-  void *rec_taskred_data; // Data to pass to __kmpc_task_reduction_init or
-                          // __kmpc_taskred_init
-  kmp_int32 rec_num_taskred;
-} kmp_tdg_info_t;
-
-extern int __kmp_tdg_dot;
-extern kmp_int32 __kmp_max_tdgs;
-extern kmp_tdg_info_t **__kmp_global_tdgs;
-extern kmp_int32 __kmp_curr_tdg_idx;
-extern kmp_int32 __kmp_successors_size;
-extern std::atomic<kmp_int32> __kmp_tdg_task_id;
-extern kmp_int32 __kmp_num_tdg;
 #endif
 
 typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
 #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
   /* Same fields as in the #else branch, but in reverse order */
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  unsigned reserved31 : 4;
-  unsigned onced : 1;
-#else
   unsigned reserved31 : 5;
-#endif
   unsigned hidden_helper : 1;
   unsigned target : 1;
   unsigned native : 1;
@@ -2752,13 +2701,8 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
   unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
   unsigned target : 1;
   unsigned hidden_helper : 1; /* 1 == hidden helper task */
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  unsigned onced : 1; /* 1==ran once already, 0==never ran, record & replay purposes */
-  unsigned reserved31 : 4; /* reserved for library use */
-#else
   unsigned reserved31 : 5; /* reserved for library use */
 #endif
-#endif
 } kmp_tasking_flags_t;
 
 typedef struct kmp_target_data {
@@ -2808,9 +2752,6 @@ struct kmp_taskdata { /* aligned during dynamic allocation       */
   ompt_task_info_t ompt_task_info;
 #endif
 #if OMP_TASKGRAPH_EXPERIMENTAL
-  bool is_taskgraph = 0; // whether the task is within a TDG
-  kmp_tdg_info_t *tdg; // used to associate task with a TDG
-  kmp_int32 td_tdg_task_id; // local task id in its TDG
 #endif
   kmp_target_data_t td_target_data;
 }; // struct kmp_taskdata
@@ -4385,20 +4326,7 @@ KMP_EXPORT void __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid,
 KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
                                                 void **user_lock,
                                                 uintptr_t hint);
-
 #if OMP_TASKGRAPH_EXPERIMENTAL
-// Taskgraph's Record & Replay mechanism
-// __kmp_tdg_is_recording: check whether a given TDG is recording
-// status: the tdg's current status
-static inline bool __kmp_tdg_is_recording(kmp_tdg_status_t status) {
-  return status == KMP_TDG_RECORDING;
-}
-
-KMP_EXPORT kmp_int32 __kmpc_start_record_task(ident_t *loc, kmp_int32 gtid,
-                                              kmp_int32 input_flags,
-                                              kmp_int32 tdg_id);
-KMP_EXPORT void __kmpc_end_record_task(ident_t *loc, kmp_int32 gtid,
-                                       kmp_int32 input_flags, kmp_int32 tdg_id);
 #endif
 /* Interface to fast scalable reduce methods routines */
 
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 15b9babfaf0ba..3df46baa57544 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -553,17 +553,5 @@ int __kmp_nesting_mode = 0;
 int __kmp_nesting_mode_nlevels = 1;
 int *__kmp_nesting_nth_level;
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-// TDG record & replay
-int __kmp_tdg_dot = 0;
-kmp_int32 __kmp_max_tdgs = 100;
-kmp_tdg_info_t **__kmp_global_tdgs = NULL;
-kmp_int32 __kmp_curr_tdg_idx =
-    0; // Id of the current TDG being recorded or executed
-kmp_int32 __kmp_num_tdg = 0;
-kmp_int32 __kmp_successors_size = 10; // Initial succesor size list for
-                                      // recording
-std::atomic<kmp_int32> __kmp_tdg_task_id = 0;
-#endif
 // end of file //
 
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index 66ef6f8097dce..615058af9705f 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -1266,28 +1266,6 @@ static void __kmp_stg_parse_num_threads(char const *name, char const *value,
   K_DIAG(1, ("__kmp_dflt_team_nth == %d\n", __kmp_dflt_team_nth));
 } // __kmp_stg_parse_num_threads
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-static void __kmp_stg_parse_max_tdgs(char const *name, char const *value,
-                                     void *data) {
-  __kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_max_tdgs);
-} // __kmp_stg_parse_max_tdgs
-
-static void __kmp_std_print_max_tdgs(kmp_str_buf_t *buffer, char const *name,
-                                     void *data) {
-  __kmp_stg_print_int(buffer, name, __kmp_max_tdgs);
-} // __kmp_std_print_max_tdgs
-
-static void __kmp_stg_parse_tdg_dot(char const *name, char const *value,
-                                   void *data) {
-  __kmp_stg_parse_bool(name, value, &__kmp_tdg_dot);
-} // __kmp_stg_parse_tdg_dot
-
-static void __kmp_stg_print_tdg_dot(kmp_str_buf_t *buffer, char const *name,
-                                   void *data) {
-  __kmp_stg_print_bool(buffer, name, __kmp_tdg_dot);
-} // __kmp_stg_print_tdg_dot
-#endif
-
 static void __kmp_stg_parse_num_hidden_helper_threads(char const *name,
                                                       char const *value,
                                                       void *data) {
@@ -5742,12 +5720,6 @@ static kmp_setting_t __kmp_stg_table[] = {
     {"LIBOMP_NUM_HIDDEN_HELPER_THREADS",
      __kmp_stg_parse_num_hidden_helper_threads,
      __kmp_stg_print_num_hidden_helper_threads, NULL, 0, 0},
-#if OMP_TASKGRAPH_EXPERIMENTAL
-    {"KMP_MAX_TDGS", __kmp_stg_parse_max_tdgs, __kmp_std_print_max_tdgs, NULL,
-     0, 0},
-    {"KMP_TDG_DOT", __kmp_stg_parse_tdg_dot, __kmp_stg_print_tdg_dot, NULL, 0,
-     0},
-#endif
 
 #if OMPT_SUPPORT
     {"OMP_TOOL", __kmp_stg_parse_omp_tool, __kmp_stg_print_omp_tool, NULL, 0,
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index b1a0848fc722f..6884cd144f89e 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -222,47 +222,6 @@ static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
 static inline void __kmp_track_dependence(kmp_int32 gtid, kmp_depnode_t *source,
                                           kmp_depnode_t *sink,
                                           kmp_task_t *sink_task) {
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
-  kmp_taskdata_t *task_sink = KMP_TASK_TO_TASKDATA(sink_task);
-  if (source->dn.task && sink_task) {
-    // Not supporting dependency between two tasks that one is within the TDG
-    // and the other is not
-    KMP_ASSERT(task_source->is_taskgraph == task_sink->is_taskgraph);
-  }
-  if (task_sink->is_taskgraph &&
-      __kmp_tdg_is_recording(task_sink->tdg->tdg_status)) {
-    kmp_node_info_t *source_info =
-        &task_sink->tdg->record_map[task_source->td_tdg_task_id];
-    bool exists = false;
-    for (int i = 0; i < source_info->nsuccessors; i++) {
-      if (source_info->successors[i] == task_sink->td_tdg_task_id) {
-        exists = true;
-        break;
-      }
-    }
-    if (!exists) {
-      if (source_info->nsuccessors >= source_info->successors_size) {
-        kmp_uint old_size = source_info->successors_size;
-        source_info->successors_size = 2 * source_info->successors_size;
-        kmp_int32 *old_succ_ids = source_info->successors;
-        kmp_int32 *new_succ_ids = (kmp_int32 *)__kmp_allocate(
-            source_info->successors_size * sizeof(kmp_int32));
-        KMP_MEMCPY(new_succ_ids, old_succ_ids, old_size * sizeof(kmp_int32));
-        source_info->successors = new_succ_ids;
-        __kmp_free(old_succ_ids);
-      }
-
-      source_info->successors[source_info->nsuccessors] =
-          task_sink->td_tdg_task_id;
-      source_info->nsuccessors++;
-
-      kmp_node_info_t *sink_info =
-          &(task_sink->tdg->record_map[task_sink->td_tdg_task_id]);
-      sink_info->npredecessors++;
-    }
-  }
-#endif
 #ifdef KMP_SUPPORT_GRAPH_OUTPUT
   kmp_taskdata_t *task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
   // do not use sink->dn.task as that is only filled after the dependences
@@ -311,24 +270,11 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
   // link node as successor of list elements
   for (kmp_depnode_list_t *p = plist; p; p = p->next) {
     kmp_depnode_t *dep = p->node;
-#if OMP_TASKGRAPH_EXPERIMENTAL
-    kmp_tdg_status tdg_status = KMP_TDG_NONE;
-    if (task) {
-      kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
-      if (td->is_taskgraph)
-        tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status;
-      if (__kmp_tdg_is_recording(tdg_status))
-        __kmp_track_dependence(gtid, dep, node, task);
-    }
-#endif
     if (dep->dn.task) {
       KMP_ACQUIRE_DEPNODE(gtid, dep);
       if (dep->dn.task) {
         if (!dep->dn.successors || dep->dn.successors->node != node) {
-#if OMP_TASKGRAPH_EXPERIMENTAL
-          if (!(__kmp_tdg_is_recording(tdg_status)) && task)
-#endif
-            __kmp_track_dependence(gtid, dep, node, task);
+          __kmp_track_dependence(gtid, dep, node, task);
           dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
           KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                         "%p\n",
@@ -352,44 +298,18 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
   if (!sink)
     return 0;
   kmp_int32 npredecessors = 0;
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  kmp_tdg_status tdg_status = KMP_TDG_NONE;
-  kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(task);
-  if (task) {
-    if (td->is_taskgraph)
-      tdg_status = KMP_TASK_TO_TASKDATA(task)->tdg->tdg_status;
-    if (__kmp_tdg_is_recording(tdg_status) && sink->dn.task)
-      __kmp_track_dependence(gtid, sink, source, task);
-  }
-#endif
   if (sink->dn.task) {
     // synchronously add source to sink' list of successors
     KMP_ACQUIRE_DEPNODE(gtid, sink);
     if (sink->dn.task) {
       if (!sink->dn.successors || sink->dn.successors->node != source) {
-#if OMP_TASKGRAPH_EXPERIMENTAL
-        if (!(__kmp_tdg_is_recording(tdg_status)) && task)
-#endif
-          __kmp_track_dependence(gtid, sink, source, task);
+        __kmp_track_dependence(gtid, sink, source, task);
         sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
         KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                     "%p\n",
                     gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
                     KMP_TASK_TO_TASKDATA(task)));
-#if OMP_TASKGRAPH_EXPERIMENTAL
-        if (__kmp_tdg_is_recording(tdg_status)) {
-          kmp_taskdata_t *tdd = KMP_TASK_TO_TASKDATA(sink->dn.task);
-          if (tdd->is_taskgraph) {
-            if (tdd->td_flags.onced)
-              // decrement npredecessors if sink->dn.task belongs to a taskgraph
-              // and
-              //  1) the task is reset to its initial state (by kmp_free_task) or
-              //  2) the task is complete but not yet reset
-              npredecessors--;
-          }
-        }
-#endif
-      npredecessors++;
+        npredecessors++;
       }
     }
     KMP_RELEASE_DEPNODE(gtid, sink);
@@ -694,48 +614,6 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *current_task = thread->th.th_current_task;
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  // record TDG with deps
-  if (new_taskdata->is_taskgraph &&
-      __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
-    kmp_tdg_info_t *tdg = new_taskdata->tdg;
-    // extend record_map if needed
-    if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
-      __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
-      if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
-        kmp_uint old_size = tdg->map_size;
-        kmp_uint new_size = old_size * 2;
-        kmp_node_info_t *old_record = tdg->record_map;
-        kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
-            new_size * sizeof(kmp_node_info_t));
-        KMP_MEMCPY(new_record, tdg->record_map,
-                   old_size * sizeof(kmp_node_info_t));
-        tdg->record_map = new_record;
-
-        __kmp_free(old_record);
-
-        for (kmp_uint i = old_size; i < new_size; i++) {
-          kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
-              __kmp_successors_size * sizeof(kmp_int32));
-          new_record[i].task = nullptr;
-          new_record[i].successors = successorsList;
-          new_record[i].nsuccessors = 0;
-          new_record[i].npredecessors = 0;
-          new_record[i].successors_size = __kmp_successors_size;
-          KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
-        }
-        // update the size at the end, so that we avoid other
-        // threads use old_record while map_size is already updated
-        tdg->map_size = new_size;
-      }
-      __kmp_release_bootstrap_lock(&tdg->graph_lock);
-    }
-    tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
-    tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
-        new_taskdata->td_parent;
-    KMP_ATOMIC_INC(&tdg->num_tasks);
-  }
-#endif
 #if OMPT_SUPPORT
   if (ompt_enabled.enabled) {
     if (!current_task->ompt_task_info.frame.enter_frame.ptr)
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index 0792baf67f162..71e8e69d44593 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -95,23 +95,6 @@ static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) {
 extern void __kmpc_give_task(kmp_task_t *ptask, kmp_int32 start);
 
 static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
-
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  if (task->is_taskgraph && !(__kmp_tdg_is_recording(task->tdg->tdg_status))) {
-    kmp_node_info_t *TaskInfo = &(task->tdg->record_map[task->td_tdg_task_id]);
-
-    for (int i = 0; i < TaskInfo->nsuccessors; i++) {
-      kmp_int32 successorNumber = TaskInfo->successors[i];
-      kmp_node_info_t *successor = &(task->tdg->record_map[successorNumber]);
-      kmp_int32 npredecessors = KMP_ATOMIC_DEC(&successor->npredecessors_counter) - 1;
-      if (successor->task != nullptr && npredecessors == 0) {
-        __kmp_omp_task(gtid, successor->task, false);
-      }
-    }
-    return;
-  }
-#endif
-
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_depnode_t *node = task->td_depnode;
 
@@ -140,12 +123,8 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
                 gtid, task));
 
   KMP_ACQUIRE_DEPNODE(gtid, node);
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  if (!task->is_taskgraph ||
-      (task->is_taskgraph && !__kmp_tdg_is_recording(task->tdg->tdg_status)))
-#endif
-    node->dn.task =
-        NULL; // mark this task as finished, so no new dependencies are generated
+  node->dn.task =
+      NULL; // mark this task as finished, so no new dependencies are generated
   KMP_RELEASE_DEPNODE(gtid, node);
 
   kmp_depnode_list_t *next;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index be42ed6f76abc..23092927babc1 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -37,10 +37,6 @@ static void __kmp_alloc_task_deque(kmp_info_t *thread,
 static int __kmp_realloc_task_threads_data(kmp_info_t *thread,
                                            kmp_task_team_t *task_team);
 static void __kmp_bottom_half_finish_proxy(kmp_int32 gtid, kmp_task_t *ptask);
-#if OMP_TASKGRAPH_EXPERIMENTAL
-static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id);
-int __kmp_taskloop_task(int gtid, void *ptask);
-#endif
 
 // returns 1 if new task is allowed to execute, 0 otherwise
 // checks Task Scheduling constraint (if requested) and
@@ -70,11 +66,7 @@ static bool __kmp_task_is_allowed(int gtid, const kmp_int32 is_constrained,
   }
   // Check mutexinoutset dependencies, acquire locks
   kmp_depnode_t *node = tasknew->td_depnode;
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  if (!tasknew->is_taskgraph && UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
-#else
   if (UNLIKELY(node && (node->dn.mtx_num_locks > 0))) {
-#endif
     for (int i = 0; i < node->dn.mtx_num_locks; ++i) {
       KMP_DEBUG_ASSERT(node->dn.mtx_locks[i] != NULL);
       if (__kmp_test_lock(node->dn.mtx_locks[i], gtid))
@@ -665,33 +657,12 @@ static void __kmp_free_task(kmp_int32 gtid, kmp_taskdata_t *taskdata,
   task->data2.priority = 0;
 
   taskdata->td_flags.freed = 1;
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  // do not free tasks in taskgraph
-  if (!taskdata->is_taskgraph) {
-#endif
 // deallocate the taskdata and shared variable blocks associated with this task
 #if USE_FAST_MEMORY
   __kmp_fast_free(thread, taskdata);
 #else /* ! USE_FAST_MEMORY */
   __kmp_thread_free(thread, taskdata);
 #endif
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  } else {
-    taskdata->td_flags.complete = 0;
-    taskdata->td_flags.started = 0;
-    taskdata->td_flags.freed = 0;
-    taskdata->td_flags.executing = 0;
-    taskdata->td_flags.task_serial =
-        (taskdata->td_parent->td_flags.final ||
-          taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
-
-    // taskdata->td_allow_completion_event.pending_events_count = 1;
-    KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
-    KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
-    // start at one because counts current task and children
-    KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
-  }
-#endif
 
   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
 }
@@ -779,10 +750,6 @@ static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
         flags.detachable == TASK_DETACHABLE || flags.hidden_helper;
   ret = ret ||
         KMP_ATOMIC_LD_ACQ(&taskdata->td_parent->td_incomplete_child_tasks) > 0;
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  if (taskdata->td_taskgroup && taskdata->is_taskgraph)
-    ret = ret || KMP_ATOMIC_LD_ACQ(&taskdata->td_taskgroup->count) > 0;
-#endif
   return ret;
 }
 
@@ -802,10 +769,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_task_team_t *task_team =
       thread->th.th_task_team; // might be NULL for serial teams...
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  // to avoid seg fault when we need to access taskdata->td_flags after free when using vanilla taskloop
-  bool is_taskgraph;
-#endif
 #if KMP_DEBUG
   kmp_int32 children = 0;
 #endif
@@ -815,10 +778,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
 
   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  is_taskgraph = taskdata->is_taskgraph;
-#endif
-
   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
     // untied task needs to check the counter so that the task structure is not
     // freed prematurely
@@ -923,9 +882,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
 
   if (completed) {
     taskdata->td_flags.complete = 1; // mark the task as completed
-#if OMP_TASKGRAPH_EXPERIMENTAL
-    taskdata->td_flags.onced = 1; // mark the task as ran once already
-#endif
 
 #if OMPT_SUPPORT
     // This is not a detached task, we are done here
@@ -942,11 +898,7 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
 #endif
           KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
       KMP_DEBUG_ASSERT(children >= 0);
-#if OMP_TASKGRAPH_EXPERIMENTAL
-      if (taskdata->td_taskgroup && !taskdata->is_taskgraph)
-#else
       if (taskdata->td_taskgroup)
-#endif
         KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
     } else if (task_team && (task_team->tt.tt_found_proxy_tasks ||
                              task_team->tt.tt_hidden_helper_task_encountered)) {
@@ -985,19 +937,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
   // KMP_DEBUG_ASSERT( resumed_task->td_flags.executing == 0 );
   resumed_task->td_flags.executing = 1; // resume previous task
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  if (is_taskgraph && __kmp_track_children_task(taskdata) &&
-      taskdata->td_taskgroup) {
-    // TDG: we only release taskgroup barrier here because
-    // free_task_and_ancestors will call
-    // __kmp_free_task, which resets all task parameters such as
-    // taskdata->started, etc. If we release the barrier earlier, these
-    // parameters could be read before being reset. This is not an issue for
-    // non-TDG implementation because we never reuse a task(data) structure
-    KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
-  }
-#endif
-
   KA_TRACE(
       10, ("__kmp_task_finish(exit): T#%d finished task %p, resuming task %p\n",
            gtid, taskdata, resumed_task));
@@ -1113,9 +1052,6 @@ void __kmp_init_implicit_task(ident_t *loc_ref, kmp_info_t *this_thr,
   task->td_flags.executing = 1;
   task->td_flags.complete = 0;
   task->td_flags.freed = 0;
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  task->td_flags.onced = 0;
-#endif
 
   task->td_depnode = NULL;
   task->td_last_tied = task;
@@ -1159,9 +1095,6 @@ void __kmp_finish_implicit_task(kmp_info_t *thread) {
   if (task->td_dephash) {
     int children;
     task->td_flags.complete = 1;
-#if OMP_TASKGRAPH_EXPERIMENTAL
-    task->td_flags.onced = 1;
-#endif
     children = KMP_ATOMIC_LD_ACQ(&task->td_incomplete_child_tasks);
     kmp_tasking_flags_t flags_old = task->td_flags;
     if (children == 0 && flags_old.complete == 1) {
@@ -1390,11 +1323,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   taskdata->td_flags.executing = 0;
   taskdata->td_flags.complete = 0;
   taskdata->td_flags.freed = 0;
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  taskdata->td_flags.onced = 0;
-  taskdata->is_taskgraph = 0;
-  taskdata->tdg = nullptr;
-#endif
   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
   // start at one because counts current task and children
   KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
@@ -1430,16 +1358,6 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
     }
   }
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
-  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status) &&
-      (task_entry != (kmp_routine_entry_t)__kmp_taskloop_task)) {
-    taskdata->is_taskgraph = 1;
-    taskdata->tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
-    taskdata->td_task_id = KMP_GEN_TASK_ID();
-    taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
-  }
-#endif
   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
                 gtid, taskdata, taskdata->td_parent));
 
@@ -1807,53 +1725,6 @@ kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                          bool serialize_immediate) {
   kmp_taskdata_t *new_taskdata = KMP_TASK_TO_TASKDATA(new_task);
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  if (new_taskdata->is_taskgraph &&
-      __kmp_tdg_is_recording(new_taskdata->tdg->tdg_status)) {
-    kmp_tdg_info_t *tdg = new_taskdata->tdg;
-    // extend the record_map if needed
-    if (new_taskdata->td_tdg_task_id >= new_taskdata->tdg->map_size) {
-      __kmp_acquire_bootstrap_lock(&tdg->graph_lock);
-      // map_size could have been updated by another thread if recursive
-      // taskloop
-      if (new_taskdata->td_tdg_task_id >= tdg->map_size) {
-        kmp_uint old_size = tdg->map_size;
-        kmp_uint new_size = old_size * 2;
-        kmp_node_info_t *old_record = tdg->record_map;
-        kmp_node_info_t *new_record = (kmp_node_info_t *)__kmp_allocate(
-            new_size * sizeof(kmp_node_info_t));
-
-        KMP_MEMCPY(new_record, old_record, old_size * sizeof(kmp_node_info_t));
-        tdg->record_map = new_record;
-
-        __kmp_free(old_record);
-
-        for (kmp_uint i = old_size; i < new_size; i++) {
-          kmp_int32 *successorsList = (kmp_int32 *)__kmp_allocate(
-              __kmp_successors_size * sizeof(kmp_int32));
-          new_record[i].task = nullptr;
-          new_record[i].successors = successorsList;
-          new_record[i].nsuccessors = 0;
-          new_record[i].npredecessors = 0;
-          new_record[i].successors_size = __kmp_successors_size;
-          KMP_ATOMIC_ST_REL(&new_record[i].npredecessors_counter, 0);
-        }
-        // update the size at the end, so that we avoid other
-        // threads use old_record while map_size is already updated
-        tdg->map_size = new_size;
-      }
-      __kmp_release_bootstrap_lock(&tdg->graph_lock);
-    }
-    // record a task
-    if (tdg->record_map[new_taskdata->td_tdg_task_id].task == nullptr) {
-      tdg->record_map[new_taskdata->td_tdg_task_id].task = new_task;
-      tdg->record_map[new_taskdata->td_tdg_task_id].parent_task =
-          new_taskdata->td_parent;
-      KMP_ATOMIC_INC(&tdg->num_tasks);
-    }
-  }
-#endif
-
   /* Should we execute the new task or queue it? For now, let's just always try
      to queue it.  If the queue fills up, then we'll execute it.  */
   if (new_taskdata->td_flags.proxy == TASK_PROXY ||
@@ -2376,17 +2247,6 @@ the reduction either does not use omp_orig object, or the omp_orig is accessible
 without help of the runtime library.
 */
 void *__kmpc_task_reduction_init(int gtid, int num, void *data) {
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
-  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
-    kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
-    this_tdg->rec_taskred_data =
-        __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
-    this_tdg->rec_num_taskred = num;
-    KMP_MEMCPY(this_tdg->rec_taskred_data, data,
-               sizeof(kmp_task_red_input_t) * num);
-  }
-#endif
   return __kmp_task_reduction_init(gtid, num, (kmp_task_red_input_t *)data);
 }
 
@@ -2403,17 +2263,6 @@ Note: this entry supposes the optional compiler-generated initializer routine
 has two parameters, pointer to object to be initialized and pointer to omp_orig
 */
 void *__kmpc_taskred_init(int gtid, int num, void *data) {
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  kmp_tdg_info_t *tdg = __kmp_find_tdg(__kmp_curr_tdg_idx);
-  if (tdg && __kmp_tdg_is_recording(tdg->tdg_status)) {
-    kmp_tdg_info_t *this_tdg = __kmp_global_tdgs[__kmp_curr_tdg_idx];
-    this_tdg->rec_taskred_data =
-        __kmp_allocate(sizeof(kmp_task_red_input_t) * num);
-    this_tdg->rec_num_taskred = num;
-    KMP_MEMCPY(this_tdg->rec_taskred_data, data,
-               sizeof(kmp_task_red_input_t) * num);
-  }
-#endif
   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
 }
 
@@ -2460,18 +2309,6 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
   kmp_int32 num;
   kmp_int32 tid = thread->th.th_info.ds.ds_tid;
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  if ((thread->th.th_current_task->is_taskgraph) &&
-      (!__kmp_tdg_is_recording(
-          __kmp_global_tdgs[__kmp_curr_tdg_idx]->tdg_status))) {
-    tg = thread->th.th_current_task->td_taskgroup;
-    KMP_ASSERT(tg != NULL);
-    KMP_ASSERT(tg->reduce_data != NULL);
-    arr = (kmp_taskred_data_t *)(tg->reduce_data);
-    num = tg->reduce_num_data;
-  }
-#endif
-
   KMP_ASSERT(data != NULL);
   while (tg != NULL) {
     arr = (kmp_taskred_data_t *)(tg->reduce_data);
@@ -4238,9 +4075,6 @@ static void __kmp_first_top_half_finish_proxy(kmp_taskdata_t *taskdata) {
   KMP_DEBUG_ASSERT(taskdata->td_flags.freed == 0);
 
   taskdata->td_flags.complete = 1; // mark the task as completed
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  taskdata->td_flags.onced = 1;
-#endif
 
   if (taskdata->td_taskgroup)
     KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
@@ -4442,12 +4276,7 @@ void __kmp_fulfill_event(kmp_event_t *event) {
 // taskloop_recur: used only when dealing with taskgraph,
 //      indicating whether we need to update task->td_task_id
 // returns:  a pointer to the allocated kmp_task_t structure (task).
-kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
-#if OMP_TASKGRAPH_EXPERIMENTAL
-                                 ,
-                                 int taskloop_recur
-#endif
-) {
+kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
   kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
@@ -4475,11 +4304,6 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
   task = KMP_TASKDATA_TO_TASK(taskdata);
 
   // Initialize new task (only specific fields not affected by memcpy)
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  if (taskdata->is_taskgraph && !taskloop_recur &&
-      __kmp_tdg_is_recording(taskdata_src->tdg->tdg_status))
-    taskdata->td_tdg_task_id = KMP_ATOMIC_INC(&__kmp_tdg_task_id);
-#endif
   taskdata->td_task_id = KMP_GEN_TASK_ID();
   if (task->shareds != NULL) { // need setup shareds pointer
     shareds_offset = (char *)task_src->shareds - (char *)taskdata_src;
@@ -4708,11 +4532,7 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
       }
     }
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-    next_task = __kmp_task_dup_alloc(thread, task, /* taskloop_recur */ 0);
-#else
     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
-#endif
 
     kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
     kmp_taskloop_bounds_t next_task_bounds =
@@ -4910,12 +4730,7 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
   lb1 = ub0 + st;
 
   // create pattern task for 2nd half of the loop
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  next_task = __kmp_task_dup_alloc(thread, task,
-                                   /* taskloop_recur */ 1);
-#else
   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
-#endif
   // adjust lower bound (upper bound is not changed) for the 2nd half
   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
   if (ptask_dup != NULL) // construct firstprivates, etc.
@@ -4948,12 +4763,6 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
   p->codeptr_ra = codeptr_ra;
 #endif
 
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
-  new_task_data->tdg = taskdata->tdg;
-  new_task_data->is_taskgraph = 0;
-#endif
-
 #if OMPT_SUPPORT
   // schedule new task with correct return address for OMPT events
   __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
@@ -4992,10 +4801,6 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
 #endif
     __kmpc_taskgroup(loc, gtid);
   }
-
-#if OMP_TASKGRAPH_EXPERIMENTAL
-  KMP_ATOMIC_DEC(&__kmp_tdg_task_id);
-#endif
   // =========================================================================
   // calculate loop parameters
   kmp_taskloop_bounds_t task_bounds(task, lb, ub);
@@ -5243,257 +5048,3 @@ bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
 
   return taskdata->td_task_team != NULL;
 }
-
-#if OMP_TASKGRAPH_EXPERIMENTAL
-// __kmp_find_tdg: identify a TDG through its ID
-// tdg_id: ID of the TDG
-// returns: If a TDG corresponding to this ID is found and not
-// its initial state, return the pointer to it, otherwise nullptr
-static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) {
-  kmp_tdg_info_t *res = nullptr;
-  if (__kmp_max_tdgs == 0)
-    return res;
-
-  if (__kmp_global_tdgs == NULL)
-    __kmp_global_tdgs = (kmp_tdg_info_t **)__kmp_allocate(
-        sizeof(kmp_tdg_info_t *) * __kmp_max_tdgs);
-
-  if ((__kmp_global_tdgs[tdg_id]) &&
-      (__kmp_global_tdgs[tdg_id]->tdg_status != KMP_TDG_NONE))
-    res = __kmp_global_tdgs[tdg_id];
-  return res;
-}
-
-// __kmp_print_tdg_dot: prints the TDG to a dot file
-// tdg:    ID of the TDG
-// gtid:   Global Thread ID
-void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) {
-  kmp_int32 tdg_id = tdg->tdg_id;
-  KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id));
-
-  char file_name[20];
-  sprintf(file_name, "tdg_%d.dot", tdg_id);
-  kmp_safe_raii_file_t tdg_file(file_name, "w");
-
-  kmp_int32 num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
-  fprintf(tdg_file,
-          "digraph TDG {\n"
-          "   compound=true\n"
-          "   subgraph cluster {\n"
-          "      label=TDG_%d\n",
-          tdg_id);
-  for (kmp_int32 i = 0; i < num_tasks; i++) {
-    fprintf(tdg_file, "      %d[style=bold]\n", i);
-  }
-  fprintf(tdg_file, "   }\n");
-  for (kmp_int32 i = 0; i < num_tasks; i++) {
-    kmp_int32 nsuccessors = tdg->record_map[i].nsuccessors;
-    kmp_int32 *successors = tdg->record_map[i].successors;
-    if (nsuccessors > 0) {
-      for (kmp_int32 j = 0; j < nsuccessors; j++)
-        fprintf(tdg_file, "   %d -> %d \n", i, successors[j]);
-    }
-  }
-  fprintf(tdg_file, "}");
-  KA_TRACE(10, ("__kmp_print_tdg_dot(exit): T#%d tdg_id=%d \n", gtid, tdg_id));
-}
-
-// __kmp_exec_tdg: launch the execution of a previous
-// recorded TDG
-// gtid:   Global Thread ID
-// tdg:    ID of the TDG
-void __kmp_exec_tdg(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
-  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_READY);
-  KA_TRACE(10, ("__kmp_exec_tdg(enter): T#%d tdg_id=%d num_roots=%d\n", gtid,
-                tdg->tdg_id, tdg->num_roots));
-  kmp_node_info_t *this_record_map = tdg->record_map;
-  kmp_int32 *this_root_tasks = tdg->root_tasks;
-  kmp_int32 this_num_roots = tdg->num_roots;
-  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
-
-  kmp_info_t *thread = __kmp_threads[gtid];
-  kmp_taskdata_t *parent_task = thread->th.th_current_task;
-
-  if (tdg->rec_taskred_data) {
-    __kmpc_taskred_init(gtid, tdg->rec_num_taskred, tdg->rec_taskred_data);
-  }
-
-  for (kmp_int32 j = 0; j < this_num_tasks; j++) {
-    kmp_taskdata_t *td = KMP_TASK_TO_TASKDATA(this_record_map[j].task);
-
-    td->td_parent = parent_task;
-    this_record_map[j].parent_task = parent_task;
-
-    kmp_taskgroup_t *parent_taskgroup =
-        this_record_map[j].parent_task->td_taskgroup;
-
-    KMP_ATOMIC_ST_RLX(&this_record_map[j].npredecessors_counter,
-                      this_record_map[j].npredecessors);
-    KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_incomplete_child_tasks);
-
-    if (parent_taskgroup) {
-      KMP_ATOMIC_INC(&parent_taskgroup->count);
-      // The taskgroup is different so we must update it
-      td->td_taskgroup = parent_taskgroup;
-    } else if (td->td_taskgroup != nullptr) {
-      // If the parent doesnt have a taskgroup, remove it from the task
-      td->td_taskgroup = nullptr;
-    }
-    if (this_record_map[j].parent_task->td_flags.tasktype == TASK_EXPLICIT)
-      KMP_ATOMIC_INC(&this_record_map[j].parent_task->td_allocated_child_tasks);
-  }
-
-  for (kmp_int32 j = 0; j < this_num_roots; ++j) {
-    __kmp_omp_task(gtid, this_record_map[this_root_tasks[j]].task, true);
-  }
-  KA_TRACE(10, ("__kmp_exec_tdg(exit): T#%d tdg_id=%d num_roots=%d\n", gtid,
-                tdg->tdg_id, tdg->num_roots));
-}
-
-// __kmp_start_record: set up a TDG structure and turn the
-// recording flag to true
-// gtid:        Global Thread ID of the encountering thread
-// input_flags: Flags associated with the TDG
-// tdg_id:      ID of the TDG to record
-static inline void __kmp_start_record(kmp_int32 gtid,
-                                      kmp_taskgraph_flags_t *flags,
-                                      kmp_int32 tdg_id) {
-  kmp_tdg_info_t *tdg =
-      (kmp_tdg_info_t *)__kmp_allocate(sizeof(kmp_tdg_info_t));
-  __kmp_global_tdgs[__kmp_curr_tdg_idx] = tdg;
-  // Initializing the TDG structure
-  tdg->tdg_id = tdg_id;
-  tdg->map_size = INIT_MAPSIZE;
-  tdg->num_roots = -1;
-  tdg->root_tasks = nullptr;
-  tdg->tdg_status = KMP_TDG_RECORDING;
-  tdg->rec_num_taskred = 0;
-  tdg->rec_taskred_data = nullptr;
-  KMP_ATOMIC_ST_RLX(&tdg->num_tasks, 0);
-
-  // Initializing the list of nodes in this TDG
-  kmp_node_info_t *this_record_map =
-      (kmp_node_info_t *)__kmp_allocate(INIT_MAPSIZE * sizeof(kmp_node_info_t));
-  for (kmp_int32 i = 0; i < INIT_MAPSIZE; i++) {
-    kmp_int32 *successorsList =
-        (kmp_int32 *)__kmp_allocate(__kmp_successors_size * sizeof(kmp_int32));
-    this_record_map[i].task = nullptr;
-    this_record_map[i].successors = successorsList;
-    this_record_map[i].nsuccessors = 0;
-    this_record_map[i].npredecessors = 0;
-    this_record_map[i].successors_size = __kmp_successors_size;
-    KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter, 0);
-  }
-
-  __kmp_global_tdgs[__kmp_curr_tdg_idx]->record_map = this_record_map;
-}
-
-// __kmpc_start_record_task: Wrapper around __kmp_start_record to mark
-// the beginning of the record process of a task region
-// loc_ref:     Location of TDG, not used yet
-// gtid:        Global Thread ID of the encountering thread
-// input_flags: Flags associated with the TDG
-// tdg_id:      ID of the TDG to record, for now, incremental integer
-// returns:     1 if we record, otherwise, 0
-kmp_int32 __kmpc_start_record_task(ident_t *loc_ref, kmp_int32 gtid,
-                                   kmp_int32 input_flags, kmp_int32 tdg_id) {
-
-  kmp_int32 res;
-  kmp_taskgraph_flags_t *flags = (kmp_taskgraph_flags_t *)&input_flags;
-  KA_TRACE(10,
-           ("__kmpc_start_record_task(enter): T#%d loc=%p flags=%d tdg_id=%d\n",
-            gtid, loc_ref, input_flags, tdg_id));
-
-  if (__kmp_max_tdgs == 0) {
-    KA_TRACE(
-        10,
-        ("__kmpc_start_record_task(abandon): T#%d loc=%p flags=%d tdg_id = %d, "
-         "__kmp_max_tdgs = 0\n",
-         gtid, loc_ref, input_flags, tdg_id));
-    return 1;
-  }
-
-  __kmpc_taskgroup(loc_ref, gtid);
-  if (kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id)) {
-    // TODO: use re_record flag
-    __kmp_exec_tdg(gtid, tdg);
-    res = 0;
-  } else {
-    __kmp_curr_tdg_idx = tdg_id;
-    KMP_DEBUG_ASSERT(__kmp_curr_tdg_idx < __kmp_max_tdgs);
-    __kmp_start_record(gtid, flags, tdg_id);
-    __kmp_num_tdg++;
-    res = 1;
-  }
-  KA_TRACE(10, ("__kmpc_start_record_task(exit): T#%d TDG %d starts to %s\n",
-                gtid, tdg_id, res ? "record" : "execute"));
-  return res;
-}
-
-// __kmp_end_record: set up a TDG after recording it
-// gtid:   Global thread ID
-// tdg:    Pointer to the TDG
-void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) {
-  // Store roots
-  kmp_node_info_t *this_record_map = tdg->record_map;
-  kmp_int32 this_num_tasks = KMP_ATOMIC_LD_RLX(&tdg->num_tasks);
-  kmp_int32 *this_root_tasks =
-      (kmp_int32 *)__kmp_allocate(this_num_tasks * sizeof(kmp_int32));
-  kmp_int32 this_map_size = tdg->map_size;
-  kmp_int32 this_num_roots = 0;
-  kmp_info_t *thread = __kmp_threads[gtid];
-
-  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
-    if (this_record_map[i].npredecessors == 0) {
-      this_root_tasks[this_num_roots++] = i;
-    }
-  }
-
-  // Update with roots info and mapsize
-  tdg->map_size = this_map_size;
-  tdg->num_roots = this_num_roots;
-  tdg->root_tasks = this_root_tasks;
-  KMP_DEBUG_ASSERT(tdg->tdg_status == KMP_TDG_RECORDING);
-  tdg->tdg_status = KMP_TDG_READY;
-
-  if (thread->th.th_current_task->td_dephash) {
-    __kmp_dephash_free(thread, thread->th.th_current_task->td_dephash);
-    thread->th.th_current_task->td_dephash = NULL;
-  }
-
-  // Reset predecessor counter
-  for (kmp_int32 i = 0; i < this_num_tasks; i++) {
-    KMP_ATOMIC_ST_RLX(&this_record_map[i].npredecessors_counter,
-                      this_record_map[i].npredecessors);
-  }
-  KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0);
-
-  if (__kmp_tdg_dot)
-    __kmp_print_tdg_dot(tdg, gtid);
-}
-
-// __kmpc_end_record_task: wrapper around __kmp_end_record to mark
-// the end of recording phase
-//
-// loc_ref:      Source location information
-// gtid:         Global thread ID
-// input_flags:  Flags attached to the graph
-// tdg_id:       ID of the TDG just finished recording
-void __kmpc_end_record_task(ident_t *loc_ref, kmp_int32 gtid,
-                            kmp_int32 input_flags, kmp_int32 tdg_id) {
-  kmp_tdg_info_t *tdg = __kmp_find_tdg(tdg_id);
-
-  KA_TRACE(10, ("__kmpc_end_record_task(enter): T#%d loc=%p finishes recording"
-                " tdg=%d with flags=%d\n",
-                gtid, loc_ref, tdg_id, input_flags));
-  if (__kmp_max_tdgs) {
-    // TODO: use input_flags->nowait
-    __kmpc_end_taskgroup(loc_ref, gtid);
-    if (__kmp_tdg_is_recording(tdg->tdg_status))
-      __kmp_end_record(gtid, tdg);
-  }
-  KA_TRACE(10, ("__kmpc_end_record_task(exit): T#%d loc=%p finished recording"
-                " tdg=%d, its status is now READY\n",
-                gtid, loc_ref, tdg_id));
-}
-#endif
diff --git a/openmp/runtime/test/tasking/omp_record_replay.cpp b/openmp/runtime/test/tasking/omp_record_replay.cpp
deleted file mode 100644
index 4fea22e081da9..0000000000000
--- a/openmp/runtime/test/tasking/omp_record_replay.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// REQUIRES: omp_taskgraph_experimental
-// RUN: %libomp-cxx-compile-and-run
-#include <iostream>
-#include <cassert>
-#define NT 100
-
-// Compiler-generated code (emulation)
-typedef struct ident {
-    void* dummy;
-} ident_t;
-
-
-#ifdef __cplusplus
-extern "C" {
-  int __kmpc_global_thread_num(ident_t *);
-  int __kmpc_start_record_task(ident_t *, int, int, int);
-  void __kmpc_end_record_task(ident_t *, int, int , int);
-}
-#endif
-
-void func(int *num_exec) {
-  (*num_exec)++;
-}
-
-int main() {
-  int num_exec = 0;
-  int num_tasks = 0;
-  int x=0;
-  #pragma omp parallel
-  #pragma omp single
-  for (int iter = 0; iter < NT; ++iter) {
-    int gtid = __kmpc_global_thread_num(nullptr);
-    int res =  __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */0);
-    if (res) {
-      num_tasks++;
-      #pragma omp task 
-      func(&num_exec);
-    }
-    __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
-  }
-
-  assert(num_tasks==1);
-  assert(num_exec==NT);
-
-  std::cout << "Passed" << std::endl;
-  return 0;
-}
-// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_record_replay_deps.cpp b/openmp/runtime/test/tasking/omp_record_replay_deps.cpp
deleted file mode 100644
index 4c06ae3f7b273..0000000000000
--- a/openmp/runtime/test/tasking/omp_record_replay_deps.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-// REQUIRES: omp_taskgraph_experimental
-// RUN: %libomp-cxx-compile-and-run
-#include <iostream>
-#include <cassert>
-#define NT 100
-#define MULTIPLIER 100
-#define DECREMENT 5
-
-int val;
-// Compiler-generated code (emulation)
-typedef struct ident {
-    void* dummy;
-} ident_t;
-
-
-#ifdef __cplusplus
-extern "C" {
-  int __kmpc_global_thread_num(ident_t *);
-  int __kmpc_start_record_task(ident_t *, int, int, int);
-  void __kmpc_end_record_task(ident_t *, int, int, int);
-}
-#endif
-
-void sub() {
-  #pragma omp atomic
-  val -= DECREMENT;
-}
-
-void add() {
-  #pragma omp atomic
-  val += DECREMENT;
-}
-
-void mult() {
-  // no atomicity needed, can only be executed by 1 thread
-  // and no concurrency with other tasks possible
-  val *= MULTIPLIER;
-}
-
-int main() {
-  val = 0;
-  int *x, *y;
-  #pragma omp parallel
-  #pragma omp single
-  for (int iter = 0; iter < NT; ++iter) {
-    int gtid = __kmpc_global_thread_num(nullptr);
-    int res =  __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
-    if (res) {
-      #pragma omp task depend(out:y)
-      add();
-      #pragma omp task depend(out:x)
-      sub();
-      #pragma omp task depend(in:x,y)
-      mult();
-    }
-    __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
-  }
-  assert(val==0);
-
-  std::cout << "Passed" << std::endl;
-  return 0;
-}
-// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_record_replay_deps_multi_succ.cpp b/openmp/runtime/test/tasking/omp_record_replay_deps_multi_succ.cpp
deleted file mode 100644
index 6bcd3dee56030..0000000000000
--- a/openmp/runtime/test/tasking/omp_record_replay_deps_multi_succ.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// REQUIRES: omp_taskgraph_experimental
-// RUN: %libomp-cxx-compile-and-run
-#include <omp.h>
-#include <cassert>
-#include <vector>
-
-constexpr const int TASKS_SIZE = 12;
-
-typedef struct ident ident_t;
-
-extern "C" {
-int __kmpc_global_thread_num(ident_t *);
-int __kmpc_start_record_task(ident_t *, int, int, int);
-void __kmpc_end_record_task(ident_t *, int, int, int);
-}
-
-void init(int &A, int val) { A = val; }
-
-void update(int &A, int &B, int val) { A = B + val; }
-
-void test(int nb, std::vector<std::vector<int>> &Ah) {
-#pragma omp parallel
-#pragma omp single
-  {
-    int gtid = __kmpc_global_thread_num(nullptr);
-    int res = __kmpc_start_record_task(nullptr, gtid, 0, 0);
-    if (res) {
-      for (int k = 0; k < nb; ++k) {
-#pragma omp task depend(inout : Ah[k][0])
-        init(Ah[k][0], k);
-
-        for (int i = 1; i < nb; ++i) {
-#pragma omp task depend(in : Ah[k][0]) depend(out : Ah[k][i])
-          update(Ah[k][i], Ah[k][0], 1);
-        }
-      }
-    }
-    __kmpc_end_record_task(nullptr, gtid, 0, 0);
-  }
-}
-
-int main() {
-  std::vector<std::vector<int>> matrix(TASKS_SIZE,
-                                       std::vector<int>(TASKS_SIZE, 0));
-
-  test(TASKS_SIZE, matrix);
-  test(TASKS_SIZE, matrix);
-
-  for (int k = 0; k < TASKS_SIZE; ++k) {
-    assert(matrix[k][0] == k);
-    for (int i = 1; i < TASKS_SIZE; ++i) {
-      assert(matrix[k][i] == k + 1);
-    }
-  }
-  return 0;
-}
diff --git a/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp b/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp
deleted file mode 100644
index 1864d5d89cc70..0000000000000
--- a/openmp/runtime/test/tasking/omp_record_replay_multiTDGs.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-// REQUIRES: omp_taskgraph_experimental
-// RUN: %libomp-cxx-compile-and-run
-#include <iostream>
-#include <cassert>
-#define NT 20
-#define MULTIPLIER 100
-#define DECREMENT 5
-
-// Compiler-generated code (emulation)
-typedef struct ident {
-    void* dummy;
-} ident_t;
-
-int val;
-#ifdef __cplusplus
-extern "C" {
-  int __kmpc_global_thread_num(ident_t *);
-  int __kmpc_start_record_task(ident_t *, int, int, int);
-  void __kmpc_end_record_task(ident_t *, int, int , int);
-}
-#endif
-
-void sub() {
-  #pragma omp atomic
-  val -= DECREMENT;
-}
-
-void add() {
-  #pragma omp atomic
-  val += DECREMENT;
-}
-
-void mult() {
-  // no atomicity needed, can only be executed by 1 thread
-  // and no concurrency with other tasks possible
-  val *= MULTIPLIER;
-}
-
-int main() {
-  int num_tasks = 0;
-  int *x, *y;
-  #pragma omp parallel
-  #pragma omp single
-  for (int iter = 0; iter < NT; ++iter) {
-    int gtid = __kmpc_global_thread_num(nullptr);
-    int res =  __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */0);
-    if (res) {
-      num_tasks++;
-      #pragma omp task depend(out:y)
-      add();
-      #pragma omp task depend(out:x)
-      sub();
-      #pragma omp task depend(in:x,y)
-      mult();
-    }
-    __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */0);
-    res =  __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */1);
-    if (res) {
-      num_tasks++;
-      #pragma omp task depend(out:y)
-      add();
-      #pragma omp task depend(out:x)
-      sub();
-      #pragma omp task depend(in:x,y)
-      mult();
-    }
-    __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0, /* tdg_id */1);
-  }
-
-  assert(num_tasks==2);
-  assert(val==0);
-
-  std::cout << "Passed" << std::endl;
-  return 0;
-}
-// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_record_replay_print_dot.cpp b/openmp/runtime/test/tasking/omp_record_replay_print_dot.cpp
deleted file mode 100644
index 7f1f5ccd77d37..0000000000000
--- a/openmp/runtime/test/tasking/omp_record_replay_print_dot.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-// REQUIRES: omp_taskgraph_experimental
-// RUN: %libomp-cxx-compile-and-run
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <cassert>
-
-// Compiler-generated code (emulation)
-typedef struct ident {
-    void* dummy;
-} ident_t;
-
-#ifdef __cplusplus
-extern "C" {
-  int __kmpc_global_thread_num(ident_t *);
-  int __kmpc_start_record_task(ident_t *, int, int, int);
-  void __kmpc_end_record_task(ident_t *, int, int , int);
-}
-#endif
-
-void func(int *num_exec) {
-  #pragma omp atomic
-  (*num_exec)++;
-}
-
-std::string tdg_string= "digraph TDG {\n"
-"   compound=true\n"
-"   subgraph cluster {\n"
-"      label=TDG_0\n"
-"      0[style=bold]\n"
-"      1[style=bold]\n"
-"      2[style=bold]\n"
-"      3[style=bold]\n"
-"   }\n"
-"   0 -> 1 \n"
-"   1 -> 2 \n"
-"   1 -> 3 \n"
-"}";
-
-int main() {
-  int num_exec = 0;
-  int x, y;
-
-  setenv("KMP_TDG_DOT","TRUE",1);
-  remove("tdg_0.dot");
-
-  #pragma omp parallel
-  #pragma omp single
-  {
-    int gtid = __kmpc_global_thread_num(nullptr);
-    int res = __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */ 0);
-    if (res) {
-      #pragma omp task depend(out : x)
-      func(&num_exec);
-      #pragma omp task depend(in : x) depend(out : y)
-      func(&num_exec);
-      #pragma omp task depend(in : y)
-      func(&num_exec);
-      #pragma omp task depend(in : y)
-      func(&num_exec);
-    }
-
-    __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */ 0, /* tdg_id */ 0);
-  }
-
-  assert(num_exec == 4);
-
-  std::ifstream tdg_file("tdg_0.dot");
-  assert(tdg_file.is_open());
-
-  std::stringstream tdg_file_stream;
-  tdg_file_stream << tdg_file.rdbuf();
-  int equal = tdg_string.compare(tdg_file_stream.str());
-
-  assert(equal == 0);
-
-  std::cout << "Passed" << std::endl;
-  return 0;
-}
-// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp b/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp
deleted file mode 100644
index 163a1b4192d85..0000000000000
--- a/openmp/runtime/test/tasking/omp_record_replay_taskloop.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-// REQUIRES: omp_taskgraph_experimental
-// RUN: %libomp-cxx-compile-and-run
-#include <iostream>
-#include <cassert>
-
-#define NT 20
-#define N 128*128
-
-typedef struct ident {
-    void* dummy;
-} ident_t;
-
-
-#ifdef __cplusplus
-extern "C" {
-  int __kmpc_global_thread_num(ident_t *);
-  int __kmpc_start_record_task(ident_t *, int, int, int);
-  void __kmpc_end_record_task(ident_t *, int, int , int);
-}
-#endif
-
-int main() {
-  int num_tasks = 0;
-
-  int array[N];
-  for (int i = 0; i < N; ++i)
-    array[i] = 1;
-
-  long sum = 0;
-  #pragma omp parallel
-  #pragma omp single
-  for (int iter = 0; iter < NT; ++iter) {
-    int gtid = __kmpc_global_thread_num(nullptr);
-    int res =  __kmpc_start_record_task(nullptr, gtid, /* kmp_tdg_flags */0,  /* tdg_id */0);
-    if (res) {
-      num_tasks++;
-      #pragma omp taskloop reduction(+:sum) num_tasks(4096)
-      for (int i = 0; i < N; ++i) {
-        sum += array[i];
-      }
-    }
-    __kmpc_end_record_task(nullptr, gtid, /* kmp_tdg_flags */0,  /* tdg_id */0);
-  }
-  assert(sum==N*NT);
-  assert(num_tasks==1);
-
-  std::cout << "Passed" << std::endl;
-  return 0;
-}
-// CHECK: Passed

>From df9e956ae4014decc6c4156b739df8b8d7d38375 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Fri, 27 Mar 2026 11:56:28 -0500
Subject: [PATCH 02/24] [OpenMP] New taskgraph runtime implementation

This patch contains the bulk of the new runtime support for taskgraph
record/replay.

Key points are as follows.

The task/taskdata structures and the dependencies between them are
duplicated whilst recording a taskgraph, keeping the existing runtime
dependency handling unaffected by the taskgraph implementation --
e.g. during runtime execution, it is valid for output dependencies to
be dropped as soon as the producing task completes. This separation
is intended to eliminate a class of race conditions, where tasks which
complete unpredictably might or might not be marked as having a subsequent
task depending on them.

The dependencies between tasks in a taskgraph are processed by static
analysis: the high-level process is akin to turning data dependencies
between tasks into control-flow dependencies. This is done by building
a set of successors and predecessors for each recorded task, then
decomposing the resulting DAG into parallel and sequential regions. In the
(presumed relatively unlikely, in real-world code) case that the graph
is irreducible, a further set of analyses and transformations is done,
and the parallel-sequential decomposition is run again.

The output of this process is a set of nested kmp_taskgraph_region
structures -- parallel or sequential (with some number of children),
or nodes representing a single task. The two phases alternate until we
obtain a single, top-level region.

Replaying a taskgraph processed in this way on the CPU involves another
set of linked structures, of type kmp_taskgraph_exec_descr. These form
a kind of trace of a traversal over the kmp_taskgraph_region structure,
so that a pointer to a kmp_taskgraph_exec_descr is somewhat equivalent
to a "program counter".

Recorded taskgraphs are located directly by using a handle passed in
from the user's compiled program, rather than using a linked list or
hashtable to find taskgraph records to replay keyed by an index.

commit-id:47e383a2

Pull Request: https://github.com/llvm/llvm-project/pull/194047
---
 openmp/runtime/src/kmp.h            |  199 +-
 openmp/runtime/src/kmp_debug.h      |   24 +
 openmp/runtime/src/kmp_global.cpp   |    3 +
 openmp/runtime/src/kmp_settings.cpp |   12 +
 openmp/runtime/src/kmp_taskdeps.cpp | 2990 ++++++++++++++++++++++++++-
 openmp/runtime/src/kmp_taskdeps.h   |   27 +-
 openmp/runtime/src/kmp_tasking.cpp  | 1038 +++++++++-
 7 files changed, 4146 insertions(+), 147 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index df98f03c7dcdc..8a735b87619f4 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2510,6 +2510,13 @@ typedef struct kmp_task { /* GEH: Shouldn't this be aligned somehow? */
 @}
 */
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+typedef struct kmp_taskgraph_reduce_input_data {
+  void *reduce_data;
+  kmp_int32 reduce_num_data;
+} kmp_taskgraph_reduce_input_data_t;
+#endif
+
 typedef struct kmp_taskgroup {
   std::atomic<kmp_int32> count; // number of allocated and incomplete tasks
   std::atomic<kmp_int32>
@@ -2519,6 +2526,17 @@ typedef struct kmp_taskgroup {
   void *reduce_data; // reduction related info
   kmp_int32 reduce_num_data; // number of data items to reduce
   uintptr_t *gomp_data; // gomp reduction data
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  struct {
+    // Points to taskgraph that tasks in this taskgroup are being recorded to.
+    std::atomic<struct kmp_taskgraph_record *> recording;
+    // Temporary holding place for input data for reductions for this taskgroup
+    // during taskgraph recording.  This is passed over to the first
+    // kmp_taskgraph_node we encounter inside the taskgroup.  We'll have to
+    // watch out for potential race conditions here.
+    kmp_taskgraph_reduce_input_data_t *reduce_input;
+  } taskgraph;
+#endif
 } kmp_taskgroup_t;
 
 // forward declarations
@@ -2570,6 +2588,14 @@ struct kmp_depnode_list {
 // Max number of mutexinoutset dependencies per node
 #define MAX_MTX_DEPS 4
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+struct kmp_taskgraph_node;
+struct kmp_taskgraph_region;
+struct kmp_taskgraph_record;
+
+struct kmp_bitset;
+#endif
+
 typedef struct kmp_base_depnode {
   kmp_depnode_list_t *successors; /* used under lock */
   kmp_task_t *task; /* non-NULL if depnode is active, used under lock */
@@ -2581,6 +2607,9 @@ typedef struct kmp_base_depnode {
 #endif
   std::atomic<kmp_int32> npredecessors;
   std::atomic<kmp_int32> nrefs;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  struct kmp_bitset *set_membership;
+#endif
 } kmp_base_depnode_t;
 
 union KMP_ALIGN_CACHE kmp_depnode {
@@ -2595,7 +2624,14 @@ struct kmp_dephash_entry {
   kmp_depnode_list_t *last_set;
   kmp_depnode_list_t *prev_set;
   kmp_uint8 last_flag;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  union {
+    kmp_lock_t *mtx_lock; /* is referenced by depnodes w/mutexinoutset dep */
+    kmp_int32 set_num;
+  };
+#else
   kmp_lock_t *mtx_lock; /* is referenced by depnodes w/mutexinoutset dep */
+#endif
   kmp_dephash_entry_t *next_in_bucket;
 };
 
@@ -2633,12 +2669,131 @@ typedef struct {
 
 #if OMP_TASKGRAPH_EXPERIMENTAL
 
-/// Represent a TDG's current status
-typedef enum kmp_tdg_status {
+/// Represent a task dependency graph's current status
+typedef enum kmp_taskgraph_status {
   KMP_TDG_NONE = 0,
   KMP_TDG_RECORDING = 1,
   KMP_TDG_READY = 2
-} kmp_tdg_status_t;
+} kmp_taskgraph_status_t;
+
+enum kmp_taskgraph_mark {
+  TASKGRAPH_UNMARKED,
+  TASKGRAPH_TEMP_MARK,
+  TASKGRAPH_PERMANENT_MARK,
+  TASKGRAPH_COMBINED,
+  TASKGRAPH_DELETED
+};
+
+typedef struct kmp_taskgraph_region_dep {
+  struct kmp_taskgraph_region *region;
+  struct kmp_taskgraph_region_dep *next;
+} kmp_taskgraph_region_dep_t;
+
+typedef struct kmp_taskgraph_node {
+  kmp_task_t *task;
+  bool taskloop_task;
+  kmp_taskgraph_reduce_input_data_t *reduce_input;
+  union {
+    // Valid when KMP_TDG_RECORDING in parent taskgraph record.
+    struct {
+      kmp_depend_info_t *dep_list;
+      kmp_int32 ndeps;
+      // This is a control dependency.  If not -1, it is the index of the
+      // taskgraph node which succeeds this one in an array of taskgraph nodes.
+      kmp_int32 cfg_successor;
+    } unresolved;
+
+    // Valid when KMP_TDG_READY in parent taskgraph record.
+    struct {
+      struct kmp_taskgraph_region *last_region;
+      kmp_int32 count;
+    } resolved;
+  } u;
+} kmp_taskgraph_node_t;
+
+enum kmp_taskgraph_region_type {
+  TASKGRAPH_REGION_ENTRY,
+  TASKGRAPH_REGION_EXIT,
+  TASKGRAPH_REGION_NODE,
+  TASKGRAPH_REGION_WAIT,
+  TASKGRAPH_REGION_PARALLEL,
+  TASKGRAPH_REGION_EXCLUSIVE,
+  TASKGRAPH_REGION_SEQUENTIAL,
+  TASKGRAPH_REGION_IRREDUCIBLE
+};
+
+typedef struct kmp_taskgraph_region {
+  struct kmp_taskgraph_record *owner;
+  // Initially, the lexical "next" region (which doesn't have to be a
+  // successor). Subsequently, a pointer to the next item in the worklist.
+  struct kmp_taskgraph_region *next;
+  // The parent region for this one.  Initially nullptr.
+  struct kmp_taskgraph_region *parent;
+  kmp_taskgraph_region_dep_t *successors;
+  kmp_taskgraph_region_dep_t *predecessors;
+  // Only valid while building the exec descr structure.  This could probably
+  // share storage with one of the other fields if we wanted to save space.
+  struct kmp_taskgraph_exec_descr *exec_descr;
+  // The next allocated block.
+  struct kmp_taskgraph_region *alloc_chain;
+  struct kmp_bitset *mutexset;
+  struct kmp_taskgraph_region *mutexset_parent;
+  // Pointer to reduction input data for the region.  We only expect to see
+  // this on TASKGRAPH_REGION_PARALLEL regions.
+  kmp_taskgraph_reduce_input_data_t *reduce_input;
+  enum kmp_taskgraph_region_type type;
+  enum kmp_taskgraph_mark mark;
+  kmp_int32 timestamp;
+  kmp_int32 level;
+  union {
+    struct {
+      kmp_taskgraph_node_t *node;
+      struct kmp_taskgraph_region *next_instance;
+    } task;
+    struct {
+      struct kmp_taskgraph_region **children;
+      kmp_int32 num_children;
+    } inner;
+  };
+} kmp_taskgraph_region_t;
+
+typedef struct kmp_taskgraph_record {
+  std::atomic<kmp_taskgraph_status_t> status = KMP_TDG_NONE;
+  kmp_int32 gtid = 0;
+  kmp_int32 graph_id = 0;
+  // A lock that protects the record_map and num_tasks fields from being
+  // modified by multiple threads.
+  // For now, we also use this whilst the taskgraph is being replayed.
+  // This should be replaced with an invocation counter when we implement
+  // concurrent replay of the taskgraph from different threads.
+  kmp_lock_t map_lock;
+  kmp_taskgraph_node_t *record_map = nullptr;
+  kmp_int32 num_tasks = 0;
+  kmp_int32 nodes_allocated = 0;
+  kmp_taskgraph_region_t *root;
+  kmp_taskgraph_region_t *alloc_root;
+  kmp_taskgraph_region_dep_t *recycled_deps;
+  kmp_int32 num_mutexes;
+  struct kmp_taskgraph_exec_descr *exec_descrs;
+  kmp_size_t exec_descr_size;
+  kmp_lock_t replay_lock;
+  // We need a taskgroup structure to keep track of recorded tasks.  This is
+  // set to TRUE if the user requested "nogroup" on the taskgraph directive
+  // (then we can avoid blocking at the end of the taskgraph region on replay,
+  // at least).
+  bool nogroup_taskgroup;
+  struct kmp_taskgraph_record *next = nullptr;
+} kmp_taskgraph_record_t;
+
+typedef struct kmp_taskgraph_exec_descr {
+  std::atomic<kmp_int32> npredecessors;
+  std::atomic<kmp_int32> nblocks;
+  kmp_taskgraph_region_t *region;
+  struct kmp_taskgraph_exec_descr *sibling;
+  struct kmp_taskgraph_exec_descr *predecessor_chain;
+  struct kmp_taskgraph_exec_descr *successor;
+  struct kmp_taskgraph_exec_descr *next_instance;
+} kmp_taskgraph_exec_descr_t;
 
 #endif
 
@@ -2752,6 +2907,9 @@ struct kmp_taskdata { /* aligned during dynamic allocation       */
   ompt_task_info_t ompt_task_info;
 #endif
 #if OMP_TASKGRAPH_EXPERIMENTAL
+  // Whether the task is within a task dependency graph.
+  struct kmp_taskgraph_record *owning_taskgraph = nullptr;
+  struct kmp_taskgraph_exec_descr *exec_descr = nullptr;
 #endif
   kmp_target_data_t td_target_data;
 }; // struct kmp_taskdata
@@ -3264,6 +3422,9 @@ extern int kmp_c_debug;
 extern int kmp_d_debug;
 extern int kmp_e_debug;
 extern int kmp_f_debug;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+extern int kmp_g_debug;
+#endif
 #endif /* KMP_DEBUG */
 
 /* For debug information logging using rotating buffer */
@@ -4259,6 +4420,17 @@ KMP_EXPORT void __kmpc_omp_taskwait_deps_51(ident_t *loc_ref, kmp_int32 gtid,
 extern kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
                                 bool serialize_immediate);
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+extern kmp_int32 __kmp_build_taskgraph(kmp_int32 gtid,
+                                       kmp_taskdata_t *current_taskdata,
+                                       kmp_taskgraph_record_t *taskgraph);
+
+extern void __kmp_replay_taskgraph(kmp_int32 gtid,
+                                   kmp_taskdata_t *current_taskdata,
+                                   kmp_taskgraph_record_t *taskgraph,
+                                   kmp_uint32 graph_id);
+#endif
+
 KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid,
                                    kmp_int32 cncl_kind);
 KMP_EXPORT kmp_int32 __kmpc_cancellationpoint(ident_t *loc_ref, kmp_int32 gtid,
@@ -4327,6 +4499,27 @@ KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
                                                 void **user_lock,
                                                 uintptr_t hint);
 #if OMP_TASKGRAPH_EXPERIMENTAL
+KMP_EXPORT void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
+                                 std::atomic<void *> *tdg_handle,
+                                 kmp_uint32 graph_id, kmp_int32 graph_reset,
+                                 kmp_int32 nogroup, void (*entry)(void *),
+                                 void *args);
+KMP_EXPORT kmp_uint32 __kmpc_taskgraph_task(
+    ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
+    size_t sizeof_kmp_task_t, void *shareds, size_t sizeof_shareds,
+    kmp_int32 ndeps, kmp_depend_info_t *dep_list);
+KMP_EXPORT kmp_uint32 __kmpc_taskgraph_taskloop(
+    ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
+    size_t sizeof_kmp_task_t, void *shareds, size_t sizeof_shareds,
+    kmp_int32 if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+    kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize,
+    kmp_int32 modifier, void *task_dup);
+KMP_EXPORT void __kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid,
+                                          kmp_int32 ndeps,
+                                          kmp_depend_info_t *dep_list,
+                                          kmp_int32 has_no_wait);
+KMP_EXPORT void *__kmpc_taskgraph_taskred_init(kmp_int32 gtid, kmp_int32 num,
+                                               void *data);
 #endif
 /* Interface to fast scalable reduce methods routines */
 
diff --git a/openmp/runtime/src/kmp_debug.h b/openmp/runtime/src/kmp_debug.h
index 08d52cc04a108..81b44cc071eaf 100644
--- a/openmp/runtime/src/kmp_debug.h
+++ b/openmp/runtime/src/kmp_debug.h
@@ -76,6 +76,9 @@ extern int kmp_c_debug;
 extern int kmp_d_debug;
 extern int kmp_e_debug;
 extern int kmp_f_debug;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+extern int kmp_g_debug;
+#endif
 extern int kmp_diag;
 
 #define KA_TRACE(d, x)                                                         \
@@ -102,6 +105,12 @@ extern int kmp_diag;
   if (kmp_f_debug >= d) {                                                      \
     __kmp_debug_printf x;                                                      \
   }
+#if OMP_TASKGRAPH_EXPERIMENTAL
+// Taskgraph trace macro: when kmp_g_debug is at least d, pass the
+// parenthesized argument list x to __kmp_debug_printf.
+#define KG_TRACE(d, x)                                                         \
+  if (kmp_g_debug >= d) {                                                      \
+    __kmp_debug_printf x;                                                      \
+  }
+#endif
 #define K_DIAG(d, x)                                                           \
   {                                                                            \
     if (kmp_diag == d) {                                                       \
@@ -151,6 +160,15 @@ extern int kmp_diag;
     (x);                                                                       \
     __kmp_enable(ks);                                                          \
   }
+#if OMP_TASKGRAPH_EXPERIMENTAL
+// Taskgraph dump macro: when kmp_g_debug is at least d, evaluate x bracketed
+// by __kmp_disable(&ks) / __kmp_enable(ks).
+#define KG_DUMP(d, x)                                                          \
+  if (kmp_g_debug >= d) {                                                      \
+    int ks;                                                                    \
+    __kmp_disable(&ks);                                                        \
+    (x);                                                                       \
+    __kmp_enable(ks);                                                          \
+  }
+#endif
 
 #else
 
@@ -160,6 +178,9 @@ extern int kmp_diag;
 #define KD_TRACE(d, x) /* nothing to do */
 #define KE_TRACE(d, x) /* nothing to do */
 #define KF_TRACE(d, x) /* nothing to do */
+#if OMP_TASKGRAPH_EXPERIMENTAL
+#define KG_TRACE(d, x) /* nothing to do */
+#endif
 #define K_DIAG(d, x)                                                           \
   {} /* nothing to do */
 
@@ -169,6 +190,9 @@ extern int kmp_diag;
 #define KD_DUMP(d, x) /* nothing to do */
 #define KE_DUMP(d, x) /* nothing to do */
 #define KF_DUMP(d, x) /* nothing to do */
+#if OMP_TASKGRAPH_EXPERIMENTAL
+#define KG_DUMP(d, x) /* nothing to do */
+#endif
 
 #endif // KMP_DEBUG
 
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp
index 3df46baa57544..c38d734b47c9b 100644
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -375,6 +375,9 @@ int kmp_c_debug = 0;
 int kmp_d_debug = 0;
 int kmp_e_debug = 0;
 int kmp_f_debug = 0;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+int kmp_g_debug = 0;
+#endif
 int kmp_diag = 0;
 #endif
 
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp
index 615058af9705f..23346ea397c65 100644
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -1557,6 +1557,9 @@ KMP_STG_X_DEBUG(c)
 KMP_STG_X_DEBUG(d)
 KMP_STG_X_DEBUG(e)
 KMP_STG_X_DEBUG(f)
+#if OMP_TASKGRAPH_EXPERIMENTAL
+KMP_STG_X_DEBUG(g)
+#endif
 
 #undef KMP_STG_X_DEBUG
 
@@ -1582,6 +1585,11 @@ static void __kmp_stg_parse_debug(char const *name, char const *value,
   if (kmp_f_debug < debug) {
     kmp_f_debug = debug;
   }
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  if (kmp_g_debug < debug) {
+    kmp_g_debug = debug;
+  }
+#endif
 } // __kmp_stg_parse_debug
 
 static void __kmp_stg_parse_debug_buf(char const *name, char const *value,
@@ -5568,6 +5576,10 @@ static kmp_setting_t __kmp_stg_table[] = {
      0},
     {"KMP_F_DEBUG", __kmp_stg_parse_f_debug, __kmp_stg_print_f_debug, NULL, 0,
      0},
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    {"KMP_G_DEBUG", __kmp_stg_parse_g_debug, __kmp_stg_print_g_debug, NULL, 0,
+     0},
+#endif
     {"KMP_DEBUG", __kmp_stg_parse_debug, NULL, /* no print */ NULL, 0, 0},
     {"KMP_DEBUG_BUF", __kmp_stg_parse_debug_buf, __kmp_stg_print_debug_buf,
      NULL, 0, 0},
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index 6884cd144f89e..3d24ff3f8b5f7 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -20,6 +20,13 @@
 #include "ompt-specific.h"
 #endif
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+#include <bit>
+#include <cstdlib>
+#include <algorithm>
+#include <cinttypes>
+#endif
+
 // TODO: Improve memory allocation? keep a list of pre-allocated structures?
 // allocate in blocks? re-use list finished list entries?
 // TODO: don't use atomic ref counters for stack-allocated nodes.
@@ -33,6 +40,14 @@
 static std::atomic<kmp_int32> kmp_node_id_seed = 0;
 #endif
 
+// Ad-hoc taskgraph debugging output.  Change the #undef below to a #define to
+// have TGDBG print to stderr; otherwise it expands to nothing.
+#undef DEBUG_TASKGRAPH
+
+// Uses the standard variadic-macro form (... / __VA_ARGS__) rather than the
+// GNU-only named form (ARGS...), so it is accepted by every supported
+// compiler.
+#ifdef DEBUG_TASKGRAPH
+#define TGDBG(...) fprintf(stderr, __VA_ARGS__)
+#else
+#define TGDBG(...)
+#endif
+
 static void __kmp_init_node(kmp_depnode_t *node, bool on_stack) {
   node->dn.successors = NULL;
   node->dn.task = NULL; // will point to the right task
@@ -49,6 +64,9 @@ static void __kmp_init_node(kmp_depnode_t *node, bool on_stack) {
 #ifdef KMP_SUPPORT_GRAPH_OUTPUT
   node->dn.id = KMP_ATOMIC_INC(&kmp_node_id_seed);
 #endif
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  node->dn.set_membership = nullptr;
+#endif
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
   __itt_sync_create(node, "OMP task dep node", NULL, 0);
 #endif
@@ -160,7 +178,12 @@ static kmp_dephash_t *__kmp_dephash_create(kmp_info_t *thread,
 
 static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread,
                                              kmp_dephash_t **hash,
-                                             kmp_intptr_t addr) {
+                                             kmp_intptr_t addr
+#if OMP_TASKGRAPH_EXPERIMENTAL
+                                             ,
+                                             bool taskgraph_p
+#endif
+) {
   kmp_dephash_t *h = *hash;
   if (h->nelements != 0 && h->nconflicts / h->size >= 1) {
     *hash = __kmp_dephash_extend(thread, h);
@@ -190,7 +213,12 @@ static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread,
     entry->last_set = NULL;
     entry->prev_set = NULL;
     entry->last_flag = 0;
-    entry->mtx_lock = NULL;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    if (taskgraph_p)
+      entry->set_num = -1;
+    else
+#endif
+      entry->mtx_lock = NULL;
     entry->next_in_bucket = h->buckets[bucket];
     h->buckets[bucket] = entry;
     h->nelements++;
@@ -200,6 +228,7 @@ static kmp_dephash_entry *__kmp_dephash_find(kmp_info_t *thread,
   return entry;
 }
 
+template <bool refcounting>
 static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
                                           kmp_depnode_list_t *list,
                                           kmp_depnode_t *node) {
@@ -213,7 +242,11 @@ static kmp_depnode_list_t *__kmp_add_node(kmp_info_t *thread,
       thread, sizeof(kmp_depnode_list_t));
 #endif
 
-  new_head->node = __kmp_node_ref(node);
+  if (refcounting) {
+    new_head->node = __kmp_node_ref(node);
+  } else {
+    new_head->node = node;
+  }
   new_head->next = list;
 
   return new_head;
@@ -275,7 +308,8 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
       if (dep->dn.task) {
         if (!dep->dn.successors || dep->dn.successors->node != node) {
           __kmp_track_dependence(gtid, dep, node, task);
-          dep->dn.successors = __kmp_add_node(thread, dep->dn.successors, node);
+          dep->dn.successors =
+              __kmp_add_node<true>(thread, dep->dn.successors, node);
           KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                         "%p\n",
                         gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
@@ -304,7 +338,8 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
     if (sink->dn.task) {
       if (!sink->dn.successors || sink->dn.successors->node != source) {
         __kmp_track_dependence(gtid, sink, source, task);
-        sink->dn.successors = __kmp_add_node(thread, sink->dn.successors, source);
+        sink->dn.successors =
+            __kmp_add_node<true>(thread, sink->dn.successors, source);
         KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                     "%p\n",
                     gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
@@ -317,21 +352,249 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
   return npredecessors;
 }
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+// Push \p region onto the front of the dependency list \p list and return the
+// new head.  A cell is taken from the \p recycled_deps free list when one is
+// available; otherwise a fresh one comes from __kmp_fast_allocate.
+kmp_taskgraph_region_dep_t *__kmp_region_deplist_add(
+    kmp_info_t *thread, kmp_taskgraph_region_dep_t **recycled_deps,
+    kmp_taskgraph_region_t *region, kmp_taskgraph_region_dep_t *list) {
+  kmp_taskgraph_region_dep_t *cell = *recycled_deps;
+  if (cell != nullptr) {
+    // Unlink the reused cell from the free list.
+    *recycled_deps = cell->next;
+  } else {
+    cell = (kmp_taskgraph_region_dep_t *)__kmp_fast_allocate(
+        thread, sizeof(kmp_taskgraph_region_dep_t));
+  }
+  cell->next = list;
+  cell->region = region;
+  return cell;
+}
+
+// Reverse, in place, the singly-linked list threaded through the regions'
+// "next" members and return the new head (nullptr for an empty list).
+kmp_taskgraph_region_t *
+__kmp_region_worklist_reverse(kmp_taskgraph_region_t *list) {
+  kmp_taskgraph_region_t *reversed = nullptr;
+  for (kmp_taskgraph_region_t *cur = list; cur != nullptr;) {
+    kmp_taskgraph_region_t *rest = cur->next;
+    cur->next = reversed;
+    reversed = cur;
+    cur = rest;
+  }
+  return reversed;
+}
+
+// Linear search of \p list for \p node.  Returns the node when some list
+// element refers to it, nullptr otherwise (including for an empty list).
+static kmp_depnode_t *__kmp_find_in_depnode_list(kmp_depnode_t *node,
+                                                 kmp_depnode_list_t *list) {
+  kmp_depnode_list_t *cursor = list;
+  while (cursor != nullptr) {
+    if (cursor->node == node)
+      return cursor->node;
+    cursor = cursor->next;
+  }
+  return nullptr;
+}
+
+// A trivial fixed-size bitset implementation.
+//
+// Bits are stored in 64-bit chunks.  __kmp_bitset_alloc places the chunk
+// array directly after this header in the same allocation.
+
+typedef struct kmp_bitset {
+  kmp_uint64 *bits; // chunk array; bit N lives in bits[N / 64]
+  kmp_size_t bitsize; // number of valid bits
+  kmp_size_t num_chunks; // number of 64-bit chunks in "bits"
+} kmp_bitset_t;
+
+// Allocate a zero-filled bitset able to hold \p bitsize bits.  The header and
+// its chunk array share a single __kmp_fast_allocate block, so the result is
+// released with one __kmp_bitset_free call.
+static kmp_bitset_t *__kmp_bitset_alloc(kmp_info_t *thread,
+                                        kmp_size_t bitsize) {
+  // Round bits up to whole bytes, then bytes up to whole 64-bit chunks.
+  const kmp_size_t nbytes = (bitsize + 7) / 8;
+  const kmp_size_t nchunks =
+      (nbytes + sizeof(kmp_uint64) - 1) / sizeof(kmp_uint64);
+  const kmp_size_t payload = sizeof(kmp_uint64) * nchunks;
+
+  kmp_bitset_t *result = (kmp_bitset_t *)__kmp_fast_allocate(
+      thread, sizeof(kmp_bitset_t) + payload);
+  result->num_chunks = nchunks;
+  result->bitsize = bitsize;
+  // The chunk storage starts immediately after the header.
+  result->bits = (kmp_uint64 *)&result[1];
+  memset(result->bits, 0, payload);
+  return result;
+}
+
+// Release a bitset with __kmp_fast_free.  The chunk array is part of the same
+// allocation as the header (see __kmp_bitset_alloc), so one free suffices.
+static void __kmp_bitset_free(kmp_info_t *thread, kmp_bitset_t *bitset) {
+  __kmp_fast_free(thread, bitset);
+}
+
+// Set bit \p bitnum.  Requests at or beyond the bitset's size are ignored.
+static void __kmp_bitset_set(kmp_bitset_t *bitset, kmp_size_t bitnum) {
+  if (bitnum >= bitset->bitsize)
+    return;
+  const kmp_size_t bits_per_chunk = 8 * sizeof(kmp_uint64);
+  const kmp_size_t index = bitnum / bits_per_chunk;
+  const kmp_uint64 mask = (kmp_uint64)1 << (bitnum % bits_per_chunk);
+  bitset->bits[index] |= mask;
+}
+
+// Clear every bit of \p bitset.  A null bitset is accepted and ignored.
+static void __kmp_bitset_clearall(kmp_bitset_t *bitset) {
+  // Size the memset from the chunk array's own element type so it cannot
+  // drift from the declaration of kmp_bitset_t::bits.
+  if (bitset)
+    memset(bitset->bits, 0, sizeof(*bitset->bits) * bitset->num_chunks);
+}
+
+// Set every valid bit of \p bitset.  Unused bits at the end of the last chunk
+// stay zero, which the comparison helpers rely on.
+static void __kmp_bitset_setall(kmp_bitset_t *bitset) {
+  // Nothing to do for an empty bitset; this also keeps "num_chunks - 1" below
+  // from wrapping around.
+  if (bitset->num_chunks == 0)
+    return;
+  const kmp_size_t last = bitset->num_chunks - 1;
+  for (kmp_size_t chunk = 0; chunk < last; chunk++)
+    bitset->bits[chunk] = ~(kmp_uint64)0;
+  // When bitsize is an exact multiple of 64 the last chunk is fully used and
+  // must be filled completely; otherwise only its low "tail_bits" bits are.
+  const kmp_size_t tail_bits = bitset->bitsize & 63;
+  if (tail_bits > 0)
+    bitset->bits[last] = ~((~(kmp_uint64)0) << tail_bits);
+  else
+    bitset->bits[last] = ~(kmp_uint64)0;
+}
+
+// Copy the contents of \p src into \p dst.  Both bitsets must have the same
+// bit size and chunk count (checked with assert).
+static void __kmp_bitset_copy(kmp_bitset_t *dst, const kmp_bitset_t *src) {
+  assert(dst->num_chunks == src->num_chunks);
+  assert(dst->bitsize == src->bitsize);
+  const kmp_size_t nbytes = sizeof(kmp_uint64) * dst->num_chunks;
+  memcpy(dst->bits, src->bits, nbytes);
+}
+
+/// Return TRUE if \c b is a subset of \c a.  A null \c b counts as the empty
+/// set and is therefore a subset of anything.
+
+static bool __kmp_bitset_subset_p(const kmp_bitset_t *a,
+                                  const kmp_bitset_t *b) {
+  if (b == nullptr)
+    return true;
+  // Only chunks present in b can contain a bit that a lacks; chunks missing
+  // from a are treated as all-zero.
+  for (kmp_size_t idx = 0; idx < b->num_chunks; idx++) {
+    const kmp_uint64 wanted = b->bits[idx];
+    const kmp_uint64 have = idx < a->num_chunks ? a->bits[idx] : 0;
+    if ((wanted & ~have) != 0)
+      return false;
+  }
+  return true;
+}
+
+/// Store the intersection of \c b and \c c into \c a.  Operands may differ in
+/// size: chunks missing from an operand are read as zero, and exactly the
+/// chunks owned by \c a are written, so the destination is never overrun and
+/// never keeps stale high chunks.
+static void __kmp_bitset_and(kmp_bitset_t *a, kmp_bitset_t *b,
+                             kmp_bitset_t *c) {
+  for (kmp_size_t chunk = 0; chunk < a->num_chunks; chunk++) {
+    kmp_uint64 b_bits = chunk < b->num_chunks ? b->bits[chunk] : 0;
+    kmp_uint64 c_bits = chunk < c->num_chunks ? c->bits[chunk] : 0;
+    a->bits[chunk] = b_bits & c_bits;
+  }
+}
+
+/// Store the set difference \c b minus \c c into \c a.  A null \c c is the
+/// empty set, in which case \c b is copied to \c a.  Otherwise chunks missing
+/// from an operand are read as zero and exactly the chunks owned by \c a are
+/// written, so the destination is never overrun.
+static void __kmp_bitset_and_not(kmp_bitset_t *a, kmp_bitset_t *b,
+                                 kmp_bitset_t *c) {
+  if (!c)
+    __kmp_bitset_copy(a, b);
+  else {
+    for (kmp_size_t chunk = 0; chunk < a->num_chunks; chunk++) {
+      kmp_uint64 b_bits = chunk < b->num_chunks ? b->bits[chunk] : 0;
+      kmp_uint64 c_bits = chunk < c->num_chunks ? c->bits[chunk] : 0;
+      a->bits[chunk] = b_bits & ~c_bits;
+    }
+  }
+}
+
+/// Store the union of \c b and \c c into \c a.  A null operand is the empty
+/// set.  When both operands are present, chunks missing from one are read as
+/// zero and exactly the chunks owned by \c a are written, so the destination
+/// is never overrun.
+static void __kmp_bitset_or(kmp_bitset_t *a, kmp_bitset_t *b, kmp_bitset_t *c) {
+  if (!b && !c)
+    __kmp_bitset_clearall(a);
+  else if (!b)
+    __kmp_bitset_copy(a, c);
+  else if (!c)
+    __kmp_bitset_copy(a, b);
+  else {
+    for (kmp_size_t chunk = 0; chunk < a->num_chunks; chunk++) {
+      kmp_uint64 b_bits = chunk < b->num_chunks ? b->bits[chunk] : 0;
+      kmp_uint64 c_bits = chunk < c->num_chunks ? c->bits[chunk] : 0;
+      a->bits[chunk] = b_bits | c_bits;
+    }
+  }
+}
+
+// Return true when no bit is set.  A null bitset counts as empty.
+static bool __kmp_bitset_empty_p(kmp_bitset_t *bitset) {
+  if (bitset == nullptr)
+    return true;
+  const kmp_uint64 *word = bitset->bits;
+  const kmp_uint64 *const end = word + bitset->num_chunks;
+  for (; word != end; ++word) {
+    if (*word)
+      return false;
+  }
+  return true;
+}
+
+/// Test two bitsets for equality.  Note that any unused bits at the end of the
+/// last chunk are kept as zero.  A null \c b is treated as the empty set, and
+/// chunks that exist in only one operand are compared against zero.
+
+static bool __kmp_bitset_equal(kmp_bitset_t *a, kmp_bitset_t *b) {
+  if (b == nullptr)
+    return __kmp_bitset_empty_p(a);
+  const kmp_size_t limit = std::max(a->num_chunks, b->num_chunks);
+  kmp_size_t idx = 0;
+  while (idx < limit) {
+    const kmp_uint64 lhs = idx < a->num_chunks ? a->bits[idx] : 0;
+    const kmp_uint64 rhs = idx < b->num_chunks ? b->bits[idx] : 0;
+    if (lhs != rhs)
+      return false;
+    ++idx;
+  }
+  return true;
+}
+
+// Return true when \p a and \p b share at least one set bit.  A null operand
+// is the empty set and intersects nothing.
+static bool __kmp_bitset_intersect_p(kmp_bitset_t *a, kmp_bitset_t *b) {
+  if (a == nullptr || b == nullptr)
+    return false;
+  // A chunk missing from either operand reads as zero and cannot contribute,
+  // so only the chunks common to both need to be inspected.
+  const kmp_size_t shared = std::min(a->num_chunks, b->num_chunks);
+  for (kmp_size_t idx = 0; idx < shared; idx++) {
+    if (a->bits[idx] & b->bits[idx])
+      return true;
+  }
+  return false;
+}
+
+// Return the number of set bits in \p bitset (0 for a null bitset).
+//
+// The count is computed with a portable loop rather than std::__popcount,
+// which is an internal name of one standard library implementation and is not
+// guaranteed to exist elsewhere.
+static kmp_int32 __kmp_bitset_popcount(kmp_bitset_t *bitset) {
+  if (!bitset)
+    return 0;
+  kmp_int32 accum = 0;
+  for (kmp_size_t c = 0; c < bitset->num_chunks; c++) {
+    // "bits &= bits - 1" clears the lowest set bit, so the inner loop runs
+    // once per set bit in the chunk.
+    for (kmp_uint64 bits = bitset->bits[c]; bits != 0; bits &= bits - 1)
+      accum++;
+  }
+  return accum;
+}
+
+// For every node in \p plist, add \p node to that node's successor list
+// unless it is already present.  No reference counts are taken
+// (__kmp_add_node<false>).  Returns the number of successor links added.
+static kmp_int32 __kmp_taskgraph_add_dep(kmp_info_t *thread,
+                                         kmp_depnode_t *node,
+                                         kmp_depnode_list_t *plist) {
+  kmp_int32 added = 0;
+  while (plist) {
+    kmp_depnode_t *pred = plist->node;
+    plist = plist->next;
+    // The search returns nullptr for an empty successor list as well.
+    if (__kmp_find_in_depnode_list(node, pred->dn.successors))
+      continue;
+    pred->dn.successors =
+        __kmp_add_node<false>(thread, pred->dn.successors, node);
+    added++;
+  }
+  return added;
+}
+
+// Add \p source to the successor list of \p sink unless it is already there.
+// No reference count is taken (__kmp_add_node<false>).  Returns 1 when a link
+// was added and 0 otherwise (including when \p sink is null).
+static kmp_int32 __kmp_taskgraph_add_dep(kmp_info_t *thread,
+                                         kmp_depnode_t *source,
+                                         kmp_depnode_t *sink) {
+  if (sink == nullptr)
+    return 0;
+  // One list search covers the empty-list, head-of-list and general cases.
+  if (__kmp_find_in_depnode_list(source, sink->dn.successors) != nullptr)
+    return 0;
+  sink->dn.successors =
+      __kmp_add_node<false>(thread, sink->dn.successors, source);
+  return 1;
+}
+#endif
+
+template <typename T>
 static inline kmp_int32
 __kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h,
                       bool dep_barrier, kmp_task_t *task) {
-  KA_TRACE(30, ("__kmp_process_dep_all: T#%d processing dep_all, "
+  KA_TRACE(30, ("__kmp_process_dep_all<%s>: T#%d processing dep_all, "
                 "dep_barrier = %d\n",
-                gtid, dep_barrier));
+                T::name, gtid, dep_barrier));
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_int32 npredecessors = 0;
 
   // process previous omp_all_memory node if any
-  npredecessors +=
-      __kmp_depnode_link_successor(gtid, thread, task, node, h->last_all);
-  __kmp_node_deref(thread, h->last_all);
+  npredecessors += T::link_successor(gtid, thread, task, node, h->last_all);
+  T::deref(thread, h->last_all);
   if (!dep_barrier) {
-    h->last_all = __kmp_node_ref(node);
+    h->last_all = T::ref(node);
   } else {
     // if this is a sync point in the serial sequence, then the previous
     // outputs are guaranteed to be completed after the execution of this
@@ -350,38 +613,37 @@ __kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h,
       kmp_depnode_list_t *last_set = info->last_set;
       kmp_depnode_list_t *prev_set = info->prev_set;
       if (last_set) {
-        npredecessors +=
-            __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
-        __kmp_depnode_list_free(thread, last_set);
-        __kmp_depnode_list_free(thread, prev_set);
+        npredecessors += T::link_successor(gtid, thread, task, node, last_set);
+        __kmp_depnode_list_free<T::rc>(thread, last_set);
+        __kmp_depnode_list_free<T::rc>(thread, prev_set);
         info->last_set = NULL;
         info->prev_set = NULL;
         info->last_flag = 0; // no sets in this dephash entry
       } else {
-        npredecessors +=
-            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
+        npredecessors += T::link_successor(gtid, thread, task, node, last_out);
       }
-      __kmp_node_deref(thread, last_out);
+      T::deref(thread, last_out);
       if (!dep_barrier) {
-        info->last_out = __kmp_node_ref(node);
+        info->last_out = T::ref(node);
       } else {
         info->last_out = NULL;
       }
     }
   }
-  KA_TRACE(30, ("__kmp_process_dep_all: T#%d found %d predecessors\n", gtid,
-                npredecessors));
+  KA_TRACE(30, ("__kmp_process_dep_all<%s>: T#%d found %d predecessors\n",
+                T::name, gtid, npredecessors));
   return npredecessors;
 }
 
-template <bool filter>
+template <typename T>
 static inline kmp_int32
 __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
                    bool dep_barrier, kmp_int32 ndeps,
-                   kmp_depend_info_t *dep_list, kmp_task_t *task) {
-  KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d processing %d dependences : "
-                "dep_barrier = %d\n",
-                filter, gtid, ndeps, dep_barrier));
+                   kmp_depend_info_t *dep_list, kmp_task_t *task,
+                   kmp_int32 &next_mutex_set, bool filter = true) {
+  KA_TRACE(30, ("__kmp_process_deps<%s>: T#%d processing %d dependences : "
+                "dep_barrier = %d, filter = %d\n",
+                T::name, gtid, ndeps, dep_barrier, filter));
 
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_int32 npredecessors = 0;
@@ -391,28 +653,31 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
     if (filter && dep->base_addr == 0)
       continue; // skip filtered entries
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    kmp_dephash_entry_t *info =
+        __kmp_dephash_find(thread, hash, dep->base_addr, !T::rc);
+#else
     kmp_dephash_entry_t *info =
         __kmp_dephash_find(thread, hash, dep->base_addr);
+#endif
     kmp_depnode_t *last_out = info->last_out;
     kmp_depnode_list_t *last_set = info->last_set;
     kmp_depnode_list_t *prev_set = info->prev_set;
 
     if (dep->flags.out) { // out or inout --> clean lists if any
       if (last_set) {
-        npredecessors +=
-            __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
-        __kmp_depnode_list_free(thread, last_set);
-        __kmp_depnode_list_free(thread, prev_set);
+        npredecessors += T::link_successor(gtid, thread, task, node, last_set);
+        __kmp_depnode_list_free<T::rc>(thread, last_set);
+        __kmp_depnode_list_free<T::rc>(thread, prev_set);
         info->last_set = NULL;
         info->prev_set = NULL;
         info->last_flag = 0; // no sets in this dephash entry
       } else {
-        npredecessors +=
-            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
+        npredecessors += T::link_successor(gtid, thread, task, node, last_out);
       }
-      __kmp_node_deref(thread, last_out);
+      T::deref(thread, last_out);
       if (!dep_barrier) {
-        info->last_out = __kmp_node_ref(node);
+        info->last_out = T::ref(node);
       } else {
         // if this is a sync point in the serial sequence, then the previous
         // outputs are guaranteed to be completed after the execution of this
@@ -423,27 +688,24 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
       if (info->last_flag == 0 || info->last_flag == dep->flag) {
         // last_set either didn't exist or of same dep kind
         // link node as successor of the last_out if any
-        npredecessors +=
-            __kmp_depnode_link_successor(gtid, thread, task, node, last_out);
+        npredecessors += T::link_successor(gtid, thread, task, node, last_out);
         // link node as successor of all nodes in the prev_set if any
-        npredecessors +=
-            __kmp_depnode_link_successor(gtid, thread, task, node, prev_set);
+        npredecessors += T::link_successor(gtid, thread, task, node, prev_set);
         if (dep_barrier) {
           // clean last_out and prev_set if any; don't touch last_set
-          __kmp_node_deref(thread, last_out);
+          T::deref(thread, last_out);
           info->last_out = NULL;
-          __kmp_depnode_list_free(thread, prev_set);
+          __kmp_depnode_list_free<T::rc>(thread, prev_set);
           info->prev_set = NULL;
         }
       } else { // last_set is of different dep kind, make it prev_set
         // link node as successor of all nodes in the last_set
-        npredecessors +=
-            __kmp_depnode_link_successor(gtid, thread, task, node, last_set);
+        npredecessors += T::link_successor(gtid, thread, task, node, last_set);
         // clean last_out if any
-        __kmp_node_deref(thread, last_out);
+        T::deref(thread, last_out);
         info->last_out = NULL;
         // clean prev_set if any
-        __kmp_depnode_list_free(thread, prev_set);
+        __kmp_depnode_list_free<T::rc>(thread, prev_set);
         if (!dep_barrier) {
           // move last_set to prev_set, new last_set will be allocated
           info->prev_set = last_set;
@@ -457,62 +719,133 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
       // 0 if last_set is empty, unchanged otherwise
       if (!dep_barrier) {
         info->last_flag = dep->flag; // store dep kind of the last_set
-        info->last_set = __kmp_add_node(thread, info->last_set, node);
+        info->last_set = __kmp_add_node<T::rc>(thread, info->last_set, node);
       }
       // check if we are processing MTX dependency
       if (dep->flag == KMP_DEP_MTX) {
-        if (info->mtx_lock == NULL) {
-          info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
-          __kmp_init_lock(info->mtx_lock);
-        }
-        KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS);
-        kmp_int32 m;
-        // Save lock in node's array
-        for (m = 0; m < MAX_MTX_DEPS; ++m) {
-          // sort pointers in decreasing order to avoid potential livelock
-          if (node->dn.mtx_locks[m] < info->mtx_lock) {
-            KMP_DEBUG_ASSERT(!node->dn.mtx_locks[node->dn.mtx_num_locks]);
-            for (int n = node->dn.mtx_num_locks; n > m; --n) {
-              // shift right all lesser non-NULL pointers
-              KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL);
-              node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1];
-            }
-            node->dn.mtx_locks[m] = info->mtx_lock;
-            break;
-          }
-        }
-        KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop
-        node->dn.mtx_num_locks++;
+        T::mutex_dep(thread, info, node, next_mutex_set);
       }
     }
   }
-  KA_TRACE(30, ("__kmp_process_deps<%d>: T#%d found %d predecessors\n", filter,
-                gtid, npredecessors));
+  KA_TRACE(30,
+           ("__kmp_process_deps<%s>: T#%d found %d predecessors (filter: %d)\n",
+            T::name, gtid, npredecessors, filter));
   return npredecessors;
 }
 
-#define NO_DEP_BARRIER (false)
-#define DEP_BARRIER (true)
+// Policy type used as the template argument T of __kmp_process_deps and
+// __kmp_process_dep_all for ordinary dependence processing.  Every operation
+// forwards to the reference-counting helpers: link_successor to
+// __kmp_depnode_link_successor, ref/deref to __kmp_node_ref/__kmp_node_deref.
+struct normal_deps {
+  // Printed in the KA_TRACE messages of the templated functions.
+  static constexpr char name[] = "normal";
+  // Passed as the template argument of __kmp_add_node and
+  // __kmp_depnode_list_free; true selects the reference-counting variants.
+  static constexpr bool rc = true;
+  static kmp_int32 link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                                  kmp_task_t *task, kmp_depnode_t *source,
+                                  kmp_depnode_t *sink);
+  static kmp_int32 link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                                  kmp_task_t *task, kmp_depnode_t *node,
+                                  kmp_depnode_list_t *plist);
+  static kmp_depnode_t *ref(kmp_depnode_t *node);
+  static void deref(kmp_info_t *thread, kmp_depnode_t *node);
+  // Called by __kmp_process_deps for each KMP_DEP_MTX dependence.
+  static void mutex_dep(kmp_info_t *thread, kmp_dephash_entry_t *info,
+                        kmp_depnode_t *node, kmp_int32 &next_mutex_set);
+};
+
+// Single-node overload: forwards unchanged to the node/node overload of
+// __kmp_depnode_link_successor and returns its result.
+kmp_int32 normal_deps::link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                                      kmp_task_t *task, kmp_depnode_t *source,
+                                      kmp_depnode_t *sink) {
+  return __kmp_depnode_link_successor(gtid, thread, task, source, sink);
+}
 
-// returns true if the task has any outstanding dependence
-static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
-                             kmp_task_t *task, kmp_dephash_t **hash,
-                             bool dep_barrier, kmp_int32 ndeps,
-                             kmp_depend_info_t *dep_list,
-                             kmp_int32 ndeps_noalias,
-                             kmp_depend_info_t *noalias_dep_list) {
-  int i, n_mtxs = 0, dep_all = 0;
-#if KMP_DEBUG
-  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+// List overload: forwards unchanged to the node/list overload of
+// __kmp_depnode_link_successor and returns its result.
+kmp_int32 normal_deps::link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                                      kmp_task_t *task, kmp_depnode_t *node,
+                                      kmp_depnode_list_t *plist) {
+  return __kmp_depnode_link_successor(gtid, thread, task, node, plist);
+}
+
+// Take a reference on \p node via __kmp_node_ref and return its result.
+kmp_depnode_t *normal_deps::ref(kmp_depnode_t *node) {
+  return __kmp_node_ref(node);
+}
+
+// Drop a reference on \p node via __kmp_node_deref.
+void normal_deps::deref(kmp_info_t *thread, kmp_depnode_t *node) {
+  __kmp_node_deref(thread, node);
+}
+
+// Handle one mutexinoutset dependence for the ordinary (non-taskgraph) path.
+// The lock belonging to the dephash entry \p info is created on first use and
+// then inserted into node->dn.mtx_locks, which is kept sorted by decreasing
+// pointer value.  \p next_mutex_set is not used by this policy.
+void normal_deps::mutex_dep(kmp_info_t *thread, kmp_dephash_entry_t *info,
+                            kmp_depnode_t *node, kmp_int32 &next_mutex_set) {
+  // Lazily allocate and initialize the lock for this dependence entry.
+  if (info->mtx_lock == NULL) {
+    info->mtx_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
+    __kmp_init_lock(info->mtx_lock);
+  }
+  KMP_DEBUG_ASSERT(node->dn.mtx_num_locks < MAX_MTX_DEPS);
+  kmp_int32 m;
+  // Save lock in node's array
+  for (m = 0; m < MAX_MTX_DEPS; ++m) {
+    // sort pointers in decreasing order to avoid potential livelock
+    if (node->dn.mtx_locks[m] < info->mtx_lock) {
+      // The slot one past the current last lock must still be empty.
+      KMP_DEBUG_ASSERT(!node->dn.mtx_locks[node->dn.mtx_num_locks]);
+      for (int n = node->dn.mtx_num_locks; n > m; --n) {
+        // shift right all lesser non-NULL pointers
+        KMP_DEBUG_ASSERT(node->dn.mtx_locks[n - 1] != NULL);
+        node->dn.mtx_locks[n] = node->dn.mtx_locks[n - 1];
+      }
+      node->dn.mtx_locks[m] = info->mtx_lock;
+      break;
+    }
+  }
+  KMP_DEBUG_ASSERT(m < MAX_MTX_DEPS); // must break from loop
+  node->dn.mtx_num_locks++;
+}
+
+#if OMP_TASKGRAPH_EXPERIMENTAL
+// Policy type used as the template argument T of __kmp_process_deps and
+// __kmp_process_dep_all when processing dependences for a taskgraph.  No
+// reference counting is done: ref returns its argument unchanged, deref does
+// nothing, and link_successor forwards to __kmp_taskgraph_add_dep.
+struct taskgraph_deps {
+  // Printed in the KA_TRACE messages of the templated functions.
+  static constexpr char name[] = "taskgraph";
+  // Passed as the template argument of __kmp_add_node and
+  // __kmp_depnode_list_free; false selects the non-reference-counting
+  // variants.
+  static constexpr bool rc = false;
+  static kmp_int32 link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                                  kmp_task_t *task, kmp_depnode_t *source,
+                                  kmp_depnode_t *sink);
+  static kmp_int32 link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                                  kmp_task_t *task, kmp_depnode_t *node,
+                                  kmp_depnode_list_t *plist);
+  // No reference is taken; the node is returned as-is.
+  static kmp_depnode_t *ref(kmp_depnode_t *node) { return node; }
+  // Intentionally empty: there is no reference to drop.
+  static void deref(kmp_info_t *thread, kmp_depnode_t *node) {}
+  // Called by __kmp_process_deps for each KMP_DEP_MTX dependence.
+  static void mutex_dep(kmp_info_t *thread, kmp_dephash_entry_t *info,
+                        kmp_depnode_t *node, kmp_int32 &next_mutex_set);
+};
+
+// Single-node overload: forwards to __kmp_taskgraph_add_dep, which adds
+// \p source to the successor list of \p sink.  gtid and task are unused.
+kmp_int32 taskgraph_deps::link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                                         kmp_task_t *task,
+                                         kmp_depnode_t *source,
+                                         kmp_depnode_t *sink) {
+  return __kmp_taskgraph_add_dep(thread, source, sink);
+}
+
+// List overload: forwards to __kmp_taskgraph_add_dep, which adds \p node to
+// the successor list of every node in \p plist.  gtid and task are unused.
+kmp_int32 taskgraph_deps::link_successor(kmp_int32 gtid, kmp_info_t *thread,
+                                         kmp_task_t *task, kmp_depnode_t *node,
+                                         kmp_depnode_list_t *plist) {
+  return __kmp_taskgraph_add_dep(thread, node, plist);
+}
+
+// Handle one mutexinoutset dependence while processing taskgraph deps.  The
+// dephash entry \p info gets the next free mutex-set number on first use
+// (-1 means "none assigned yet"), and that number is recorded in the node's
+// set-membership bitset, which is created on demand.
+// NOTE(review): the bitset is allocated with a fixed 64 bits and
+// __kmp_bitset_set ignores out-of-range bits, so a set number of 64 or more
+// would not be recorded -- confirm whether that limit is intended.
+void taskgraph_deps::mutex_dep(kmp_info_t *thread, kmp_dephash_entry_t *info,
+                               kmp_depnode_t *node, kmp_int32 &next_mutex_set) {
+  if (info->set_num == -1)
+    info->set_num = next_mutex_set++;
+
+  if (node->dn.set_membership == nullptr)
+    node->dn.set_membership = __kmp_bitset_alloc(thread, 64);
+
+  __kmp_bitset_set(node->dn.set_membership, info->set_num);
+}
 #endif
-  KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependences for task %p : %d "
-                "possibly aliased dependences, %d non-aliased dependences : "
-                "dep_barrier=%d .\n",
-                gtid, taskdata, ndeps, ndeps_noalias, dep_barrier));
+
+/// Search for aliased (same base address) dependencies in \c dep_list, and
+/// nullify duplicates.  Return TRUE if we have an 'all' dependency, FALSE
+/// otherwise.  Return number of mutex dependencies in *N_MTXS.
+static bool __kmp_filter_aliased_deps(kmp_int32 ndeps,
+                                      kmp_depend_info_t *dep_list,
+                                      kmp_task_t *task, int *n_mtxs) {
+  *n_mtxs = 0;
 
   // Filter deps in dep_list
   // TODO: Different algorithm for large dep_list ( > 10 ? )
-  for (i = 0; i < ndeps; i++) {
+  for (int i = 0; i < ndeps; i++) {
     if (dep_list[i].base_addr != 0 &&
         dep_list[i].base_addr != (kmp_intptr_t)KMP_SIZE_T_MAX) {
       KMP_DEBUG_ASSERT(
@@ -530,8 +863,8 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
       }
       if (dep_list[i].flag == KMP_DEP_MTX) {
         // limit number of mtx deps to MAX_MTX_DEPS per node
-        if (n_mtxs < MAX_MTX_DEPS && task != NULL) {
-          ++n_mtxs;
+        if (*n_mtxs < MAX_MTX_DEPS && task != NULL) {
+          ++(*n_mtxs);
         } else {
           dep_list[i].flag = KMP_DEP_OUT; // downgrade mutexinoutset to inout
         }
@@ -541,11 +874,2477 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
       // omp_all_memory dependence can be marked by compiler by either
       // (addr=0 && flag=0x80) (flag KMP_DEP_ALL), or (addr=-1).
       // omp_all_memory overrides all other dependences if any
-      dep_all = 1;
+      return true;
+    }
+  }
+  return false;
+}
+
+#if OMP_TASKGRAPH_EXPERIMENTAL
+// Round up a size to a power of two specified by val: Used to insert padding
+// between structures co-allocated using a single malloc() call
+// FIXME: We copy+pasted this, put it somewhere else instead.
+static size_t __kmp_round_up_to_val(size_t size, size_t val) {
+  if (size & (val - 1)) {
+    size &= ~(val - 1);
+    if (size <= KMP_SIZE_T_MAX - val) {
+      size += val; // Round up if there is no overflow.
+    }
+  }
+  return size;
+} // __kmp_round_up_to_val
+
+// FIXME: C++-ify this.
+static kmp_taskgraph_region_t *__kmp_taskgraph_region_alloc(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_node_t *node,
+    kmp_taskgraph_region_t *parent) {
+  kmp_taskgraph_region_t *region =
+      (kmp_taskgraph_region_t *)__kmp_fast_allocate(
+          thread, sizeof(kmp_taskgraph_region_t));
+  region->owner = taskgraph;
+  region->type = node ? TASKGRAPH_REGION_NODE : TASKGRAPH_REGION_WAIT;
+  region->task.node = node;
+  region->task.next_instance = region;
+  region->mark = TASKGRAPH_UNMARKED;
+  region->level = -1;
+  region->timestamp = 0;
+  region->next = nullptr;
+  region->parent = parent;
+  region->predecessors = nullptr;
+  region->successors = nullptr;
+  region->mutexset = nullptr;
+  region->mutexset_parent = nullptr;
+  region->reduce_input = nullptr;
+  *alloc_chain = region;
+  alloc_chain = &region->alloc_chain;
+  return region;
+}
+
+// FIXME: This too.
+static kmp_taskgraph_region_t *__kmp_taskgraph_region_alloc(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, enum kmp_taskgraph_region_type type,
+    kmp_int32 num_nodes, kmp_taskgraph_region_t *parent) {
+  kmp_size_t size = sizeof(kmp_taskgraph_region_t) +
+                    num_nodes * sizeof(kmp_taskgraph_region_t *);
+  size = __kmp_round_up_to_val(size, sizeof(kmp_taskgraph_region_t *));
+  kmp_taskgraph_region_t *region =
+      (kmp_taskgraph_region_t *)__kmp_fast_allocate(thread, size);
+  region->owner = taskgraph;
+  region->type = type;
+  region->inner.children = (kmp_taskgraph_region **)&region[1];
+  region->inner.num_children = num_nodes;
+  region->mark = TASKGRAPH_UNMARKED;
+  region->level = -1;
+  region->timestamp = 0;
+  region->next = nullptr;
+  region->parent = parent;
+  region->predecessors = nullptr;
+  region->successors = nullptr;
+  region->mutexset = nullptr;
+  region->mutexset_parent = nullptr;
+  region->reduce_input = nullptr;
+  *alloc_chain = region;
+  alloc_chain = &region->alloc_chain;
+  return region;
+}
+
+// This makes a mostly-deep copy of a region.  The region itself and children
+// are created new (keeping FROM's type), but task node pointers are shared.
+static kmp_taskgraph_region_t *__kmp_taskgraph_region_clone(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t *from,
+    kmp_taskgraph_region_t *parent, kmp_int32 indent = 0) {
+  kmp_taskgraph_region_t *clone = nullptr;
+  switch (from->type) {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+    clone = __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
+                                         nullptr, parent);
+    break;
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT:
+    clone = __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
+                                         from->task.node, parent);
+    break;
+  default:
+    clone =
+        __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain, from->type,
+                                     from->inner.num_children, parent);
+    for (kmp_int32 n = 0; n < from->inner.num_children; n++) {
+      clone->inner.children[n] = __kmp_taskgraph_region_clone(
+          thread, taskgraph, alloc_chain, from->inner.children[n], clone,
+          indent + 2);
+    }
+  }
+  // The leaf allocator derives NODE/WAIT from the node pointer alone, so set
+  // the type explicitly: a WAIT region carrying a node must stay a WAIT.
+  clone->type = from->type;
+  TGDBG("%*scloned region %p from region %p\n", indent, "", clone, from);
+  return clone;
+}
+
+static kmp_int32
+__kmp_taskgraph_topological_order(kmp_taskgraph_region_t *region,
+                                  kmp_taskgraph_region_t **order_out,
+                                  kmp_int32 *outidx) {
+  if (region->mark == TASKGRAPH_PERMANENT_MARK)
+    return region->level;
+
+  assert(region->mark != TASKGRAPH_TEMP_MARK);
+
+  region->mark = TASKGRAPH_TEMP_MARK;
+
+  kmp_int32 max_level = -1;
+  for (kmp_taskgraph_region_dep_t *s = region->predecessors; s; s = s->next) {
+    kmp_int32 pred_level =
+        __kmp_taskgraph_topological_order(s->region, order_out, outidx);
+    max_level = pred_level > max_level ? pred_level : max_level;
+  }
+
+  region->level = max_level + 1;
+  region->mark = TASKGRAPH_PERMANENT_MARK;
+  order_out[(*outidx)++] = region;
+
+  return region->level;
+}
+
+static void
+__kmp_taskgraph_region_chain_clear_marks(kmp_taskgraph_region_t *region) {
+  for (; region; region = region->next)
+    region->mark = TASKGRAPH_UNMARKED;
+}
+
+static void
+__kmp_taskgraph_region_chain_prune(kmp_taskgraph_region_t **region_p) {
+  kmp_taskgraph_region_t *pruned_region = nullptr, *region = *region_p;
+  kmp_taskgraph_region_t **pruned_region_p = &pruned_region;
+
+  TGDBG("pruning worklist...\n");
+
+  // NOTE: Pruning and deletion look the same here with respect to the handling
+  // of the worklist, but deleted nodes are freed from the taskgraph structure
+  // during cleanup, whereas combined nodes are retained.
+  for (; region; region = region->next) {
+    if (region->mark == TASKGRAPH_COMBINED || region->mark == TASKGRAPH_DELETED)
+      *pruned_region_p = region->next;
+    else {
+      *pruned_region_p = region;
+      pruned_region_p = &region->next;
+    }
+  }
+
+  *pruned_region_p = nullptr;
+  *region_p = pruned_region;
+}
+
+static kmp_int32 __kmp_region_deplist_len(kmp_taskgraph_region_dep_t *list) {
+  kmp_int32 len = 0;
+  for (; list; list = list->next)
+    ++len;
+  return len;
+}
+
+static void __kmp_region_deplist_free(kmp_info_t *thread,
+                                      kmp_taskgraph_region_dep_t *list) {
+  while (list) {
+    kmp_taskgraph_region_dep_t *next = list->next;
+    __kmp_fast_free(thread, list);
+    list = next;
+  }
+}
+
+static void __kmp_region_dep_recycle(kmp_taskgraph_region_dep_t **recycled,
+                                     kmp_taskgraph_region_dep_t *dep) {
+  dep->next = *recycled;
+  *recycled = dep;
+}
+
+static void __kmp_region_deplist_recycle(kmp_taskgraph_region_dep_t **recycled,
+                                         kmp_taskgraph_region_dep_t *list) {
+  while (list) {
+    kmp_taskgraph_region_dep_t *next = list->next;
+    __kmp_region_dep_recycle(recycled, list);
+    list = next;
+  }
+}
+
+static bool __kmp_taskgraph_collapse_sequence(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t **region_p,
+    kmp_taskgraph_region_t *parent, kmp_int32 &stamp) {
+  kmp_taskgraph_region_t *region = *region_p;
+  kmp_taskgraph_region_t *chain_start = region;
+  kmp_taskgraph_region_t *chain_end = region;
+  kmp_int32 chain_len = 1;
+
+  if (region->type == TASKGRAPH_REGION_ENTRY)
+    return false;
+
+  while (__kmp_region_deplist_len(chain_end->successors) == 1) {
+    kmp_taskgraph_region_t *past_end = chain_end->successors->region;
+    if (__kmp_region_deplist_len(past_end->predecessors) == 1) {
+      if (past_end->type == TASKGRAPH_REGION_EXIT)
+        break;
+      else {
+        chain_end = past_end;
+        ++chain_len;
+      }
+    } else
       break;
+  }
+
+  if (chain_len <= 1)
+    return false;
+
+  kmp_taskgraph_region_t *seq_region = __kmp_taskgraph_region_alloc(
+      thread, taskgraph, alloc_chain, TASKGRAPH_REGION_SEQUENTIAL, chain_len,
+      parent);
+  TGDBG("allocated new seq region: %p (length %d)\n", seq_region, chain_len);
+  kmp_taskgraph_region_t **worklist_p = region_p;
+  *worklist_p = seq_region;
+  seq_region->next = chain_start->next;
+  kmp_int32 level = -1;
+  for (kmp_int32 i = 0; i < chain_len; i++) {
+    seq_region->inner.children[i] = chain_start;
+    TGDBG("mark node %p as combined\n", chain_start);
+    chain_start->mark = TASKGRAPH_COMBINED;
+    chain_start->timestamp = stamp;
+    chain_start->parent = seq_region;
+    // The level of the sequence is the level of the first node.
+    if (level == -1)
+      level = chain_start->level;
+
+    if (i < chain_len - 1) {
+      chain_start = chain_start->successors->region;
+    }
+  }
+
+  seq_region->level = level;
+  seq_region->predecessors = seq_region->inner.children[0]->predecessors;
+  seq_region->successors =
+      seq_region->inner.children[chain_len - 1]->successors;
+  seq_region->inner.children[0]->predecessors = nullptr;
+  seq_region->inner.children[chain_len - 1]->successors = nullptr;
+
+  // Update predecessors to point to new seq region.
+  for (kmp_taskgraph_region_dep_t *pred = seq_region->predecessors; pred;
+       pred = pred->next) {
+    for (kmp_taskgraph_region_dep_t *succ = pred->region->successors; succ;
+         succ = succ->next) {
+      if (succ->region == seq_region->inner.children[0]) {
+        succ->region = seq_region;
+      }
+    }
+  }
+
+  // Update successors to point back to new seq region.
+  for (kmp_taskgraph_region_dep_t *succ = seq_region->successors; succ;
+       succ = succ->next) {
+    for (kmp_taskgraph_region_dep_t *pred = succ->region->predecessors; pred;
+         pred = pred->next) {
+      if (pred->region == seq_region->inner.children[chain_len - 1]) {
+        pred->region = seq_region;
+      }
+    }
+  }
+
+  return true;
+}
+
+static const char *
+__kmp_taskgraph_region_type_name(kmp_taskgraph_region_type type);
+
+static void __kmp_taskgraph_region_dfs(kmp_taskgraph_region_t *region,
+                                       kmp_taskgraph_region_t **order,
+                                       kmp_int32 &idx, bool use_preds) {
+  if (order) {
+    region->timestamp = --idx;
+    order[idx] = region;
+  }
+  region->mark = TASKGRAPH_TEMP_MARK;
+  for (kmp_taskgraph_region_dep_t *reg = use_preds ? region->predecessors
+                                                   : region->successors;
+       reg; reg = reg->next) {
+    if (reg->region->mark == TASKGRAPH_UNMARKED)
+      __kmp_taskgraph_region_dfs(reg->region, order, idx, use_preds);
+  }
+}
+
+#if defined(DEBUG_TASKGRAPH) && defined(CHECK_WORKLIST)
+
+static void __kmp_taskgraph_region_gather_deps(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, kmp_taskgraph_region_dep_t **deplist,
+    bool &ok) {
+  for (kmp_taskgraph_region_dep_t *dep = *deplist; dep; dep = dep->next) {
+    if (dep->region == region)
+      return;
+  }
+
+  *deplist = __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, region,
+                                      *deplist);
+
+  for (kmp_taskgraph_region_dep_t *pred = region->predecessors; pred;
+       pred = pred->next) {
+    if (pred->region->mark == TASKGRAPH_DELETED) {
+      fprintf(stderr, "*** Region %p's predecessor %p is a deleted node\n",
+              region, pred->region);
+      ok = false;
+    }
+    __kmp_taskgraph_region_gather_deps(thread, taskgraph, pred->region, deplist,
+                                       ok);
+  }
+
+  for (kmp_taskgraph_region_dep_t *succ = region->successors; succ;
+       succ = succ->next) {
+    if (succ->region->mark == TASKGRAPH_DELETED) {
+      fprintf(stderr, "*** Region %p's successor %p is a deleted node\n",
+              region, succ->region);
+      ok = false;
+    }
+    __kmp_taskgraph_region_gather_deps(thread, taskgraph, succ->region, deplist,
+                                       ok);
+  }
+}
+
+static bool __kmp_taskgraph_region_worklist_check(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, const char *where) {
+  kmp_taskgraph_region_dep_t *collected_nodes = nullptr;
+  bool ok = true;
+  __kmp_taskgraph_region_gather_deps(thread, taskgraph, region,
+                                     &collected_nodes, ok);
+
+  // Check all collected nodes are in the region's worklist.
+  for (kmp_taskgraph_region_dep_t *cn = collected_nodes; cn; cn = cn->next) {
+    bool in_list = false;
+    for (kmp_taskgraph_region_t *r = region; r; r = r->next) {
+      if (r == cn->region) {
+        in_list = true;
+        break;
+      }
+    }
+    if (!in_list) {
+      fprintf(stderr,
+              "*** Region %p is in dependency graph but not worklist (%s)\n",
+              cn->region, where);
+      ok = false;
+    }
+  }
+
+  for (kmp_taskgraph_region_t *r = region; r; r = r->next) {
+    bool in_list = false;
+    for (kmp_taskgraph_region_dep_t *cn = collected_nodes; cn; cn = cn->next) {
+      if (r == cn->region) {
+        in_list = true;
+        break;
+      }
+    }
+    if (!in_list) {
+      fprintf(stderr,
+              "*** Region %p is in worklist but not dependency graph (%s)\n", r,
+              where);
+      ok = false;
+    }
+  }
+
+  __kmp_region_deplist_recycle(&taskgraph->recycled_deps, collected_nodes);
+
+  return ok;
+}
+#else
+static bool __kmp_taskgraph_region_worklist_check(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, const char *where) {
+  return true;
+}
+#endif
+
+static kmp_taskgraph_region_t *__kmp_taskgraph_region_dom_intersect(
+    kmp_taskgraph_region_t **order, kmp_taskgraph_region_t **doms,
+    kmp_taskgraph_region_t *b1, kmp_taskgraph_region_t *b2) {
+  kmp_int32 finger1 = b1->timestamp;
+  kmp_int32 finger2 = b2->timestamp;
+  while (finger1 != finger2) {
+    while (finger1 < finger2)
+      finger1 = doms[finger1]->timestamp;
+    while (finger2 < finger1)
+      finger2 = doms[finger2]->timestamp;
+  }
+  return order[finger1];
+}
+
+static void __kmp_taskgraph_region_doms(kmp_taskgraph_region_t **order,
+                                        kmp_taskgraph_region_t **doms,
+                                        kmp_int32 worklist_length,
+                                        bool postdom) {
+  bool changed = true;
+  // Set doms[start_node] <- start_node
+  doms[worklist_length - 1] = order[worklist_length - 1];
+  order[worklist_length - 1]->mark = TASKGRAPH_PERMANENT_MARK;
+  while (changed) {
+    changed = false;
+    for (int n = 0; n < worklist_length - 1; n++) {
+      kmp_taskgraph_region_t *b = order[n];
+      kmp_taskgraph_region_t *new_idom = nullptr;
+      for (kmp_taskgraph_region_dep_t *pred = postdom ? b->successors
+                                                      : b->predecessors;
+           pred; pred = pred->next) {
+        if (pred->region->mark == TASKGRAPH_PERMANENT_MARK) {
+          new_idom = pred->region;
+          break;
+        }
+      }
+      for (kmp_taskgraph_region_dep_t *pred = postdom ? b->successors
+                                                      : b->predecessors;
+           pred; pred = pred->next) {
+        if (pred->region == new_idom)
+          continue;
+        if (doms[pred->region->timestamp]) {
+          new_idom = __kmp_taskgraph_region_dom_intersect(
+              order, doms, pred->region, new_idom);
+        }
+      }
+      if (doms[b->timestamp] != new_idom) {
+        doms[b->timestamp] = new_idom;
+        order[b->timestamp]->mark = TASKGRAPH_PERMANENT_MARK;
+        changed = true;
+      }
+    }
+  }
+}
+
+static bool __kmp_taskgraph_region_mutex_p(kmp_taskgraph_region_t *reg) {
+  if (reg->type == TASKGRAPH_REGION_NODE)
+    return reg->mutexset != nullptr;
+  return false;
+}
+
+// This function collapses graph regions with forms like this:
+//
+//  1.    A(pp)    2.        A               3.     A(pp)
+//       / \              /     \                 /   \
+//      B   C           B(pp)    E(pp)           B(pp)  E
+//       \ /           / \       / \            / \    /
+//       D(*)         C   D     F   G           C  D  /
+//                     \   \   /   /             \ | /
+//                      `---H(*)--'               F(*)
+//
+// We look for a node with more than one predecessor (*), where each of those
+// predecessors has a single successor and a single predecessor (pp).  We group
+// nodes by which pp (predecessor-predecessor) they have: for (1), nodes B & C
+// share a pp; for (2), C & D share a pp, and F & G share a pp; for (3), C & D
+// share a pp, and E has a separate pp.
+//
+// We choose the pp with the highest level ("furthest down the graph"), and
+// collapse the subgraph into a parallel region.
+
+static bool __kmp_taskgraph_collapse_par_exclusive(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t **region_p,
+    kmp_taskgraph_region_t *parent, kmp_int32 &stamp) {
+  kmp_taskgraph_region_t *region = *region_p;
+  kmp_int32 num_predecessors = __kmp_region_deplist_len(region->predecessors);
+
+  TGDBG("predecessors %d, successors %d\n",
+        __kmp_region_deplist_len(region->predecessors),
+        __kmp_region_deplist_len(region->successors));
+
+  if (num_predecessors <= 1)
+    return false;
+
+  TGDBG("found multiple predecessors, creating parallel/unordered region\n");
+  kmp_taskgraph_region_dep_t *pred_preds = nullptr;
+  kmp_int32 highest_level = -1;
+
+  for (kmp_taskgraph_region_dep_t *pred = region->predecessors; pred;
+       pred = pred->next) {
+    TGDBG("consider predecessor: %p\n", pred->region);
+    TGDBG("-- successors %d, predecessors %d\n",
+          __kmp_region_deplist_len(pred->region->successors),
+          __kmp_region_deplist_len(pred->region->predecessors));
+    if (highest_level == -1 || pred->region->level > highest_level)
+      highest_level = pred->region->level;
+    kmp_taskgraph_region_t *pred_region = pred->region;
+    if (__kmp_region_deplist_len(pred_region->successors) != 1)
+      continue;
+    if (__kmp_region_deplist_len(pred_region->predecessors) != 1)
+      continue;
+    bool in_list = false;
+    TGDBG("pp region: %p (%s)\n", pred_region->predecessors->region,
+          __kmp_taskgraph_region_type_name(
+              pred_region->predecessors->region->type));
+    kmp_taskgraph_region_t *pp_region = pred_region->predecessors->region;
+    for (kmp_taskgraph_region_dep_t *pp = pred_preds; pp; pp = pp->next) {
+      if (pp->region == pp_region) {
+        in_list = true;
+        break;
+      }
+    }
+    if (!in_list) {
+      pred_preds = __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
+                                            pp_region, pred_preds);
+      TGDBG("add %p to list: len(pred_preds)=%d\n", pp_region,
+            __kmp_region_deplist_len(pred_preds));
+    }
+  }
+
+  kmp_int32 num_pps = __kmp_region_deplist_len(pred_preds);
+  if (num_pps == 0) {
+    TGDBG("no collapsible regions, bailing out\n");
+    return false;
+  }
+  TGDBG("found %d predecessor-predecessors\n", num_pps);
+  TGDBG("highest pred level: %d\n", highest_level);
+
+  bool changed = false;
+
+  for (kmp_taskgraph_region_dep_t *pp = pred_preds; pp; pp = pp->next) {
+    kmp_taskgraph_region_dep_t *par_succs = nullptr;
+    kmp_taskgraph_region_dep_t *par_preds = nullptr;
+    kmp_int32 preds_for_pp = 0;
+    bool any_mutex_p = false;
+    for (kmp_taskgraph_region_dep_t *pred = region->predecessors; pred;
+         pred = pred->next) {
+      kmp_taskgraph_region_t *pred_region = pred->region;
+      if (!pred_region->predecessors)
+        continue;
+      if (pred_region->level < highest_level)
+        continue;
+      if (__kmp_region_deplist_len(pred_region->predecessors) != 1 ||
+          __kmp_region_deplist_len(pred_region->successors) != 1)
+        continue;
+      TGDBG("counting pred region: %p (%s)\n", pred_region,
+            __kmp_taskgraph_region_type_name(pred_region->type));
+      if (pred_region->predecessors->region == pp->region) {
+        ++preds_for_pp;
+        if (__kmp_taskgraph_region_mutex_p(pred_region))
+          any_mutex_p = true;
+      }
+    }
+    TGDBG("found %d preds for pp region %p\n", preds_for_pp, pp->region);
+    if (preds_for_pp < 2)
+      continue;
+    kmp_taskgraph_region_type region_type =
+        any_mutex_p ? TASKGRAPH_REGION_EXCLUSIVE : TASKGRAPH_REGION_PARALLEL;
+    kmp_taskgraph_region_t *par_region = __kmp_taskgraph_region_alloc(
+        thread, taskgraph, alloc_chain, region_type, preds_for_pp, parent);
+    changed = true;
+    TGDBG("allocated %s region: %p\n",
+          region_type == TASKGRAPH_REGION_EXCLUSIVE ? "exclusive" : "parallel",
+          par_region);
+    kmp_taskgraph_region_dep_t *pred = region->predecessors;
+    kmp_int32 level = -1;
+    bool found_reduction_data = false;
+    for (kmp_int32 i = 0; pred; pred = pred->next) {
+      kmp_taskgraph_region_t *pred_region = pred->region;
+      TGDBG("considering pred region: %p\n", pred_region);
+      if (!pred_region->predecessors) {
+        TGDBG("bailing (no predecessors)\n");
+        continue;
+      }
+      if (pred_region->predecessors->region != pp->region) {
+        TGDBG("bailing (wrong pp region)\n");
+        continue;
+      }
+      if (__kmp_region_deplist_len(pred_region->predecessors) != 1 ||
+          __kmp_region_deplist_len(pred_region->successors) != 1) {
+        TGDBG("bailing (non-unit pred/succ list length)\n");
+        continue;
+      }
+      TGDBG("process region %p (%d/%d), level %d\n", pred->region, i + 1,
+            preds_for_pp, pred_region->level);
+      par_region->inner.children[i] = pred_region;
+      pred_region->mark = TASKGRAPH_COMBINED;
+      pred_region->timestamp = stamp;
+      pred_region->parent = par_region;
+
+      // Reduction handling.  The reduction input data is now attached to one
+      // of the tasks participating in the reduction.  Move it to the enclosing
+      // parallel region instead.
+      if (pred_region->type == TASKGRAPH_REGION_NODE &&
+          pred_region->task.node->reduce_input) {
+        // We should only be doing this once per par region.
+        assert(!par_region->reduce_input);
+        par_region->reduce_input = pred_region->task.node->reduce_input;
+        pred_region->task.node->reduce_input = nullptr;
+        found_reduction_data = true;
+      }
+
+      // We expect all the predecessor regions to be at the same level.
+      if (level == -1)
+        level = pred_region->level;
+      else
+        assert(level == pred_region->level);
+      if (!par_succs) {
+        // Copy one list of predecessors/successors for the predecessor region.
+        // We know these are of length one by checks above.  We'll re-use them
+        // for the created parallel region.
+        par_preds = pred_region->predecessors;
+        par_succs = pred_region->successors;
+        pred_region->predecessors = nullptr;
+        pred_region->successors = nullptr;
+      }
+      i++;
+    }
+    par_region->level = level;
+    par_region->predecessors = par_preds;
+    par_region->successors = par_succs;
+
+    if (region->type == TASKGRAPH_REGION_WAIT && !found_reduction_data) {
+      // If we have no reduction data, we will not create a taskgroup for this
+      // parallel region at replay time, so we don't need to terminate/discard
+      // that region when we're done.  Clear the taskloop_task flag.
+      region->task.node->taskloop_task = false;
+    }
+
+    // Add the new parallel region to the worklist. FIXME: We're reprocessing
+    // the 'region' node here -- we don't need to do that if it's fully
+    // consumed.
+    par_region->next = region->next;
+    region->next = par_region;
+  }
+
+#ifdef DEBUG_TASKGRAPH
+  TGDBG("before pred fixup:\n");
+  for (kmp_taskgraph_region_dep_t *pred = region->predecessors; pred;
+       pred = pred->next) {
+    TGDBG("region %p, pred region: %p\n", region, pred->region);
+  }
+#endif
+
+  // Now, fix up predecessor list for 'region', and successor lists for each
+  // predecessor-predecessor.
+  kmp_taskgraph_region_dep_t **dep_p = &region->predecessors;
+  while (*dep_p) {
+    kmp_taskgraph_region_dep_t *dep = *dep_p;
+    if (dep->region->mark == TASKGRAPH_COMBINED) {
+      if (!dep->region->successors) {
+        dep->region = dep->region->parent;
+        dep_p = &dep->next;
+      } else {
+        kmp_taskgraph_region_dep_t *next = dep->next;
+        __kmp_region_dep_recycle(&taskgraph->recycled_deps, dep);
+        *dep_p = next;
+      }
+    } else {
+      dep_p = &dep->next;
     }
   }
 
+#ifdef DEBUG_TASKGRAPH
+  TGDBG("after pred fixup:\n");
+  for (kmp_taskgraph_region_dep_t *pred = region->predecessors; pred;
+       pred = pred->next) {
+    TGDBG("region %p, pred region: %p\n", region, pred->region);
+  }
+#endif
+
+  for (kmp_taskgraph_region_dep_t *pp = pred_preds; pp; pp = pp->next) {
+    kmp_taskgraph_region_t *pp_region = pp->region;
+    dep_p = &pp_region->successors;
+    while (*dep_p) {
+      kmp_taskgraph_region_dep_t *dep = *dep_p;
+      if (dep->region->mark == TASKGRAPH_COMBINED) {
+        if (!dep->region->predecessors) {
+          dep->region = dep->region->parent;
+          dep_p = &dep->next;
+        } else {
+          kmp_taskgraph_region_dep_t *next = dep->next;
+          __kmp_region_dep_recycle(&taskgraph->recycled_deps, dep);
+          *dep_p = next;
+        }
+      } else {
+        dep_p = &dep->next;
+      }
+    }
+  }
+
+  // Hand the scratch pp list back to the recycle pool now we're done with it.
+  __kmp_region_deplist_recycle(&taskgraph->recycled_deps, pred_preds);
+  return changed;
+}
+
+static void __kmp_taskgraph_region_dot(kmp_taskgraph_region_t *region,
+                                       const char *name) {
+  fprintf(stderr, "digraph %s {\n", name);
+  for (kmp_taskgraph_region_t *r = region; r; r = r->next) {
+    if (r->mark == TASKGRAPH_DELETED) {
+      fprintf(stderr, "\"%p\" [shape=box, label=\"%p(%s) (deleted)\"]\n", r, r,
+              __kmp_taskgraph_region_type_name(r->type));
+    } else if (r->level == -1) {
+      fprintf(stderr, "\"%p\" [shape=box, label=\"%p(%s) (new)\"]\n", r, r,
+              __kmp_taskgraph_region_type_name(r->type));
+    } else {
+      fprintf(stderr, "\"%p\" [shape=box, label=\"%p(%s)\"]\n", r, r,
+              __kmp_taskgraph_region_type_name(r->type));
+    }
+    for (kmp_taskgraph_region_dep_t *succ = r->successors; succ;
+         succ = succ->next) {
+      fprintf(stderr, "  \"%p\" -> \"%p\" [color=green]\n", r, succ->region);
+    }
+    for (kmp_taskgraph_region_dep_t *pred = r->predecessors; pred;
+         pred = pred->next) {
+      fprintf(stderr, "  \"%p\" -> \"%p\" [color=red, constraint=false]\n", r,
+              pred->region);
+    }
+  }
+  fprintf(stderr, "}\n");
+}
+
+static kmp_int32
+__kmp_taskgraph_count_edges_to_dominator(kmp_taskgraph_region_t *reg,
+                                         kmp_taskgraph_region_t *dom) {
+  kmp_int32 count = __kmp_region_deplist_len(reg->successors) - 1;
+
+  for (kmp_taskgraph_region_dep_t *pred = reg->predecessors; pred;
+       pred = pred->next) {
+    if (pred->region == dom)
+      count++;
+    else
+      count += __kmp_taskgraph_count_edges_to_dominator(pred->region, dom) + 1;
+  }
+  count--;
+
+  return count;
+}
+
+/// Extract/clone a subgraph of the dependency graph, and rewrite predecessor
+/// and successor edges to point to the new cloned part.
+//
+// The function conceptually starts at the bottom (a list of predecessors
+// with some particular dominator) and works up towards the entry point,
+// stopping when it hits the aforementioned dominator.
+//
+// Say we have an irreducible graph like this (each letter represents a region,
+// which could be a single task node or an already-processed nested region):
+//
+//          <S>         (S->A, S->B)
+//        _/   \_
+//       /       \
+//      A         B     (A->C, A->D, B->F, B->G)
+//     /  \      /  \
+//    C    D    F    G
+//    |\     \/     /|
+//    | \    /\    / |  (C->H, C->I, D->I, F->H, G->H, G->J)
+//    |  \ / __|__/  |
+//     \ /\_/_ /     |
+//      H__/  I      J
+//       \__  |  ___/   (H->E, I->E, J->E)
+//          \ | /
+//           <E>
+//
+// We pick the exit node E which has more than one predecessor: H, I and J.
+// In this case, H is immediately dominated by the start node, S.
+// The 'preds_with_dom' list initially contains the node H.
+// We clone the region H then call ourselves with its cloned predecessors,
+// until we hit the dominator 'region_dom'.  After rewriting the original
+// subgraph's (entering) predecessors and (leaving) successors, we obtain a
+// graph like this:
+//
+//           __ <S>__          (S->A', S->B', S->A, S->B)
+//        _/  /   \  \___
+//       /  /      \     \
+//      A'  B'      A     B    (A'->C', B'->F', B'->G', A->C, A->D, B->F, B->G)
+//     /   / \     / \   / \
+//    C'  F'  G'  C   D  F* G  (C'->H', F'->H', G'->H', C->I, D->I, G->J)
+//     \_ | _/     \ /     /
+//        H'        I     J    (H'->E, I->E, J->E)
+//         \        |    /
+//          \___   / __/
+//               <E>
+//
+// The new cloned subgraph formed from nodes H', C', F', G', A', B' replaces
+// the original predecessor of E, H.  Some nodes are now unreachable (F, marked
+// with *), and can be deleted.  The start node S now has successors A, B, and
+// the new clones A' and B'.
+//
+// In this way, irreducible graphs are turned into reducible graphs.  A
+// critical point is what it means to clone a task node in this way: that is
+// discussed in the commentary of __kmp_taskgraph_rewrite_irreducible.
+
+/// Recursively clone the regions reachable backwards (via predecessor edges)
+/// from the dependency list \p preds_with_dom, stopping at \p region_dom.
+//
+// Each entry of PREDS_WITH_DOM is redirected in place so that it refers to
+// the clone of the region it originally named.  Entries that already name
+// REGION_DOM are left alone.  CLONED_NODES is indexed by the timestamp of the
+// original region and records the clone made for it, so a region reached along
+// several paths is only cloned once.  Every new clone is appended to the
+// worklist whose tail pointer is *ADDED_WORKLIST, and receives a fresh copy of
+// the original's predecessor list, which is then processed recursively.
+//
+// Successor lists of the clones are not filled in here.  ORIG_REGION and DOMS
+// are accepted but not read by this function.
+
+static void __kmp_taskgraph_clone_subgraph(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain,
+    kmp_taskgraph_region_t *cloned_nodes[], kmp_taskgraph_region_t *orig_region,
+    kmp_taskgraph_region_t *doms[], kmp_taskgraph_region_dep_t *preds_with_dom,
+    kmp_taskgraph_region_t *region_dom,
+    kmp_taskgraph_region_t ***added_worklist) {
+  for (kmp_taskgraph_region_dep_t *edge = preds_with_dom; edge != nullptr;
+       edge = edge->next) {
+    kmp_taskgraph_region_t *source = edge->region;
+
+    // An edge coming straight from the dominating region keeps its endpoint.
+    // NOTE: Adding the new subgraph entry point as a new successor for the
+    // dominating block is done in the successor-adding post-pass.
+    if (source == region_dom)
+      continue;
+
+    // A clone may already exist if this region was reached by another path.
+    if (cloned_nodes[source->timestamp] != nullptr) {
+      edge->region = cloned_nodes[source->timestamp];
+      continue;
+    }
+
+    kmp_taskgraph_region_t *copy = __kmp_taskgraph_region_clone(
+        thread, taskgraph, alloc_chain, source, nullptr);
+    cloned_nodes[source->timestamp] = copy;
+    edge->region = copy;
+
+    // Append the clone to the list of newly-created work.
+    **added_worklist = copy;
+    *added_worklist = &copy->next;
+
+    // Give the clone its own copy of the original's predecessor edges.  The
+    // copies still name the original predecessors; the recursive call below
+    // redirects them to clones where appropriate.
+    kmp_taskgraph_region_dep_t *copied_preds = nullptr;
+    for (kmp_taskgraph_region_dep_t *src_edge = source->predecessors;
+         src_edge != nullptr; src_edge = src_edge->next) {
+      copied_preds = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, src_edge->region, copied_preds);
+    }
+    copy->predecessors = copied_preds;
+
+    // The region passed down is the original predecessor, not its clone.
+    __kmp_taskgraph_clone_subgraph(thread, taskgraph, alloc_chain,
+                                   cloned_nodes, source, doms, copied_preds,
+                                   region_dom, added_worklist);
+  }
+}
+
+/// This function uses several strategies to turn an irreducible taskgraph
+/// into a reducible taskgraph.
+//
+// 1. If a node C depends on node B and also node A which dominates C,
+//    and if B is also dominated by A, then the dependency of C on A can be
+//    dropped.  That is, we know B must execute after A, so we can say
+//    execution must proceed A->B->C, and we don't also need to specify the
+//    transitive A->C dependency directly.
+//
+//            A          A
+//           / \         |
+//          B   )   ->   B
+//           \ /         |
+//            C          C
+//
+// 2. Two nodes with the same set of predecessors and successors are turned
+//    into a parallel region.  This graph form can arise from use of
+//    "inoutset" dependencies.
+//
+//        A  B  C                               A  B  C
+//       / \/ \/ \                              |  |  |
+//      /__/\ /\__\                             |  |  |
+//      D____X____E  (A+B+C->D & A+B+C->E)  ->  par(D,E)
+//       \       /                                 |
+//        '--F--'                                  F
+//
+// 3. We find a node with >1 predecessor R, and group those predecessors by
+//    their immediate dominators.  There are two subcases from here.
+//
+// 3a. If there is more than one group of predecessors (more than one
+//     dominator), we pick the dominator with the highest topological-sort
+//     level, and we clone the subgraph from that dominator to R.
+//
+// 3b. If all predecessors share a single dominator, we instead pick the
+//     predecessor with the highest incoming/outgoing edge count, and we clone
+//     the subgraph from that predecessor to the dominator.
+//
+// For details of how the subgraph cloning works, see the commentary for
+// __kmp_taskgraph_clone_subgraph.
+//
+// In this way, irreducible edges are gradually "teased apart", and the graph
+// thus becomes reducible.
+//
+// Cloning the subgraph means that task nodes can appear more than once in the
+// taskgraph (multiple "instantiations").  The way this should be handled is
+// left to later stages of execution, allowing for runtime or API-specific
+// techniques to be used.
+//
+// Say the resulting graph clones a node N into N1 and N2.  Now:
+//
+//  - All of N1's predecessors and all of N2's predecessors must execute before
+//    either N1 or N2 execute.
+//  - Only N1 or N2 should execute, not both.
+//  - All of N1's, and all of N2's, successors should execute after either N1
+//    or N2 executes.
+//
+// For host execution, this is handled by __kmp_exec_descr_link_instances, etc.
+
+// Performs a single round of rewriting and reports whether the graph changed.
+//
+//   thread      - passed to the runtime allocation helpers.
+//   taskgraph   - supplies the pool of recycled dependency-list cells.
+//   alloc_chain - forwarded to the region allocation/cloning helpers.
+//   region_p    - *region_p is the head of the region worklist; it must be the
+//                 entry region (asserted below).
+//   exitregion  - the exit region.  Newly cloned regions are appended after
+//                 the tail of the chain starting here, and reachability is
+//                 computed by a DFS starting here.
+//
+// Returns true if a redundant predecessor was dropped, regions were combined
+// into a parallel/exclusive region, or a subgraph was cloned.  When redundant
+// predecessors are dropped the function returns immediately, without running
+// the later stages.
+//
+// NOTE(review): alloc_chain is taken by value here, whereas the other
+// functions in this file that receive it take a reference.  Any update the
+// helpers make to it is therefore not visible to the caller -- confirm that
+// this is intended.
+// NOTE(review): *region_p is only read.  The worklist is pruned and re-sorted
+// through the local 'entryregion', so a change of list head would not be
+// seen by the caller -- confirm the head can never change.
+
+static bool __kmp_taskgraph_rewrite_irreducible(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **alloc_chain, kmp_taskgraph_region_t **region_p,
+    kmp_taskgraph_region_t *exitregion) {
+  kmp_taskgraph_region_t *entryregion = *region_p;
+  bool changed = false;
+
+  kmp_int32 worklist_length = 0;
+  for (kmp_taskgraph_region_t *r = entryregion; r; r = r->next) {
+    // Deleted regions stay deleted.  (We could actually remove these from
+    // the worklist here, I think.)
+    if (r->mark == TASKGRAPH_DELETED)
+      continue;
+    r->mark = TASKGRAPH_UNMARKED;
+    worklist_length++;
+  }
+
+#ifdef DEBUG_TASKGRAPH
+  TGDBG("worklist length: %d\n", worklist_length);
+
+  __kmp_taskgraph_region_dot(entryregion, "PredsAndSuccs");
+#endif
+
+  // 'order' receives the regions in DFS order and 'doms' the dominator of
+  // each region (indexed by region timestamp).  Both are released before
+  // leaving the analysis part of this function.
+  kmp_taskgraph_region_t **order =
+      (kmp_taskgraph_region_t **)__kmp_fast_allocate(
+          thread, worklist_length * sizeof(kmp_taskgraph_region_t *));
+  kmp_taskgraph_region_t **doms =
+      (kmp_taskgraph_region_t **)__kmp_fast_allocate(
+          thread, worklist_length * sizeof(kmp_taskgraph_region_t *));
+  memset(doms, 0, worklist_length * sizeof(kmp_taskgraph_region_t *));
+  kmp_int32 cursor = worklist_length;
+  assert(entryregion->type == TASKGRAPH_REGION_ENTRY);
+  __kmp_taskgraph_region_dfs(entryregion, order, cursor, false);
+  assert(cursor == 0);
+  __kmp_taskgraph_region_doms(order, doms, worklist_length, false);
+
+#ifdef DEBUG_TASKGRAPH
+  fprintf(stderr, "digraph {\n");
+  for (kmp_int32 i = 0; i < worklist_length; i++) {
+    kmp_taskgraph_region_t *b = order[i];
+    for (kmp_taskgraph_region_dep_t *succ = b->successors; succ;
+         succ = succ->next) {
+      fprintf(stderr, "  \"%d\" -> \"%d\"\n", b->timestamp,
+              succ->region->timestamp);
+    }
+    fprintf(stderr, "  \"%d\" -> \"%d\" [color=green, constraint=false]\n",
+            b->timestamp, doms[b->timestamp]->timestamp);
+  }
+  fprintf(stderr, "}\n");
+#endif
+
+  // Irreducible regions are handled by duplicating regions, and those new
+  // regions need adding to the worklist.  The added_worklist variable stores
+  // the head of the new work to be added.
+  kmp_taskgraph_region_t *added_worklist = nullptr;
+  kmp_taskgraph_region_t **added_worklist_p = &added_worklist;
+
+  bool dropped_preds_p = false;
+
+  // Strategy 1: drop a predecessor edge when that predecessor appears on the
+  // dominator chain of one of the region's other predecessors.
+  for (kmp_int32 i = 0; i < worklist_length; i++) {
+    kmp_taskgraph_region_t *region = order[i];
+    if (__kmp_region_deplist_len(region->predecessors) < 2)
+      continue;
+    TGDBG("checking region %p for redundant predecessors\n", region);
+    kmp_taskgraph_region_dep_t **predp = &region->predecessors;
+    while (*predp) {
+      kmp_taskgraph_region_dep_t *pred = *predp;
+
+      bool passes_pred = false;
+      for (kmp_taskgraph_region_dep_t *rest = region->predecessors; rest;
+           rest = rest->next) {
+        if (rest->region == pred->region)
+          continue;
+        kmp_taskgraph_region_t *dom = doms[rest->region->timestamp];
+        TGDBG("pred region: %p, next: %p\n", pred->region, rest->region);
+        // Walk up the dominator chain until we either meet 'pred' or reach a
+        // region that is its own dominator (the top of the chain).
+        while (true) {
+          TGDBG("check against dom: %p\n", dom);
+          if (dom == pred->region) {
+            passes_pred = true;
+            break;
+          } else if (dom == doms[dom->timestamp]) {
+            break;
+          } else {
+            dom = doms[dom->timestamp];
+          }
+        }
+        if (passes_pred)
+          break;
+      }
+
+      if (passes_pred) {
+        // We can drop this predecessor.
+        TGDBG("dropping pred %p from region %p, dom %p\n", pred->region, region,
+              doms[pred->region->timestamp]);
+        kmp_taskgraph_region_dep_t *next = pred->next;
+        // Remove the matching successor edge(s) from the predecessor too.
+        kmp_taskgraph_region_dep_t **succp = &pred->region->successors;
+        while (*succp) {
+          kmp_taskgraph_region_dep_t *succ = *succp;
+          if (succ->region == region) {
+            kmp_taskgraph_region_dep_t *nexts = succ->next;
+            __kmp_region_dep_recycle(&taskgraph->recycled_deps, succ);
+            *succp = nexts;
+          } else {
+            succp = &succ->next;
+          }
+        }
+        __kmp_region_dep_recycle(&taskgraph->recycled_deps, pred);
+        *predp = next;
+        dropped_preds_p = true;
+      } else {
+        predp = &pred->next;
+      }
+    }
+  }
+
+  if (dropped_preds_p) {
+    // Release the analysis tables before the early exit so they do not leak.
+    __kmp_fast_free(thread, order);
+    __kmp_fast_free(thread, doms);
+    return true;
+  }
+
+  // Lazily-built per-region predecessor/successor sets, used by strategy 2.
+  kmp_bitset_t **pred_bitsets = nullptr;
+  kmp_bitset_t **succ_bitsets = nullptr;
+
+  bool regions_combined_p = false;
+
+  for (kmp_int32 i = 0; i < worklist_length; i++) {
+    kmp_taskgraph_region_t *region = order[i];
+    struct {
+      kmp_taskgraph_region_t *dom;
+      kmp_int32 count;
+    } dom_groups[worklist_length];
+    kmp_int32 num_groups = 0;
+    kmp_int32 npreds = __kmp_region_deplist_len(region->predecessors);
+    if (npreds >= 2) {
+      // Group this region's predecessors by their dominator.
+      kmp_taskgraph_region_dep_t *pred;
+      for (pred = region->predecessors; pred; pred = pred->next) {
+        kmp_taskgraph_region_t *pred_region = pred->region;
+        kmp_taskgraph_region_t *this_dom = doms[pred_region->timestamp];
+#ifdef DEBUG_TASKGRAPH
+        kmp_int32 edges_to_dom =
+            __kmp_taskgraph_count_edges_to_dominator(pred_region, this_dom);
+        TGDBG("this pred: %p, edges_to_dom=%d\n", pred_region, edges_to_dom);
+#endif
+        bool found = false;
+        for (kmp_int32 grp = 0; grp < num_groups; grp++) {
+          if (dom_groups[grp].dom == this_dom) {
+            dom_groups[grp].count++;
+            found = true;
+            break;
+          }
+        }
+        if (!found) {
+          dom_groups[num_groups].dom = this_dom;
+          dom_groups[num_groups].count = 1;
+          num_groups++;
+        }
+      }
+
+      // Strategy 2: look for other regions with exactly the same predecessor
+      // and successor sets and wrap them all in one parallel/exclusive region.
+      if (num_groups == 1 && region->mark != TASKGRAPH_COMBINED) {
+        TGDBG("region %p: all predecessors have a single dominator\n", region);
+
+        if (!pred_bitsets) {
+          pred_bitsets = (kmp_bitset_t **)__kmp_fast_allocate(
+              thread, sizeof(kmp_bitset_t *) * worklist_length);
+          succ_bitsets = (kmp_bitset_t **)__kmp_fast_allocate(
+              thread, sizeof(kmp_bitset_t *) * worklist_length);
+
+          for (kmp_int32 k = 0; k < worklist_length; k++) {
+            pred_bitsets[k] = __kmp_bitset_alloc(thread, worklist_length);
+            succ_bitsets[k] = __kmp_bitset_alloc(thread, worklist_length);
+          }
+
+          // NOTE(review): the sets are filled by region timestamp here but
+          // compared by 'order' index below; this presumably relies on
+          // order[n]->timestamp == n -- confirm against the DFS helper.
+          for (kmp_int32 j = 0; j < worklist_length; j++) {
+            kmp_taskgraph_region_t *reg = order[j];
+
+            for (pred = reg->predecessors; pred; pred = pred->next) {
+              __kmp_bitset_set(pred_bitsets[reg->timestamp],
+                               pred->region->timestamp);
+            }
+
+            for (kmp_taskgraph_region_dep_t *succ = reg->successors; succ;
+                 succ = succ->next) {
+              __kmp_bitset_set(succ_bitsets[reg->timestamp],
+                               succ->region->timestamp);
+            }
+          }
+        }
+
+        kmp_taskgraph_region_dep_t *equal_deps_chain = nullptr;
+
+        kmp_int32 same_preds_and_succs = 1;
+        bool any_mutex_p = __kmp_taskgraph_region_mutex_p(region);
+        // FIXME: We might be able to do a bit better than this by hashing.
+        for (kmp_int32 j = i + 1; j < worklist_length; j++) {
+          if (order[j]->mark != TASKGRAPH_COMBINED &&
+              __kmp_bitset_equal(pred_bitsets[j], pred_bitsets[i]) &&
+              __kmp_bitset_equal(succ_bitsets[j], succ_bitsets[i])) {
+            TGDBG("regions %p and %p share all predecessors/successors\n",
+                  order[i], order[j]);
+            same_preds_and_succs++;
+            equal_deps_chain = __kmp_region_deplist_add(
+                thread, &taskgraph->recycled_deps, order[j], equal_deps_chain);
+            if (__kmp_taskgraph_region_mutex_p(order[j]))
+              any_mutex_p = true;
+          }
+        }
+        if (same_preds_and_succs > 1) {
+          kmp_taskgraph_region_type region_type =
+              any_mutex_p ? TASKGRAPH_REGION_EXCLUSIVE
+                          : TASKGRAPH_REGION_PARALLEL;
+          kmp_taskgraph_region_t *par_region = __kmp_taskgraph_region_alloc(
+              thread, taskgraph, alloc_chain, region_type, same_preds_and_succs,
+              nullptr);
+          par_region->inner.children[0] = region;
+          region->mark = TASKGRAPH_COMBINED;
+          region->parent = par_region;
+          for (kmp_int32 j = 1; j < same_preds_and_succs; j++) {
+            kmp_taskgraph_region_dep_t *next = equal_deps_chain->next;
+            par_region->inner.children[j] = equal_deps_chain->region;
+            equal_deps_chain->region->mark = TASKGRAPH_COMBINED;
+            equal_deps_chain->region->parent = par_region;
+            __kmp_region_dep_recycle(&taskgraph->recycled_deps,
+                                     equal_deps_chain);
+            equal_deps_chain = next;
+          }
+          // The new region takes over the first child's edge lists.
+          par_region->predecessors =
+              par_region->inner.children[0]->predecessors;
+          par_region->inner.children[0]->predecessors = nullptr;
+          par_region->successors = par_region->inner.children[0]->successors;
+          par_region->inner.children[0]->successors = nullptr;
+
+          // Redirect incoming deps to point to new parallel region.
+          for (pred = par_region->predecessors; pred; pred = pred->next) {
+            kmp_taskgraph_region_t *pred_region = pred->region;
+            kmp_taskgraph_region_dep_t **succp = &pred_region->successors;
+            while (*succp) {
+              kmp_taskgraph_region_dep_t *succ = *succp;
+              if (succ->region == par_region->inner.children[0]) {
+                succ->region = par_region;
+                succp = &succ->next;
+              } else {
+                // Edges to the other children are now redundant: drop them.
+                bool found = false;
+                for (kmp_int32 j = 1; j < same_preds_and_succs; j++) {
+                  if (succ->region == par_region->inner.children[j]) {
+                    found = true;
+                    break;
+                  }
+                }
+                if (found) {
+                  kmp_taskgraph_region_dep_t *next = succ->next;
+                  __kmp_region_dep_recycle(&taskgraph->recycled_deps, succ);
+                  *succp = next;
+                } else {
+                  succp = &succ->next;
+                }
+              }
+            }
+          }
+
+          // Likewise redirect the outgoing deps.
+          for (kmp_taskgraph_region_dep_t *succ = par_region->successors; succ;
+               succ = succ->next) {
+            kmp_taskgraph_region_t *succ_region = succ->region;
+            kmp_taskgraph_region_dep_t **predp = &succ_region->predecessors;
+            while (*predp) {
+              kmp_taskgraph_region_dep_t *pred = *predp;
+              if (pred->region == par_region->inner.children[0]) {
+                pred->region = par_region;
+                predp = &pred->next;
+              } else {
+                bool found = false;
+                for (kmp_int32 j = 1; j < same_preds_and_succs; j++) {
+                  if (pred->region == par_region->inner.children[j]) {
+                    found = true;
+                    break;
+                  }
+                }
+                if (found) {
+                  kmp_taskgraph_region_dep_t *next = pred->next;
+                  __kmp_region_dep_recycle(&taskgraph->recycled_deps, pred);
+                  *predp = next;
+                } else {
+                  predp = &pred->next;
+                }
+              }
+            }
+          }
+
+          // Insert the new region into the worklist right after 'region'.
+          par_region->next = region->next;
+          region->next = par_region;
+
+          regions_combined_p = true;
+        }
+      }
+
+      // Once any regions have been combined in this call, no subgraph cloning
+      // is attempted for this or any later region.
+      if (regions_combined_p)
+        continue;
+
+      assert(num_groups >= 1);
+
+      // Strategy 3: clone a subgraph.
+      TGDBG("should split region %p (%d)\n", region, region->timestamp);
+      TGDBG("clone graph to dominator: %p (%d, %s)\n", doms[region->timestamp],
+            doms[region->timestamp]->timestamp,
+            __kmp_taskgraph_region_type_name(doms[region->timestamp]->type));
+      kmp_taskgraph_region_t *region_dom = doms[region->timestamp];
+      kmp_int32 grp = -1;
+      kmp_int32 highest_dom = -1;
+      // Choose a dominator.  We pick one with the highest level, i.e.
+      // with the largest chain of dependents.  Anything we pick should
+      // be irreducible, because we've already tried the serial-parallel
+      // decomposition.
+      for (kmp_int32 j = 0; j < num_groups; j++) {
+        if (dom_groups[j].dom->level > highest_dom) {
+          grp = j;
+          highest_dom = dom_groups[j].dom->level;
+        }
+      }
+
+      // Separate out the predecessors with this dominator (identified by
+      // grp).
+      kmp_taskgraph_region_dep_t *preds_with_dom = nullptr;
+      kmp_taskgraph_region_dep_t **pwd_tail = &preds_with_dom;
+      kmp_taskgraph_region_dep_t **pred_cursor = &region->predecessors;
+      TGDBG("before splitting we have %d preds\n",
+            __kmp_region_deplist_len(region->predecessors));
+      while (*pred_cursor) {
+        kmp_taskgraph_region_dep_t *this_pred = *pred_cursor;
+        kmp_taskgraph_region_t *dom = doms[this_pred->region->timestamp];
+        if (dom == dom_groups[grp].dom) {
+          *pwd_tail = this_pred;
+          pwd_tail = &this_pred->next;
+          *pred_cursor = this_pred->next;
+        } else {
+          pred_cursor = &this_pred->next;
+        }
+      }
+      // Finish list.
+      *pwd_tail = nullptr;
+
+      if (!region->predecessors) {
+        // Every predecessor was moved to preds_with_dom.  Put them back and
+        // split off just one of them instead.
+        kmp_int32 highest = -1;
+        kmp_taskgraph_region_dep_t **use_pred = nullptr;
+        // This can only happen if...
+        assert(num_groups == 1);
+        region->predecessors = preds_with_dom;
+        for (kmp_taskgraph_region_dep_t **rp = &region->predecessors; *rp;
+             rp = &(*rp)->next) {
+          kmp_int32 count = __kmp_taskgraph_count_edges_to_dominator(
+              (*rp)->region, dom_groups[grp].dom);
+          TGDBG("for pred %p, outgoing edges to dom = %d\n", (*rp)->region,
+                count);
+          if (count > highest) {
+            highest = count;
+            use_pred = rp;
+          }
+        }
+        TGDBG("using pred %p\n", (*use_pred)->region);
+        // Pick the single predecessor with the largest outgoing edge
+        // count (the "most complicated" predecessor).
+        preds_with_dom = *use_pred;
+        *use_pred = (*use_pred)->next;
+        preds_with_dom->next = nullptr;
+      }
+
+      kmp_taskgraph_region_dep_t *unlinked_successors = nullptr;
+
+      // Unlink successors for preds_with_dom nodes, and record where they
+      // came from.
+      for (pred = preds_with_dom; pred; pred = pred->next) {
+        kmp_taskgraph_region_dep_t **succp = &pred->region->successors;
+        while (*succp) {
+          kmp_taskgraph_region_dep_t *succ = *succp;
+          kmp_taskgraph_region_t *succ_region = succ->region;
+          if (succ_region == region) {
+            kmp_taskgraph_region_dep_t *next = succ->next;
+            __kmp_region_dep_recycle(&taskgraph->recycled_deps, succ);
+            TGDBG("unlinking successor %p -> %p\n", pred->region, region);
+            unlinked_successors =
+                __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
+                                         pred->region, unlinked_successors);
+            *succp = next;
+          } else {
+            succp = &succ->next;
+          }
+        }
+      }
+
+      TGDBG("after splitting, # preds_with_dom=%d, others %d\n",
+            __kmp_region_deplist_len(preds_with_dom),
+            __kmp_region_deplist_len(region->predecessors));
+      *pwd_tail = nullptr;
+      // Maps the timestamp of an original region to its clone (if any).
+      kmp_taskgraph_region_t *cloned_nodes[worklist_length];
+      memset(cloned_nodes, 0,
+             sizeof(kmp_taskgraph_region_t *) * worklist_length);
+      __kmp_taskgraph_clone_subgraph(thread, taskgraph, alloc_chain,
+                                     cloned_nodes, region, doms, preds_with_dom,
+                                     region_dom, &added_worklist_p);
+      // Now fill in the successors for the cloned regions.
+      for (kmp_int32 n = 0; n < worklist_length; n++) {
+        kmp_taskgraph_region_t *cloned_region = cloned_nodes[n];
+        if (!cloned_region)
+          continue;
+        for (kmp_taskgraph_region_dep_t *pred = cloned_region->predecessors;
+             pred; pred = pred->next) {
+          kmp_taskgraph_region_t *pred_region = pred->region;
+          pred_region->successors =
+              __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
+                                       cloned_region, pred_region->successors);
+        }
+      }
+
+#ifdef DEBUG_TASKGRAPH
+      TGDBG("before appending:\n");
+      for (pred = region->predecessors; pred; pred = pred->next) {
+        TGDBG("region %p, pred: %p\n", region, pred);
+      }
+#endif
+
+      // Re-attach redirected predecessor list to region's predecessors.
+      pred = region->predecessors;
+      if (pred) {
+        while (pred && pred->next)
+          pred = pred->next;
+        pred->next = preds_with_dom;
+      } else {
+        region->predecessors = preds_with_dom;
+      }
+
+#ifdef DEBUG_TASKGRAPH
+      TGDBG("after appending:\n");
+      for (pred = region->predecessors; pred; pred = pred->next) {
+        TGDBG("region %p, pred: %p\n", region, pred);
+      }
+#endif
+
+      // Redirect the unlinked successors from the region's original
+      // predecessors so that the new (cloned) predecessors still point to
+      // the region.
+      for (kmp_taskgraph_region_dep_t *succ = unlinked_successors; succ;) {
+        kmp_taskgraph_region_t *cloned_reg =
+            cloned_nodes[succ->region->timestamp];
+        kmp_taskgraph_region_dep_t *next = succ->next;
+        __kmp_region_dep_recycle(&taskgraph->recycled_deps, succ);
+        TGDBG("add successor to cloned region: %p -> %p\n", cloned_reg, region);
+        cloned_reg->successors = __kmp_region_deplist_add(
+            thread, &taskgraph->recycled_deps, region, cloned_reg->successors);
+        succ = next;
+      }
+
+      // Cloning subgraph invalidates e.g. the timestamp fields: just do
+      // one round of transformation.  We could possibly do more if we
+      // were careful.
+
+      changed = true;
+    }
+    if (changed)
+      break;
+  }
+
+  if (regions_combined_p)
+    changed = true;
+
+  // The DFS order and dominator tables are not used past this point.
+  __kmp_fast_free(thread, order);
+  __kmp_fast_free(thread, doms);
+
+  if (pred_bitsets) {
+    for (kmp_int32 j = 0; j < worklist_length; j++) {
+      __kmp_bitset_free(thread, pred_bitsets[j]);
+      __kmp_bitset_free(thread, succ_bitsets[j]);
+    }
+    __kmp_fast_free(thread, pred_bitsets);
+    __kmp_fast_free(thread, succ_bitsets);
+  }
+
+  // Terminate the list of newly-cloned regions and splice it onto the end of
+  // the chain that starts at the exit region.
+  *added_worklist_p = nullptr;
+  added_worklist = __kmp_region_worklist_reverse(added_worklist);
+
+  kmp_taskgraph_region_t *last = exitregion;
+  while (last && last->next)
+    last = last->next;
+  last->next = added_worklist;
+
+  TGDBG("starting trim dead edges...\n");
+
+  for (kmp_taskgraph_region_t *r = entryregion; r; r = r->next) {
+    r->mark = TASKGRAPH_UNMARKED;
+  }
+
+  // Remove any regions which are now unreachable by DFS from the exit
+  // region, and any connected dependency edges.
+  int idx = 0;
+  __kmp_taskgraph_region_dfs(exitregion, nullptr, idx, true);
+  for (kmp_taskgraph_region_t *r = entryregion; r; r = r->next) {
+    if (r->mark == TASKGRAPH_UNMARKED) {
+      r->mark = TASKGRAPH_DELETED;
+
+      __kmp_region_deplist_recycle(&taskgraph->recycled_deps, r->successors);
+      r->successors = nullptr;
+
+      // Delete predecessors for deleted nodes (and corresponding
+      // successors).
+      kmp_taskgraph_region_dep_t **predp = &r->predecessors;
+      while (*predp) {
+        kmp_taskgraph_region_dep_t *pred = *predp;
+        if (pred->region->mark != TASKGRAPH_UNMARKED) {
+          kmp_taskgraph_region_dep_t **succp = &pred->region->successors;
+          while (*succp) {
+            kmp_taskgraph_region_dep_t *succ = *succp;
+            if (succ->region == r) {
+              kmp_taskgraph_region_dep_t *next = succ->next;
+              __kmp_region_dep_recycle(&taskgraph->recycled_deps, succ);
+              *succp = next;
+            } else {
+              succp = &succ->next;
+            }
+          }
+        }
+        kmp_taskgraph_region_dep_t *next = pred->next;
+        __kmp_region_dep_recycle(&taskgraph->recycled_deps, pred);
+        *predp = next;
+      }
+    }
+  }
+
+  TGDBG("done trimming dead edges.\n");
+
+  __kmp_taskgraph_region_chain_prune(&entryregion);
+  __kmp_taskgraph_region_worklist_check(thread, taskgraph, entryregion,
+                                        "after irreducible handling");
+
+  worklist_length = 0;
+  for (kmp_taskgraph_region_t *r = entryregion; r; r = r->next) {
+    r->mark = TASKGRAPH_UNMARKED;
+    worklist_length++;
+  }
+
+  // Recalculate topological sort
+  kmp_int32 max_level = -1;
+  kmp_taskgraph_region_t *r = entryregion;
+  kmp_int32 outidx = 0;
+  kmp_taskgraph_region_t *order_out[worklist_length];
+  for (kmp_int32 i = 0; i < worklist_length; i++, r = r->next) {
+    if (r->mark == TASKGRAPH_UNMARKED) {
+      kmp_int32 level =
+          __kmp_taskgraph_topological_order(r, order_out, &outidx);
+      max_level = level > max_level ? level : max_level;
+    }
+  }
+
+  // Re-sort worklist wrt. topological order calculated above.
+  kmp_taskgraph_region_t **relink = &entryregion;
+  for (kmp_int32 i = 0; i < worklist_length; i++) {
+    *relink = order_out[i];
+    relink = &order_out[i]->next;
+  }
+  *relink = nullptr;
+
+#ifdef DEBUG_TASKGRAPH
+  __kmp_taskgraph_region_dot(entryregion, "PredsAndSuccsAfter");
+#endif
+
+  return changed;
+}
+
+/// Build a nested region structure out of a recorded taskgraph.
+//
+// The algorithm proceeds by alternating two phases until a single top-level
+// node is reached.  Briefly, and glossing over some details:
+//
+// 1. Serial-parallel decomposition.  Chains of single-successor,
+//    single-predecessor nodes are collapsed into a "sequential" region, and
+//    nodes with >1 predecessor, where each predecessor has a single
+//    predecessor and a single successor, are collapsed into "parallel" regions.
+//
+// 2. Irreducible-graph processing.  Several techniques are used to turn graphs
+//    not handled by step (1) into graphs that can be handled by that step.
+//
+// Notably, simple graphs that can be handled entirely by step (1) avoid doing
+// much of the heavier processing involved in step (2), so the common case
+// should be relatively fast.
+
+// Returns the single region left between the entry and exit regions once the
+// graph has been fully collapsed.  If the worklist head is not an entry
+// region, or the irreducible-graph rewrite makes no progress, a FIXME
+// diagnostic is printed to stderr and the current worklist head is returned
+// instead.
+
+static kmp_taskgraph_region_t *__kmp_taskgraph_build_regions(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t *entryregion,
+    kmp_taskgraph_region_t *exitregion) {
+  kmp_int32 phase = 0;
+
+#ifdef DEBUG_TASKGRAPH
+  __kmp_taskgraph_region_dot(entryregion, "InitialPredsAndSuccs");
+#endif
+
+  __kmp_taskgraph_region_chain_clear_marks(entryregion);
+
+  for (;;) {
+    // Step 1: repeat sequential and parallel collapsing until neither pass
+    // makes any further progress.
+    bool progress;
+    do {
+      progress = false;
+
+      TGDBG("starting seq pass\n");
+      for (kmp_taskgraph_region_t **link = &entryregion; *link;
+           link = &(*link)->next) {
+        TGDBG("consider %s region: %p\n",
+              __kmp_taskgraph_region_type_name((*link)->type), *link);
+        if ((*link)->mark != TASKGRAPH_COMBINED) {
+          if (__kmp_taskgraph_collapse_sequence(thread, taskgraph, alloc_chain,
+                                                link, /*parent=*/nullptr,
+                                                phase))
+            progress = true;
+          TGDBG("changed: %s\n", progress ? "true" : "false");
+        } else {
+          TGDBG("already combined\n");
+        }
+      }
+      ++phase;
+      __kmp_taskgraph_region_chain_prune(&entryregion);
+      __kmp_taskgraph_region_worklist_check(thread, taskgraph, entryregion,
+                                            "after seq collapse");
+
+      TGDBG("starting par/unordered pass\n");
+      for (kmp_taskgraph_region_t **link = &entryregion; *link;
+           link = &(*link)->next) {
+        TGDBG("consider %s region: %p\n",
+              __kmp_taskgraph_region_type_name((*link)->type), *link);
+        if ((*link)->mark != TASKGRAPH_COMBINED) {
+          if (__kmp_taskgraph_collapse_par_exclusive(
+                  thread, taskgraph, alloc_chain, link, /*parent=*/nullptr,
+                  phase))
+            progress = true;
+          TGDBG("changed: %s\n", progress ? "true" : "false");
+        } else {
+          TGDBG("already combined\n");
+        }
+      }
+      ++phase;
+      __kmp_taskgraph_region_chain_prune(&entryregion);
+      __kmp_taskgraph_region_worklist_check(thread, taskgraph, entryregion,
+                                            "after par collapse");
+    } while (progress);
+
+    if (entryregion->type != TASKGRAPH_REGION_ENTRY) {
+      fprintf(stderr, "FIXME: Expected entry region!\n");
+      return entryregion;
+    }
+
+    // Finished when the graph has the shape entry -> X -> exit; X is the
+    // result.
+    if (__kmp_region_deplist_len(entryregion->successors) == 1) {
+      kmp_taskgraph_region_t *sole = entryregion->successors->region;
+      if (__kmp_region_deplist_len(sole->successors) == 1 &&
+          sole->successors->region->type == TASKGRAPH_REGION_EXIT)
+        return sole;
+    }
+
+    // Step 2: the graph is stuck, so try to make it reducible.
+    TGDBG("attempting to collapse irreducible regions\n");
+
+    if (!__kmp_taskgraph_rewrite_irreducible(thread, taskgraph, alloc_chain,
+                                             &entryregion, exitregion)) {
+      fprintf(stderr, "FIXME: Failed to transform irreducible graph\n");
+      return entryregion;
+    }
+  }
+
+  return entryregion;
+}
+
+/// Walk the region tree rooted at \p region and, for every task or wait
+/// region found, bump the instance count of the node it refers to and link
+/// the region into that node's chain of instances.
+//
+// Entry and exit regions are ignored.  Any other region type is treated as a
+// container and its children are visited in order.
+
+static void __kmp_taskgraph_count_nodes(kmp_taskgraph_region_t *region) {
+  if (region->type == TASKGRAPH_REGION_ENTRY ||
+      region->type == TASKGRAPH_REGION_EXIT)
+    return;
+
+  if (region->type != TASKGRAPH_REGION_NODE &&
+      region->type != TASKGRAPH_REGION_WAIT) {
+    kmp_int32 idx = 0;
+    while (idx < region->inner.num_children) {
+      __kmp_taskgraph_count_nodes(region->inner.children[idx]);
+      ++idx;
+    }
+    return;
+  }
+
+  TGDBG("process region %p\n", region);
+  region->task.node->u.resolved.count++;
+
+  kmp_taskgraph_region_t *prev = region->task.node->u.resolved.last_region;
+  TGDBG("last region: %p\n", prev);
+  if (prev) {
+    // Insert this region directly after the most recently seen instance.
+    kmp_taskgraph_region_t *after = prev->task.next_instance;
+    TGDBG("next: %p\n", after);
+    prev->task.next_instance = region;
+    region->task.next_instance = after;
+  }
+
+  // This region is now the most recently seen instance of its node.
+  region->task.node->u.resolved.last_region = region;
+}
+
+/// Recursively compute region->mutexset for the sequential, parallel and
+/// exclusive regions in the tree rooted at \p region.
+//
+//   thread - passed to the bitset allocation helpers.
+//   region - root of the subtree to process.
+//   held   - bitset handed down from the enclosing region.  Its bitsize is
+//            used to size every bitset allocated here.  For task nodes it is
+//            only consulted by debug output.
+//
+// Entry, exit and wait regions are ignored, and a task node's mutexset is not
+// modified.  A sequential region gets the union of its children's mutexsets.
+// A parallel or exclusive region also gets the union of its children's
+// mutexsets, but each child is visited with 'held' replaced by the union of
+// its siblings' mutexsets.
+
+static void __kmp_taskgraph_gather_mutex_sets(kmp_info_t *thread,
+                                              kmp_taskgraph_region_t *region,
+                                              const kmp_bitset_t *held) {
+  switch (region->type) {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+  case TASKGRAPH_REGION_WAIT:
+    return;
+  case TASKGRAPH_REGION_NODE: {
+#ifdef DEBUG_TASKGRAPH
+    if (region->mutexset && __kmp_bitset_subset_p(held, region->mutexset)) {
+      TGDBG("node is mutually exclusive with held: 0x%llx <: 0x%llx\n",
+            (unsigned long long)region->mutexset->bits[0],
+            (unsigned long long)held->bits[0]);
+    }
+#endif
+    return;
+  }
+  case TASKGRAPH_REGION_SEQUENTIAL: {
+    kmp_bitset_t *seq_held = __kmp_bitset_alloc(thread, held->bitsize);
+    __kmp_bitset_clearall(seq_held);
+    for (kmp_int32 child = 0; child < region->inner.num_children; child++) {
+      __kmp_taskgraph_gather_mutex_sets(thread, region->inner.children[child],
+                                        held);
+      if (region->inner.children[child]->mutexset)
+        __kmp_bitset_or(seq_held, seq_held,
+                        region->inner.children[child]->mutexset);
+    }
+    // NOTE(review): if this region is visited more than once (the parallel
+    // case below revisits its children), any bitset already stored here is
+    // overwritten without being freed -- confirm who owns the old value.
+    region->mutexset = seq_held;
+    return;
+  }
+  case TASKGRAPH_REGION_PARALLEL:
+  case TASKGRAPH_REGION_EXCLUSIVE: {
+    kmp_bitset_t *par_held = __kmp_bitset_alloc(thread, held->bitsize);
+    kmp_bitset_t *conflicts = __kmp_bitset_alloc(thread, held->bitsize);
+    while (true) {
+      __kmp_bitset_clearall(par_held);
+      for (kmp_int32 child = 0; child < region->inner.num_children; child++) {
+        // Collect the mutexes used by all of this child's siblings.
+        __kmp_bitset_clearall(conflicts);
+        for (kmp_int32 other = 0; other < region->inner.num_children; other++) {
+          if (other != child) {
+            if (!region->inner.children[other]->mutexset)
+              __kmp_taskgraph_gather_mutex_sets(
+                  thread, region->inner.children[other], held);
+            if (region->inner.children[other]->mutexset)
+              __kmp_bitset_or(conflicts, conflicts,
+                              region->inner.children[other]->mutexset);
+          }
+        }
+        __kmp_taskgraph_gather_mutex_sets(thread, region->inner.children[child],
+                                          conflicts);
+        if (region->inner.children[child]->mutexset)
+          __kmp_bitset_or(par_held, par_held,
+                          region->inner.children[child]->mutexset);
+      }
+      if (!region->mutexset) {
+        // First result for this region: it takes ownership of par_held.  The
+        // two pointers alias from here on, so the next pass recomputes the
+        // set in place and then leaves through the equality test below.
+        region->mutexset = par_held;
+      } else if (__kmp_bitset_equal(region->mutexset, par_held)) {
+        TGDBG("par mutexes stabilized, exiting loop\n");
+        // Release the scratch set unless the region owns it.
+        if (par_held != region->mutexset)
+          __kmp_bitset_free(thread, par_held);
+        break;
+      } else {
+        TGDBG("par mutexes not stable, iterating\n");
+        // par_held stays allocated: it is cleared and reused as the scratch
+        // set on the next pass, so it must not be freed here.
+        __kmp_bitset_copy(region->mutexset, par_held);
+      }
+    }
+    __kmp_bitset_free(thread, conflicts);
+    return;
+  }
+  }
+}
+
+/// qsort() comparator for arrays of region pointers: orders regions by
+/// descending mutex-set population count.  A region without a mutex set
+/// counts as zero.
+static int __kmp_popcount_cmp(const void *a, const void *b) {
+  const kmp_taskgraph_region_t *lhs = *(kmp_taskgraph_region_t **)a;
+  const kmp_taskgraph_region_t *rhs = *(kmp_taskgraph_region_t **)b;
+  const kmp_int32 lhs_bits =
+      lhs->mutexset ? __kmp_bitset_popcount(lhs->mutexset) : 0;
+  const kmp_int32 rhs_bits =
+      rhs->mutexset ? __kmp_bitset_popcount(rhs->mutexset) : 0;
+  // Larger population sorts first: -1 if lhs has more bits, 1 if fewer.
+  return (lhs_bits < rhs_bits) - (lhs_bits > rhs_bits);
+}
+
+/// Find "mutexinoutset" regions that can be represented without explicit
+/// mutexes, i.e. using "TASKGRAPH_REGION_EXCLUSIVE".
+//
+// Walks the region tree rooted at *region_p.  Sequential and parallel regions
+// are simply recursed into.  For an exclusive region, the children are grouped
+// by identical mutex set; if every mutex bit of the region is accounted for by
+// such a group, the region either stays exclusive (a single group) or is
+// replaced, through *region_p, by a new parallel region containing one
+// exclusive region per group.  Otherwise the region is demoted to parallel.
+// The children of an exclusive region are not themselves recursed into here.
+
+static void __kmp_taskgraph_find_exclusive_regions(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t **region_p) {
+  kmp_taskgraph_region_t *region = *region_p;
+  switch (region->type) {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT:
+    break;
+  case TASKGRAPH_REGION_SEQUENTIAL:
+  case TASKGRAPH_REGION_PARALLEL: {
+    // Pass the address of each child slot so a child can be replaced in place.
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      __kmp_taskgraph_find_exclusive_regions(thread, taskgraph, alloc_chain,
+                                             &region->inner.children[c]);
+    }
+    break;
+  }
+  case TASKGRAPH_REGION_EXCLUSIVE: {
+    // Sort children so those with the most mutex bits come first (see
+    // __kmp_popcount_cmp), then reset their marks for the passes below.
+    qsort(region->inner.children, region->inner.num_children,
+          sizeof(kmp_taskgraph_region_t *), __kmp_popcount_cmp);
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      TGDBG("building tree: region mutexset = 0x%llx\n",
+            (unsigned long long)region->inner.children[c]->mutexset
+                ? region->inner.children[c]->mutexset->bits[0]
+                : 0);
+      region->inner.children[c]->mark = TASKGRAPH_UNMARKED;
+    }
+    // conflicts starts as the region's full mutex set; bits are removed as
+    // groups of children are found that account for them.  subsets_cover is
+    // per-candidate scratch space.
+    kmp_bitset_t *conflicts =
+        __kmp_bitset_alloc(thread, region->mutexset->bitsize);
+    kmp_bitset_t *subsets_cover =
+        __kmp_bitset_alloc(thread, region->mutexset->bitsize);
+    __kmp_bitset_copy(conflicts, region->mutexset);
+    bool irregular = false;
+    kmp_int32 combined_children = 0;
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      kmp_bitset_t *candidate = region->inner.children[c]->mutexset;
+      if (__kmp_bitset_empty_p(candidate))
+        continue;
+      __kmp_bitset_clearall(subsets_cover);
+      bool found_subset = false;
+      bool other_overlaps = false;
+      // Classify each later child against the candidate: identical set
+      // (groupable), or partially overlapping set (blocks grouping).
+      for (kmp_int32 d = c + 1; d < region->inner.num_children; d++) {
+        // This could test for a subset in some cases, but that adds
+        // complication for later processing.  Maybe revisit later if it
+        // seems worthwhile.
+        // E.g. if we have deps like this:
+        //
+        // #pragma omp task depend(mutexinoutset: deps[0], deps[1]) { /*a*/ }
+        // #pragma omp task depend(mutexinoutset: deps[0]) { /*b*/ }
+        // #pragma omp task depend(mutexinoutset: deps[1]) { /*c*/ }
+        //
+        // This could be represented as:
+        //
+        // exclusive {
+        //   node: a
+        //   parallel {
+        //     node: b
+        //     node: c
+        //   }
+        // }
+        //
+        // We're not doing that yet though.
+        if (__kmp_bitset_equal(candidate,
+                               region->inner.children[d]->mutexset)) {
+          found_subset = true;
+          __kmp_bitset_or(subsets_cover, subsets_cover,
+                          region->inner.children[d]->mutexset);
+        } else if (__kmp_bitset_intersect_p(
+                       candidate, region->inner.children[d]->mutexset)) {
+          other_overlaps = true;
+          break;
+        }
+      }
+      if (!found_subset || other_overlaps)
+        continue;
+      // NOTE(review): with the equality test above, subsets_cover appears to
+      // always equal candidate at this point, so this check presumably only
+      // matters if the subset test is reinstated.
+      if (!__kmp_bitset_equal(subsets_cover, candidate)) {
+        TGDBG("subsets cover: 0x%llx, candidate: 0x%llx\n",
+              (unsigned long long)subsets_cover->bits[0],
+              (unsigned long long)candidate->bits[0]);
+        irregular = true;
+        break;
+      }
+      // Attach every not-yet-grouped later child with an identical set to the
+      // candidate, and remove the candidate's bits from conflicts.
+      for (kmp_int32 d = c + 1; d < region->inner.num_children; d++) {
+        if (region->inner.children[d]->mutexset_parent)
+          continue;
+        // As above wrt. subsets.
+        if (__kmp_bitset_equal(candidate,
+                               region->inner.children[d]->mutexset)) {
+          TGDBG("set index %d's parent to index %d\n", d, c);
+          region->inner.children[d]->mutexset_parent =
+              region->inner.children[c];
+          combined_children++;
+          __kmp_bitset_and_not(conflicts, conflicts, candidate);
+        }
+      }
+    }
+    TGDBG("irregular: %s\n", irregular ? "true" : "false");
+    TGDBG("final conflicts: 0x%llx\n", (unsigned long long)conflicts->bits[0]);
+    __kmp_bitset_free(thread, subsets_cover);
+    // Default outcome: demote to a parallel region.  This is overridden below
+    // when the grouping accounts for every mutex bit.
+    region->type = TASKGRAPH_REGION_PARALLEL;
+    if (!irregular && __kmp_bitset_empty_p(conflicts)) {
+      TGDBG("transforming exclusive region %p\n", region);
+      TGDBG("orig region children: %d\n", region->inner.num_children);
+      TGDBG("combined children: %d\n", combined_children);
+      if (region->inner.num_children == combined_children + 1) {
+        // All children but one were attached to a parent, i.e. there is a
+        // single group: the region can remain exclusive as-is.
+        region->type = TASKGRAPH_REGION_EXCLUSIVE;
+      } else {
+        // Several groups (and/or ungrouped children): build a replacement
+        // parallel region with one slot per group head or ungrouped child.
+        kmp_taskgraph_region_t *new_par = __kmp_taskgraph_region_alloc(
+            thread, taskgraph, alloc_chain, TASKGRAPH_REGION_PARALLEL,
+            region->inner.num_children - combined_children, nullptr);
+        // First pass, last child to first: turn each group's "child points to
+        // head" links into a ring through mutexset_parent.  Heads are flagged
+        // with TASKGRAPH_TEMP_MARK so they are not treated as children.
+        for (kmp_int32 c = region->inner.num_children - 1; c >= 0; c--) {
+          kmp_taskgraph_region_t *child = region->inner.children[c];
+          // Make mutex set into a circular list.
+          if (child->mutexset_parent && child->mark != TASKGRAPH_TEMP_MARK) {
+            if (!child->mutexset_parent->mutexset_parent) {
+              // child <-> parent
+              child->mutexset_parent->mutexset_parent = child;
+              child->mutexset_parent->mark = TASKGRAPH_TEMP_MARK;
+            } else {
+              kmp_taskgraph_region_t *parent = child->mutexset_parent;
+              child->mutexset_parent = parent->mutexset_parent;
+              parent->mutexset_parent = child;
+              parent->mark = TASKGRAPH_TEMP_MARK;
+            }
+          }
+        }
+        // Second pass: for each ring not yet consumed, create an exclusive
+        // region holding its members; ungrouped children are moved across
+        // unchanged.
+        kmp_int32 idx = 0;
+        for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+          kmp_taskgraph_region_t *child = region->inner.children[c];
+          TGDBG("process child: %p\n", child);
+          if (child->mutexset_parent && child->mark != TASKGRAPH_COMBINED) {
+            // Count the ring members so the new region can be sized.
+            kmp_int32 elems = 0;
+            kmp_taskgraph_region_t *next = child;
+            do {
+              elems++;
+              next = next->mutexset_parent;
+            } while (next != child);
+            TGDBG("make exclusive region with %d children\n", elems);
+            kmp_taskgraph_region_t *excl_region = __kmp_taskgraph_region_alloc(
+                thread, taskgraph, alloc_chain, TASKGRAPH_REGION_EXCLUSIVE,
+                elems, nullptr);
+            kmp_int32 excl_child = 0;
+            next = child;
+            do {
+              excl_region->inner.children[excl_child++] = next;
+              next->mark = TASKGRAPH_COMBINED;
+              next = next->mutexset_parent;
+            } while (next != child);
+            assert(excl_child == excl_region->inner.num_children);
+            new_par->inner.children[idx++] = excl_region;
+          } else if (!child->mutexset_parent) {
+            new_par->inner.children[idx++] = child;
+          }
+        }
+        TGDBG("idx=%d, supposed to be %d\n", idx, new_par->inner.num_children);
+        assert(idx == new_par->inner.num_children);
+        // Splice the replacement into the parent and flag the old region so
+        // that it can be released later.
+        *region_p = new_par;
+        region->mark = TASKGRAPH_DELETED;
+      }
+    }
+    __kmp_bitset_free(thread, conflicts);
+    break;
+  }
+  default:
+    assert(false && "unreachable");
+  }
+}
+
+/// Strip mutex sets from taskgraph region, except those needed at runtime.
+//
+// Only task nodes that are not nested (at any depth) inside an exclusive
+// region keep their mutex set; every other set is freed and cleared.
+// Returns the largest bitsize among the retained sets, or 0 if none remain.
+
+static kmp_int32
+__kmp_taskgraph_strip_mutex_sets(kmp_info_t *thread,
+                                 kmp_taskgraph_region_t *region,
+                                 bool in_exclusive = false) {
+  // Leaf regions are handled up front and return directly.
+  switch (region->type) {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+  case TASKGRAPH_REGION_WAIT:
+    assert(!region->mutexset);
+    return 0;
+  case TASKGRAPH_REGION_NODE: {
+    if (!region->mutexset)
+      return 0;
+    if (!in_exclusive) {
+      // FIXME: This might be pessimistic -- the remaining mutex sets might
+      // have holes or duplicates.  We could compact them.
+      kmp_int32 width = region->mutexset->bitsize;
+      return std::max<kmp_int32>(0, width);
+    }
+    // The enclosing exclusive region already serialises this node.
+    __kmp_bitset_free(thread, region->mutexset);
+    region->mutexset = nullptr;
+    return 0;
+  }
+  default:
+    break;
+  }
+
+  // Interior region: its own set is never needed at runtime.
+  if (region->mutexset) {
+    __kmp_bitset_free(thread, region->mutexset);
+    region->mutexset = nullptr;
+  }
+
+  // Everything below an exclusive region counts as "in exclusive".
+  const bool children_in_exclusive =
+      in_exclusive || region->type == TASKGRAPH_REGION_EXCLUSIVE;
+  kmp_int32 widest = 0;
+  for (kmp_int32 idx = 0; idx < region->inner.num_children; idx++) {
+    kmp_int32 child_width = __kmp_taskgraph_strip_mutex_sets(
+        thread, region->inner.children[idx], children_in_exclusive);
+    widest = std::max(widest, child_width);
+  }
+  return widest;
+}
+
+/// Run the three mutex-set passes over the region tree rooted at *region_p:
+/// gather the sets bottom-up, rewrite exclusive regions where possible, then
+/// strip every set that is not needed at replay time.
+//
+// \p max_mutex is the bit width used for the sets allocated by the gather
+// pass.  *region_p may be replaced with a new root.  On return,
+// taskgraph->num_mutexes holds the value reported by the strip pass.
+static void __kmp_taskgraph_exclusive_regions(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t **region_p,
+    kmp_int32 max_mutex) {
+  // Empty "held" set for the root; the gather pass only reads it.
+  kmp_bitset_t *top = __kmp_bitset_alloc(thread, max_mutex);
+  __kmp_bitset_clearall(top);
+  __kmp_taskgraph_gather_mutex_sets(thread, *region_p, top);
+  // The gather pass keeps no reference to "top", so release it here rather
+  // than leaking it on every taskgraph build.
+  __kmp_bitset_free(thread, top);
+  __kmp_taskgraph_find_exclusive_regions(thread, taskgraph, alloc_chain,
+                                         region_p);
+  kmp_int32 num_mutexes = __kmp_taskgraph_strip_mutex_sets(thread, *region_p);
+  taskgraph->num_mutexes = num_mutexes;
+}
+
+/// Return a short lower-case name for a region type, for use in dump output.
+/// Values not listed here map to "<unknown>".
+static const char *
+__kmp_taskgraph_region_type_name(kmp_taskgraph_region_type type) {
+  const char *name = "<unknown>";
+  switch (type) {
+  case TASKGRAPH_REGION_ENTRY:
+    name = "entry";
+    break;
+  case TASKGRAPH_REGION_EXIT:
+    name = "exit";
+    break;
+  case TASKGRAPH_REGION_NODE:
+    name = "node";
+    break;
+  case TASKGRAPH_REGION_WAIT:
+    name = "wait";
+    break;
+  case TASKGRAPH_REGION_PARALLEL:
+    name = "parallel";
+    break;
+  case TASKGRAPH_REGION_EXCLUSIVE:
+    name = "exclusive";
+    break;
+  case TASKGRAPH_REGION_SEQUENTIAL:
+    name = "sequential";
+    break;
+  case TASKGRAPH_REGION_IRREDUCIBLE:
+    name = "irreducible";
+    break;
+  default:
+    break;
+  }
+  return name;
+}
+
+#if defined(KMP_DEBUG) || defined(DEBUG_TASKGRAPH)
+/// Print the region tree rooted at \p region to \p f, one region per line,
+/// indenting nested regions by two columns per level.
+//
+// Task and wait nodes show the node pointer and, when the node is instantiated
+// more than once, its instance count.  Any region carrying a mutex set gets a
+// " [sets: 0x...]" suffix showing the first word of the set.
+static void __kmp_dump_taskgraph_regions(FILE *f,
+                                         kmp_taskgraph_region_t *region,
+                                         int indent = 0) {
+  const char *type_name = __kmp_taskgraph_region_type_name(region->type);
+
+  if (region->type == TASKGRAPH_REGION_ENTRY ||
+      region->type == TASKGRAPH_REGION_EXIT) {
+    fprintf(f, "%*s%s node\n", indent, "", type_name);
+    return;
+  }
+
+  // Shared by the node and container cases below.  Formatting is bounded by
+  // the buffer size so a change to the format cannot overrun the stack.
+  char set_membership[40] = "";
+  if (region->mutexset)
+    snprintf(set_membership, sizeof(set_membership), " [sets: 0x%llx]",
+             (unsigned long long)region->mutexset->bits[0]);
+
+  switch (region->type) {
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT: {
+    if (region->task.node->u.resolved.count > 1)
+      fprintf(f, "%*s%s: %p (* %d)%s\n", indent, "", type_name,
+              region->task.node, region->task.node->u.resolved.count,
+              set_membership);
+    else
+      fprintf(f, "%*s%s: %p%s\n", indent, "", type_name, region->task.node,
+              set_membership);
+    break;
+  }
+  default: {
+    fprintf(f, "%*s%s%s {\n", indent, "", type_name, set_membership);
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      __kmp_dump_taskgraph_regions(f, region->inner.children[c], indent + 2);
+    }
+    fprintf(f, "%*s}\n", indent, "");
+  }
+  }
+}
+#endif
+
+#ifdef DEBUG_TASKGRAPH
+
+/// Collect, without duplicates, every ancestor (parent, grandparent, ...) of
+/// the \p numregions regions starting at \p region.
+//
+// New entries are added to \p list with __kmp_region_deplist_add, drawing on
+// taskgraph->recycled_deps.  Returns the resulting list.
+static kmp_taskgraph_region_dep_t *
+__kmp_dump_find_parent_regions(kmp_info *thd, kmp_taskgraph_record_t *taskgraph,
+                               kmp_taskgraph_region_t *region, int numregions,
+                               kmp_taskgraph_region_dep_t *list = nullptr) {
+  for (int idx = 0; idx < numregions; idx++) {
+    kmp_taskgraph_region_t *parent = region[idx].parent;
+    if (!parent)
+      continue;
+
+    // Skip parents that have already been recorded.
+    kmp_taskgraph_region_dep_t *seen = list;
+    while (seen && seen->region != parent)
+      seen = seen->next;
+    if (seen)
+      continue;
+
+    // Record the parent, then walk further up from it.
+    list = __kmp_region_deplist_add(thd, &taskgraph->recycled_deps, parent,
+                                    list);
+    list = __kmp_dump_find_parent_regions(thd, taskgraph, parent, 1, list);
+  }
+  return list;
+}
+
+/// Dump the "raw" region array to \p f: one line per region with its type,
+/// child count, parent pointer and successor/predecessor counts, followed
+/// recursively by its children.
+//
+// At the top level (indent == 0) the ancestors of the listed regions are also
+// collected and dumped, since they are not part of the array itself.
+static void __kmp_dump_raw_taskgraph_regions(FILE *f, kmp_info *thd,
+                                             kmp_taskgraph_record_t *taskgraph,
+                                             kmp_taskgraph_region_t *region,
+                                             int numregions, int indent = 0) {
+  for (int r = 0; r < numregions; r++) {
+    // Only container regions have a valid inner.num_children.
+    int children = 0;
+    if (region[r].type == TASKGRAPH_REGION_PARALLEL ||
+        region[r].type == TASKGRAPH_REGION_SEQUENTIAL ||
+        region[r].type == TASKGRAPH_REGION_EXCLUSIVE ||
+        region[r].type == TASKGRAPH_REGION_IRREDUCIBLE)
+      children = region[r].inner.num_children;
+    fprintf(
+        f,
+        "%*sregion %d (%p): %s%s (%d children) parent %p succs %d preds %d\n",
+        indent, "", r, &region[r],
+        __kmp_taskgraph_region_type_name(region[r].type),
+        region[r].mark == TASKGRAPH_COMBINED ? " (combined)" : "", children,
+        region[r].parent, __kmp_region_deplist_len(region[r].successors),
+        __kmp_region_deplist_len(region[r].predecessors));
+    // Recurse into the children of *this* region (region[r], not region[0]).
+    for (int c = 0; c < children; c++)
+      __kmp_dump_raw_taskgraph_regions(f, thd, taskgraph,
+                                       region[r].inner.children[c], 1,
+                                       indent + 2);
+  }
+  if (indent == 0) {
+    kmp_taskgraph_region_dep_t *parentlist =
+        __kmp_dump_find_parent_regions(thd, taskgraph, region, numregions);
+    // Write to the caller's stream, like the rest of the dump.
+    fprintf(f, "%*sfound %d parent region(s):\n", indent, "",
+            __kmp_region_deplist_len(parentlist));
+    for (kmp_taskgraph_region_dep_t *p = parentlist; p; p = p->next) {
+      __kmp_dump_raw_taskgraph_regions(f, thd, taskgraph, p->region, 1,
+                                       indent + 2);
+    }
+    __kmp_region_deplist_recycle(&taskgraph->recycled_deps, parentlist);
+  }
+}
+#endif
+
+/// Build a nested region structure from a "raw" recorded taskgraph, and mark
+/// the taskgraph ready for replay.
+//
+// The input to this function consists of tasks with *data* dependencies
+// between them.  The output of the function is a nested tree structure: the
+// dependencies between tasks implicitly become *control* dependencies.  In
+// the common case, these ought to map straightforwardly to hardware-provided
+// execution primitives (e.g. on a GPU), or to runtime-provided primitives (for
+// the CPU).
+//
+// Here is an example taskgraph:
+//
+// #pragma omp taskgraph
+// {
+//   #pragma omp task depend(out: deps[2])
+//   { }
+//   #pragma omp task depend(out: deps[0], deps[1])
+//   { }
+//   #pragma omp task depend(inout: deps[0])
+//   { }
+//   #pragma omp task depend(inout: deps[1])
+//   { }
+//   #pragma omp task depend(inout: deps[2])
+//   { }
+//   #pragma omp task depend(in: deps[0], deps[1], deps[2])
+//   { }
+// }
+//
+// This dependency graph is "reducible", and the resulting tree looks like this:
+//
+// sequential {
+//   parallel {
+//     sequential {
+//       node: 0x588aa11021b0
+//       node: 0x588aa1102250
+//     }
+//     sequential {
+//       node: 0x588aa11021d8
+//       parallel {
+//         node: 0x588aa1102228
+//         node: 0x588aa1102200
+//       }
+//     }
+//   }
+//   node: 0x588aa1102278
+// }
+//
+// Each node represents a task, and the containing parallel and sequential
+// regions represent sub-regions that can be executed in parallel, or
+// one-at-a-time, in order.
+//
+// In some cases, the data-dependency graph may not be trivially reducible to
+// parallel and sequential regions.  In this case, several techniques are used
+// to produce a reducible graph from an irreducible graph (see
+// __kmp_taskgraph_rewrite_irreducible).
+//
+// For example in this graph:
+//
+// #pragma omp taskgraph
+// {
+//   #pragma omp task depend(out: deps[0], deps[1])
+//   { }
+//   #pragma omp task depend(out: deps[2], deps[3])
+//   { }
+//   #pragma omp task depend(inout: deps[0])
+//   { }
+//   #pragma omp task depend(inout: deps[1])
+//   { }
+//   #pragma omp task depend(inout: deps[2])
+//   { }
+//   #pragma omp task depend(inout: deps[3])
+//   { }
+//   #pragma omp task depend(in: deps[0], deps[1], deps[2], deps[3])
+//   { }
+//   #pragma omp task depend(in: deps[1], deps[2])
+//   { }
+// }
+//
+// The final two tasks overlap data dependencies in such a way that the
+// resulting dependency graph cannot be trivially decomposed to parallel and
+// sequential regions.  In this case, the graph is handled by duplicating task
+// nodes so they appear in more than one place in the resulting nested region
+// structure:
+//
+// parallel {
+//   sequential {
+//     parallel {
+//       sequential {
+//         node: 0x61bfca8ecfd8 (* 2)
+//         node: 0x61bfca8ed050 (* 2)
+//       }
+//       sequential {
+//         node: 0x61bfca8ed000 (* 2)
+//         node: 0x61bfca8ed078 (* 2)
+//       }
+//     }
+//     node: 0x61bfca8ed0f0
+//   }
+//   sequential {
+//     parallel {
+//       sequential {
+//         node: 0x61bfca8ed000 (* 2)
+//         parallel {
+//           node: 0x61bfca8ed0a0
+//           node: 0x61bfca8ed078 (* 2)
+//         }
+//       }
+//       sequential {
+//         node: 0x61bfca8ecfd8 (* 2)
+//         parallel {
+//           node: 0x61bfca8ed050 (* 2)
+//           node: 0x61bfca8ed028
+//         }
+//       }
+//     }
+//     node: 0x61bfca8ed0c8
+//   }
+// }
+//
+// The "(* 2)" markers show that the task node appears "instantiated" in that
+// number of places in the graph.  Care must be taken at replay time that all
+// nodes preceding a multiply-instantiated node execute before the node, and
+// that all nodes succeeding each "instantiation point" are executed once the
+// task has executed.
+//
+// The final region type is "exclusive", which arises for "mutexinoutset"
+// dependencies that are able to be abstracted away (we can't do this in all
+// cases: when we can't, we still use explicit mutexes).
+//
+// An example of this:
+//
+// #pragma omp taskgraph
+// {
+//   #pragma omp task depend(mutexinoutset: deps[0])
+//   { }
+//   #pragma omp task depend(mutexinoutset: deps[1])
+//   { }
+//   #pragma omp task depend(mutexinoutset: deps[0])
+//   { }
+//   #pragma omp task depend(mutexinoutset: deps[1])
+//   { }
+//   #pragma omp task depend(mutexinoutset: deps[0])
+//   { }
+//   #pragma omp task depend(mutexinoutset: deps[1])
+//   { }
+//   #pragma omp task depend(mutexinoutset: deps[0])
+//   { }
+//   #pragma omp task depend(mutexinoutset: deps[1])
+//   { }
+// }
+//
+// Results in this structure:
+//
+// parallel {
+//   exclusive {
+//     node: 0x5c0c5c571120
+//     node: 0x5c0c5c5710d0
+//     node: 0x5c0c5c571080
+//     node: 0x5c0c5c571030
+//   }
+//   exclusive {
+//     node: 0x5c0c5c5710f8
+//     node: 0x5c0c5c5710a8
+//     node: 0x5c0c5c571058
+//     node: 0x5c0c5c571008
+//   }
+// }
+//
+// "Exclusive" here means that the child regions (task nodes in this case) may
+// be executed in any order, but never concurrently with one another.  E.g. a
+// GPU implementation could try to dynamically schedule tasks such that they
+// fit instantaneously-available execution resources.
+//
+// In cases where mutexes cannot be abstracted, each affected task node is
+// annotated with a set of mutexes that must be held while executing the task.
+// (Shown with [sets: 0xN] in dump output).
+
+// Steps performed:
+//  1. Resolve the recorded dependences of every task into a temporary
+//     dependence graph (one kmp_depnode_t per task).
+//  2. Mirror that graph into an array of "initial" regions (one per task plus
+//     a virtual entry and exit), adding control-flow edges for waits.
+//  3. Topologically order the regions, build the nested region tree and
+//     post-process mutex sets.
+//  4. Release temporary data and publish the taskgraph as KMP_TDG_READY.
+//
+// Always returns 0.
+//
+// NOTE(review): initial_regions[numnodes - 1] is written unconditionally
+// below, so this appears to assume taskgraph->num_tasks >= 1 -- confirm
+// against callers.
+kmp_int32 __kmp_build_taskgraph(kmp_int32 gtid,
+                                kmp_taskdata_t *current_taskdata,
+                                kmp_taskgraph_record_t *taskgraph) {
+  kmp_int32 numnodes = taskgraph->num_tasks;
+  kmp_int32 numregions = numnodes + 2;
+  kmp_taskgraph_node_t *nodes = taskgraph->record_map;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_dephash_t *hash = __kmp_dephash_create(thread, current_taskdata);
+  bool dep_barrier = false;
+
+  // We need to take special care to align the all_depnodes array to the cache
+  // line size, because kmp_depnode_t is marked as 64-byte aligned and
+  // otherwise the compiler might generate faulting memory accesses based on
+  // that alignment assumption.
+  size_t all_depnodes_size = numregions * sizeof(kmp_depnode_t);
+  // The maximum amount of padding we need is CACHE_LINE - 1 bytes.
+  all_depnodes_size = all_depnodes_size + CACHE_LINE - 1;
+  char *all_depnodes_misaligned =
+      (char *)__kmp_thread_malloc(thread, all_depnodes_size);
+  kmp_depnode_t *all_depnodes =
+      (kmp_depnode_t *)((((intptr_t)all_depnodes_misaligned) + CACHE_LINE - 1) &
+                        ~(CACHE_LINE - 1));
+  kmp_int32 next_mutex_set = 0;
+
+  // Step 1: resolve recorded dependences into depnodes.
+  for (kmp_int32 i = 0; i < numnodes; i++) {
+    int n_mtxs = 0;
+    bool dep_all;
+
+    dep_all = __kmp_filter_aliased_deps(nodes[i].u.unresolved.ndeps,
+                                        nodes[i].u.unresolved.dep_list,
+                                        nodes[i].task, &n_mtxs);
+    kmp_depnode_t *node = &all_depnodes[i];
+    __kmp_init_node(node, /*on_stack=*/false);
+    node->dn.task = nodes[i].task;
+    dep_barrier = !nodes[i].task && nodes[i].taskloop_task;
+    if (!dep_all) {
+      __kmp_process_deps<taskgraph_deps>(
+          gtid, node, &hash, dep_barrier, nodes[i].u.unresolved.ndeps,
+          nodes[i].u.unresolved.dep_list, nodes[i].task, next_mutex_set);
+    } else {
+      __kmp_process_dep_all<taskgraph_deps>(gtid, node, hash, dep_barrier,
+                                            nodes[i].task);
+    }
+  }
+
+  // Scratch array receiving the regions in topological order.  This is
+  // allocated on the heap rather than as a variable-length array: VLAs are not
+  // standard C++, and the size grows with the number of recorded tasks.
+  kmp_taskgraph_region_t **order_out =
+      (kmp_taskgraph_region_t **)__kmp_thread_malloc(
+          thread, sizeof(kmp_taskgraph_region_t *) * numregions);
+  kmp_int32 outidx = 0;
+
+  kmp_taskgraph_region_t *initial_regions =
+      (kmp_taskgraph_region_t *)__kmp_fast_allocate(
+          thread, sizeof(kmp_taskgraph_region_t) * numregions);
+  // FIXME: Something like 'placement new' here?
+  memset(initial_regions, 0, sizeof(kmp_taskgraph_region_t) * numregions);
+
+  kmp_taskgraph_region_t *cfg_barrier = nullptr;
+
+  // Step 2: mirror the depnode graph into the initial region array.
+  for (kmp_int32 i = 0; i < numnodes; i++) {
+    initial_regions[i].type =
+        nodes[i].task ? TASKGRAPH_REGION_NODE : TASKGRAPH_REGION_WAIT;
+    initial_regions[i].task.node = &nodes[i];
+    initial_regions[i].task.next_instance = &initial_regions[i];
+    initial_regions[i].parent = nullptr;
+    if (i < numnodes - 1) {
+      initial_regions[i].next = &initial_regions[i + 1];
+    } else {
+      initial_regions[i].next = nullptr;
+    }
+    kmp_depnode_t *depnode = &all_depnodes[i];
+    initial_regions[i].mutexset = depnode->dn.set_membership;
+    // Depnodes and regions share indices, so a successor depnode's offset in
+    // all_depnodes identifies the corresponding region.
+    for (kmp_depnode_list_t *succ = depnode->dn.successors; succ;
+         succ = succ->next) {
+      kmp_int32 succ_idx = succ->node - all_depnodes;
+      kmp_taskgraph_region_t *tg_succ = &initial_regions[succ_idx];
+      tg_succ->predecessors =
+          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
+                                   &initial_regions[i], tg_succ->predecessors);
+      initial_regions[i].successors =
+          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, tg_succ,
+                                   initial_regions[i].successors);
+    }
+    // Handle control flow dependencies.  If a node (e.g. a taskloop task) has
+    // a wait after it corresponding to the end of an implicit taskgroup, join
+    // the task to the wait.  The wait then becomes a barrier; any tasks after
+    // it will depend on the barrier.
+    if (nodes[i].u.unresolved.cfg_successor != -1) {
+      kmp_int32 cfg_succ = nodes[i].u.unresolved.cfg_successor;
+      initial_regions[i].successors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[cfg_succ],
+          initial_regions[i].successors);
+      initial_regions[cfg_succ].predecessors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[i],
+          initial_regions[cfg_succ].predecessors);
+    }
+    if (nodes[i].taskloop_task && !nodes[i].task) {
+      cfg_barrier = &initial_regions[i];
+    } else if (cfg_barrier) {
+      cfg_barrier->successors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[i],
+          cfg_barrier->successors);
+      initial_regions[i].predecessors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, cfg_barrier,
+          initial_regions[i].predecessors);
+    }
+  }
+
+  __kmp_dephash_free<false>(thread, hash);
+  __kmp_thread_free(thread, all_depnodes_misaligned);
+
+  // We're done with the "unresolved" data now.  Initialise node count.
+  for (kmp_int32 i = 0; i < numnodes; i++) {
+    __kmp_thread_free(thread, nodes[i].u.unresolved.dep_list);
+    nodes[i].u.resolved.last_region = nullptr;
+    nodes[i].u.resolved.count = 0;
+  }
+
+  // Use these indices for the virtual entry and exit regions
+  kmp_int32 entryregion = numnodes, exitregion = numnodes + 1;
+
+  // Set entry/exit node types, and add to worklist
+  initial_regions[entryregion].type = TASKGRAPH_REGION_ENTRY;
+  initial_regions[entryregion].next = &initial_regions[0];
+  initial_regions[exitregion].type = TASKGRAPH_REGION_EXIT;
+  initial_regions[numnodes - 1].next = &initial_regions[exitregion];
+
+  // Join entry and exit nodes up to the graph
+  for (kmp_int32 i = 0; i < numnodes; i++) {
+    kmp_taskgraph_region_t *region = &initial_regions[i];
+    kmp_int32 npreds = __kmp_region_deplist_len(region->predecessors);
+    kmp_int32 nsuccs = __kmp_region_deplist_len(region->successors);
+    if (npreds == 0) {
+      initial_regions[entryregion].successors =
+          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, region,
+                                   initial_regions[entryregion].successors);
+      region->predecessors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[entryregion],
+          region->predecessors);
+    }
+    if (nsuccs == 0) {
+      initial_regions[exitregion].predecessors =
+          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, region,
+                                   initial_regions[exitregion].predecessors);
+      region->successors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[exitregion],
+          region->successors);
+    }
+    region->owner = taskgraph;
+  }
+
+  // Step 3: topological ordering, region tree construction, mutex handling.
+  kmp_int32 max_level = -1;
+
+  for (kmp_int32 i = 0; i < numregions; i++)
+    initial_regions[i].timestamp = i;
+
+  for (kmp_int32 i = 0; i < numregions; i++) {
+    if (initial_regions[i].mark == TASKGRAPH_UNMARKED) {
+      kmp_int32 level = __kmp_taskgraph_topological_order(&initial_regions[i],
+                                                          order_out, &outidx);
+      max_level = level > max_level ? level : max_level;
+    }
+  }
+
+  assert(outidx == numregions);
+
+#ifdef DEBUG_TASKGRAPH
+  fprintf(stderr, "topological order (max level: %d):\n", max_level);
+
+  for (kmp_int32 i = 0; i < outidx; i++) {
+    fprintf(stderr, "node %d (region %p), level %d\n", order_out[i]->timestamp,
+            order_out[i], order_out[i]->level);
+  }
+#endif
+
+  // The ordering array is not referenced past this point.
+  __kmp_thread_free(thread, order_out);
+
+  kmp_taskgraph_region_t **alloc_chain = &initial_regions[0].alloc_chain;
+
+  kmp_taskgraph_region_t *root_region = __kmp_taskgraph_build_regions(
+      thread, taskgraph, alloc_chain, &initial_regions[entryregion],
+      &initial_regions[exitregion]);
+
+  __kmp_taskgraph_count_nodes(root_region);
+
+  __kmp_taskgraph_exclusive_regions(thread, taskgraph, alloc_chain,
+                                    &root_region, next_mutex_set);
+
+  // Terminate the allocation chain.
+  *alloc_chain = nullptr;
+
+  taskgraph->root = root_region;
+  taskgraph->alloc_root = initial_regions;
+
+  // Step 4: release temporary data.
+  // Free dependency lists and deleted regions.
+  kmp_taskgraph_region_t **regp = &taskgraph->alloc_root;
+  while (*regp) {
+    kmp_taskgraph_region_t *reg = *regp;
+    __kmp_region_deplist_free(thread, reg->predecessors);
+    __kmp_region_deplist_free(thread, reg->successors);
+    reg->predecessors = nullptr;
+    reg->successors = nullptr;
+    if (reg->mark == TASKGRAPH_DELETED) {
+      // Unlink the region from the chain before freeing it.
+      kmp_taskgraph_region_t *chain_next = reg->alloc_chain;
+      TGDBG("deleted region from alloc chain: %p\n", reg);
+      __kmp_fast_free(thread, reg);
+      *regp = chain_next;
+    } else {
+      regp = &reg->alloc_chain;
+    }
+  }
+  // Free recycled dep list.  We could pass this along to the next invocation
+  // of this function instead, but we don't do that yet (ownership/thread
+  // safety needs careful consideration if we do that).
+  for (kmp_taskgraph_region_dep_t *dep = taskgraph->recycled_deps; dep;) {
+    kmp_taskgraph_region_dep_t *next = dep->next;
+    TGDBG("free dep from recycled list\n");
+    __kmp_fast_free(thread, dep);
+    dep = next;
+  }
+  taskgraph->recycled_deps = nullptr;
+
+  KG_TRACE(10, ("Processed taskgraph %p (graph_id %" PRIx64 "):\n", taskgraph,
+                taskgraph->graph_id));
+  KG_DUMP(10, __kmp_dump_taskgraph_regions(stderr, root_region));
+
+#ifdef DEBUG_TASKGRAPH
+//__kmp_dump_taskgraph_regions(stderr, root_region);
+//__kmp_dump_raw_taskgraph_regions(stderr, thread, taskgraph,
+//                                 &initial_regions[0], numregions);
+#endif
+
+  // Publish the finished graph with release ordering.
+  KMP_ATOMIC_ST_REL(&taskgraph->status, KMP_TDG_READY);
+
+  return 0;
+}
+#endif
+
+#define NO_DEP_BARRIER (false)
+#define DEP_BARRIER (true)
+
+// returns true if the task has any outstanding dependence
+static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
+                             kmp_task_t *task, kmp_dephash_t **hash,
+                             bool dep_barrier, kmp_int32 ndeps,
+                             kmp_depend_info_t *dep_list,
+                             kmp_int32 ndeps_noalias,
+                             kmp_depend_info_t *noalias_dep_list) {
+  int n_mtxs = 0, dep_all = 0;
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+#endif
+  KA_TRACE(20, ("__kmp_check_deps: T#%d checking dependences for task %p : %d "
+                "possibly aliased dependences, %d non-aliased dependences : "
+                "dep_barrier=%d .\n",
+                gtid, taskdata, ndeps, ndeps_noalias, dep_barrier));
+
+  dep_all = __kmp_filter_aliased_deps(ndeps, dep_list, task, &n_mtxs);
+
   // doesn't need to be atomic as no other thread is going to be accessing this
   // node just yet.
   // npredecessors is set -1 to ensure that none of the releasing tasks queues
@@ -555,14 +3354,17 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
   // used to pack all npredecessors additions into a single atomic operation at
   // the end
   int npredecessors;
+  kmp_int32 next_mutex = 0;
 
   if (!dep_all) { // regular dependences
-    npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier,
-                                             ndeps, dep_list, task);
-    npredecessors += __kmp_process_deps<false>(
-        gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task);
+    npredecessors = __kmp_process_deps<normal_deps>(
+        gtid, node, hash, dep_barrier, ndeps, dep_list, task, next_mutex);
+    npredecessors += __kmp_process_deps<normal_deps>(
+        gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task,
+        next_mutex, false);
   } else { // omp_all_memory dependence
-    npredecessors = __kmp_process_dep_all(gtid, node, *hash, dep_barrier, task);
+    npredecessors = __kmp_process_dep_all<normal_deps>(gtid, node, *hash,
+                                                       dep_barrier, task);
   }
 
   node->dn.task = task;
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index 71e8e69d44593..e7f23745967e0 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -40,6 +40,7 @@ static inline void __kmp_node_deref(kmp_info_t *thread, kmp_depnode_t *node) {
   }
 }
 
+template <bool refcounting>
 static inline void __kmp_depnode_list_free(kmp_info_t *thread,
                                            kmp_depnode_list *list) {
   kmp_depnode_list *next;
@@ -47,7 +48,8 @@ static inline void __kmp_depnode_list_free(kmp_info_t *thread,
   for (; list; list = next) {
     next = list->next;
 
-    __kmp_node_deref(thread, list->node);
+    if (refcounting)
+      __kmp_node_deref(thread, list->node);
 #if USE_FAST_MEMORY
     __kmp_fast_free(thread, list);
 #else
@@ -56,6 +58,7 @@ static inline void __kmp_depnode_list_free(kmp_info_t *thread,
   }
 }
 
+template <bool refcounting>
 static inline void __kmp_dephash_free_entries(kmp_info_t *thread,
                                               kmp_dephash_t *h) {
   for (size_t i = 0; i < h->size; i++) {
@@ -63,12 +66,14 @@ static inline void __kmp_dephash_free_entries(kmp_info_t *thread,
       kmp_dephash_entry_t *next;
       for (kmp_dephash_entry_t *entry = h->buckets[i]; entry; entry = next) {
         next = entry->next_in_bucket;
-        __kmp_depnode_list_free(thread, entry->last_set);
-        __kmp_depnode_list_free(thread, entry->prev_set);
-        __kmp_node_deref(thread, entry->last_out);
-        if (entry->mtx_lock) {
-          __kmp_destroy_lock(entry->mtx_lock);
-          __kmp_free(entry->mtx_lock);
+        __kmp_depnode_list_free<refcounting>(thread, entry->last_set);
+        __kmp_depnode_list_free<refcounting>(thread, entry->prev_set);
+        if (refcounting) {
+          __kmp_node_deref(thread, entry->last_out);
+          if (entry->mtx_lock) {
+            __kmp_destroy_lock(entry->mtx_lock);
+            __kmp_free(entry->mtx_lock);
+          }
         }
 #if USE_FAST_MEMORY
         __kmp_fast_free(thread, entry);
@@ -79,12 +84,14 @@ static inline void __kmp_dephash_free_entries(kmp_info_t *thread,
       h->buckets[i] = 0;
     }
   }
-  __kmp_node_deref(thread, h->last_all);
+  if (refcounting)
+    __kmp_node_deref(thread, h->last_all);
   h->last_all = NULL;
 }
 
+template <bool refcounting>
 static inline void __kmp_dephash_free(kmp_info_t *thread, kmp_dephash_t *h) {
-  __kmp_dephash_free_entries(thread, h);
+  __kmp_dephash_free_entries<refcounting>(thread, h);
 #if USE_FAST_MEMORY
   __kmp_fast_free(thread, h);
 #else
@@ -112,7 +119,7 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
     KA_TRACE(
         40, ("__kmp_release_deps: T#%d freeing dependencies hash of task %p.\n",
              gtid, task));
-    __kmp_dephash_free(thread, task->td_dephash);
+    __kmp_dephash_free<true>(thread, task->td_dephash);
     task->td_dephash = NULL;
   }
 
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 23092927babc1..0a5010da236b1 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -17,6 +17,8 @@
 #include "kmp_wait_release.h"
 #include "kmp_taskdeps.h"
 
+#undef DEBUG_TASKGRAPH
+
 #if OMPT_SUPPORT
 #include "ompt-specific.h"
 #endif
@@ -718,7 +720,7 @@ static void __kmp_free_task_and_ancestors(kmp_int32 gtid,
                            "dephash of implicit task %p\n",
                            gtid, taskdata));
             // cleanup dephash of finished implicit task
-            __kmp_dephash_free_entries(thread, taskdata->td_dephash);
+            __kmp_dephash_free_entries<true>(thread, taskdata->td_dephash);
           }
         }
       }
@@ -753,6 +755,12 @@ static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
   return ret;
 }
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+static bool
+__kmp_taskgraph_exec_descr_finish(kmp_int32 gtid, kmp_info_t *thread,
+                                  kmp_taskgraph_exec_descr_t *descr);
+#endif
+
 // __kmp_task_finish: bookkeeping to do when a task finishes execution
 //
 // gtid: global thread ID for calling thread
@@ -769,6 +777,9 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_task_team_t *task_team =
       thread->th.th_task_team; // might be NULL for serial teams...
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  bool is_taskgraph;
+#endif
 #if KMP_DEBUG
   kmp_int32 children = 0;
 #endif
@@ -778,6 +789,10 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
 
   KMP_DEBUG_ASSERT(taskdata->td_flags.tasktype == TASK_EXPLICIT);
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  is_taskgraph = taskdata->owning_taskgraph;
+#endif
+
   if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) {
     // untied task needs to check the counter so that the task structure is not
     // freed prematurely
@@ -888,6 +903,19 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
     if (ompt)
       __ompt_task_finish(task, resumed_task, ompt_task_complete);
 #endif
+
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    if (is_taskgraph) {
+      __kmp_taskgraph_exec_descr_finish(gtid, thread, taskdata->exec_descr);
+      KMP_ATOMIC_DEC(&taskdata->td_parent->td_incomplete_child_tasks);
+      if (taskdata->td_taskgroup)
+        KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
+      thread->th.th_current_task = resumed_task;
+      resumed_task->td_flags.executing = 1; // resume previous task
+      return;
+    }
+#endif
+
     // TODO: What would be the balance between the conditions in the function
     // and an atomic operation?
     if (__kmp_track_children_task(taskdata)) {
@@ -1106,7 +1134,7 @@ void __kmp_finish_implicit_task(kmp_info_t *thread) {
         KA_TRACE(100, ("__kmp_finish_implicit_task: T#%d cleans "
                        "dephash of implicit task %p\n",
                        thread->th.th_info.ds.ds_gtid, task));
-        __kmp_dephash_free_entries(thread, task->td_dephash);
+        __kmp_dephash_free_entries<true>(thread, task->td_dephash);
       }
     }
   }
@@ -1119,7 +1147,7 @@ void __kmp_finish_implicit_task(kmp_info_t *thread) {
 void __kmp_free_implicit_task(kmp_info_t *thread) {
   kmp_taskdata_t *task = thread->th.th_current_task;
   if (task && task->td_dephash) {
-    __kmp_dephash_free(thread, task->td_dephash);
+    __kmp_dephash_free<true>(thread, task->td_dephash);
     task->td_dephash = NULL;
   }
 }
@@ -1134,7 +1162,7 @@ static size_t __kmp_round_up_to_val(size_t size, size_t val) {
     }
   }
   return size;
-} // __kmp_round_up_to_va
+} // __kmp_round_up_to_val
 
 // __kmp_task_alloc: Allocate the taskdata and task data structures for a task
 //
@@ -1323,6 +1351,10 @@ kmp_task_t *__kmp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
   taskdata->td_flags.executing = 0;
   taskdata->td_flags.complete = 0;
   taskdata->td_flags.freed = 0;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  taskdata->owning_taskgraph = nullptr;
+  taskdata->exec_descr = nullptr;
+#endif
   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
   // start at one because counts current task and children
   KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
@@ -2079,6 +2111,410 @@ kmp_int32 __kmpc_omp_taskyield(ident_t *loc_ref, kmp_int32 gtid, int end_part) {
   return TASK_CURRENT_NOT_QUEUED;
 }
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+static kmp_taskgraph_exec_descr_t *
+__kmp_fill_exec_descr(kmp_int32, kmp_info_t *, kmp_taskgraph_record_t *,
+                      kmp_taskgraph_region_t *, kmp_taskdata_t *,
+                      kmp_taskgraph_exec_descr_t *, kmp_size_t &,
+                      kmp_taskgraph_exec_descr_t **);
+
+static kmp_int32 __kmp_pred_list_length(kmp_taskgraph_exec_descr_t *desc) {
+  kmp_int32 res = 0;
+  for (; desc; desc = desc->predecessor_chain)
+    ++res;
+  return res;
+}
+
+static kmp_taskgraph_exec_descr_t *__kmp_fill_sequential_descr(
+    kmp_int32 gtid, kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, kmp_taskdata_t *parent_taskdata,
+    kmp_taskgraph_exec_descr_t *exec_descrs, kmp_size_t &next_idx,
+    kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
+  assert(region->type == TASKGRAPH_REGION_SEQUENTIAL);
+  kmp_taskgraph_exec_descr_t *first_node = nullptr;
+  for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+    kmp_taskgraph_exec_descr *descr = __kmp_fill_exec_descr(
+        gtid, thread, taskgraph, region->inner.children[c], parent_taskdata,
+        exec_descrs, next_idx, succs_to_fill_p);
+    if (!first_node)
+      first_node = descr;
+  }
+  return first_node;
+}
+
+static kmp_taskgraph_exec_descr_t *__kmp_fill_par_or_excl_descr(
+    kmp_int32 gtid, kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, kmp_taskdata_t *parent_taskdata,
+    kmp_taskgraph_exec_descr_t *exec_descrs, kmp_size_t &next_idx,
+    kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
+  assert(region->type == TASKGRAPH_REGION_PARALLEL ||
+         region->type == TASKGRAPH_REGION_EXCLUSIVE);
+
+  kmp_taskgraph_exec_descr *incoming_preds = *succs_to_fill_p;
+
+  kmp_taskgraph_exec_descr *exec_descr = &exec_descrs[next_idx++];
+  exec_descr->region = region;
+  exec_descr->region->exec_descr = exec_descr;
+  exec_descr->nblocks = 0;
+  exec_descr->npredecessors = __kmp_pred_list_length(incoming_preds);
+  exec_descr->predecessor_chain = nullptr;
+  exec_descr->successor = nullptr;
+  exec_descr->sibling = exec_descr;
+  exec_descr->next_instance = nullptr;
+
+  kmp_taskgraph_exec_descr_t *gathered_succs = nullptr;
+  kmp_taskgraph_exec_descr_t **gathered_succs_p = &gathered_succs;
+
+  kmp_taskgraph_exec_descr_t *sibling_list = nullptr;
+
+  for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+    kmp_taskgraph_exec_descr_t *succs_to_fill = nullptr;
+    kmp_taskgraph_exec_descr_t *head = __kmp_fill_exec_descr(
+        gtid, thread, taskgraph, region->inner.children[c], parent_taskdata,
+        exec_descrs, next_idx, &succs_to_fill);
+    if (!sibling_list) {
+      sibling_list = head;
+      sibling_list->sibling = head;
+    } else {
+      kmp_taskgraph_exec_descr_t *next_sibling = sibling_list->sibling;
+      sibling_list->sibling = head;
+      head->sibling = next_sibling;
+      // Make the head of the sibling list the most recently added node (it
+      // doesn't really matter).
+      sibling_list = head;
+    }
+    while (succs_to_fill) {
+      kmp_taskgraph_exec_descr_t *next = succs_to_fill->predecessor_chain;
+      *gathered_succs_p = succs_to_fill;
+      gathered_succs_p = &succs_to_fill->predecessor_chain;
+      succs_to_fill = next;
+    }
+  }
+
+  // The parallel exec descr points to (any of the members of) the following
+  // circular sibling list.
+  exec_descr->successor = sibling_list;
+
+  // Point every incoming predecessor's successor at this par/excl exec descr.
+  for (; incoming_preds; incoming_preds = incoming_preds->predecessor_chain) {
+    incoming_preds->successor = exec_descr;
+  }
+
+  *succs_to_fill_p = gathered_succs;
+
+  return exec_descr;
+}
+
+static kmp_taskgraph_exec_descr_t *__kmp_fill_exec_descr(
+    kmp_int32 gtid, kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, kmp_taskdata_t *parent_taskdata,
+    kmp_taskgraph_exec_descr_t *exec_descrs, kmp_size_t &next_idx,
+    kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
+  switch (region->type) {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+    break;
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT: {
+    kmp_taskgraph_exec_descr_t *incoming_succs_to_fill = *succs_to_fill_p;
+    kmp_taskgraph_exec_descr_t *exec_descr = &exec_descrs[next_idx++];
+    exec_descr->region = region;
+    exec_descr->region->exec_descr = exec_descr;
+    exec_descr->nblocks = region->task.node->u.resolved.count - 1;
+    exec_descr->npredecessors = __kmp_pred_list_length(incoming_succs_to_fill);
+    exec_descr->sibling = exec_descr;
+    exec_descr->predecessor_chain = nullptr;
+    exec_descr->successor = nullptr;
+    exec_descr->next_instance = nullptr;
+
+    // Edit the taskdata for this specific instantiation.  At present the
+    // task/taskdata structures cannot be used simultaneously by different
+    // threads. We could duplicate the structures to allow simultaneous issue,
+    // but that's not done yet.  The exec_descr can already be thread-local,
+    // in principle, but for now it points to the taskgraph's single copy
+    // of each task/taskdata structure.
+    if (region->type == TASKGRAPH_REGION_NODE) {
+      kmp_task_t *task = exec_descr->region->task.node->task;
+      kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+      taskdata->exec_descr = exec_descr;
+    }
+
+    for (kmp_taskgraph_exec_descr_t *pred = incoming_succs_to_fill; pred;
+         pred = pred->predecessor_chain) {
+      pred->successor = exec_descr;
+    }
+
+    *succs_to_fill_p = exec_descr;
+
+    return exec_descr;
+  }
+  case TASKGRAPH_REGION_SEQUENTIAL:
+    return __kmp_fill_sequential_descr(gtid, thread, taskgraph, region,
+                                       parent_taskdata, exec_descrs, next_idx,
+                                       succs_to_fill_p);
+  case TASKGRAPH_REGION_PARALLEL:
+  case TASKGRAPH_REGION_EXCLUSIVE:
+    return __kmp_fill_par_or_excl_descr(gtid, thread, taskgraph, region,
+                                        parent_taskdata, exec_descrs, next_idx,
+                                        succs_to_fill_p);
+  }
+  return nullptr;
+}
+
+#ifdef DEBUG_TASKGRAPH
+static void __kmp_debug_taskgraph_exec_descr(kmp_taskgraph_exec_descr_t *descrs,
+                                             kmp_size_t count) {
+  fprintf(stderr, "digraph ExecDescr {\n");
+  fprintf(stderr, "  end [shape=diamond]\n");
+  for (kmp_size_t i = 0; i < count; i++) {
+    kmp_taskgraph_exec_descr_t *descr = &descrs[i];
+    fprintf(stderr, "  \"%p\" [label=< <B>", descr->region);
+    switch (descr->region->type) {
+    case TASKGRAPH_REGION_PARALLEL:
+      fprintf(stderr, "par</B> %p<BR/>preds=%d", descr->region,
+              descr->npredecessors.load());
+      break;
+    case TASKGRAPH_REGION_EXCLUSIVE:
+      fprintf(stderr, "excl</B> %p<BR/>preds=%d", descr->region,
+              descr->npredecessors.load());
+      break;
+    case TASKGRAPH_REGION_NODE:
+      if (descr->region->task.node->u.resolved.count > 1) {
+        fprintf(stderr, "task</B> %p<BR/>preds=%d instances=%d",
+                descr->region->task.node, descr->npredecessors.load(),
+                descr->region->task.node->u.resolved.count);
+      } else {
+        fprintf(stderr, "task</B> %p<BR/>preds=%d", descr->region->task.node,
+                descr->npredecessors.load());
+      }
+      break;
+    case TASKGRAPH_REGION_WAIT:
+      if (descr->region->task.node->u.resolved.count > 1) {
+        fprintf(stderr, "wait</B> %p<BR/>preds=%d instances=%d", descr->region,
+                descr->npredecessors.load(),
+                descr->region->task.node->u.resolved.count);
+      } else {
+        fprintf(stderr, "wait</B> %p<BR/>preds=%d", descr->region,
+                descr->npredecessors.load());
+      }
+      break;
+    default:
+      fprintf(stderr, "???</B>");
+    }
+    fprintf(stderr, " >, shape=box]\n");
+
+    if ((descr->region->type == TASKGRAPH_REGION_NODE ||
+         descr->region->type == TASKGRAPH_REGION_WAIT) &&
+        descr->region->task.node->u.resolved.count > 1) {
+      kmp_taskgraph_region_t *region = descr->region;
+      fprintf(
+          stderr,
+          "  \"%p\" -> \"%p\" [style=dotted, color=blue, constraint=false]\n",
+          region, region->task.next_instance);
+    }
+
+    if (descr->successor) {
+      fprintf(stderr, "  \"%p\" -> \"%p\"\n", descr->region,
+              descr->successor->region);
+      if (descr->region->type == TASKGRAPH_REGION_PARALLEL ||
+          descr->region->type == TASKGRAPH_REGION_EXCLUSIVE) {
+        kmp_taskgraph_exec_descr_t *succ = descr->successor;
+        if (succ->sibling != succ) {
+          kmp_taskgraph_exec_descr_t *walk = succ;
+          fprintf(stderr, "  subgraph { rank=same;\n");
+          do {
+            fprintf(stderr, "    \"%p\" -> \"%p\" [color=red]\n", walk->region,
+                    walk->sibling->region);
+            walk = walk->sibling;
+          } while (walk != succ);
+          fprintf(stderr, "  }\n");
+        } else {
+          fprintf(stderr, "*** Expected parallel/exclusive to have >1 tasks\n");
+        }
+      }
+    } else {
+      fprintf(stderr, "  \"%p\" -> end\n", descr->region);
+    }
+  }
+  fprintf(stderr, "}\n");
+}
+#endif
+
+static void __kmp_exec_descr_link_instances(kmp_taskgraph_exec_descr_t *descrs,
+                                            kmp_size_t count) {
+  for (kmp_size_t i = 0; i < count; i++) {
+    kmp_taskgraph_exec_descr_t *descr = &descrs[i];
+    if (descr->region->type == TASKGRAPH_REGION_NODE ||
+        descr->region->type == TASKGRAPH_REGION_WAIT)
+      descr->next_instance = descr->region->task.next_instance->exec_descr;
+  }
+}
+
+/// Reset, reparent and regroup the recorded task TASK and re-invoke it.
+/// NOTE(review): SERIALIZE_IMMEDIATE is not referenced anywhere in the body.
+static void __kmp_omp_tg_task(kmp_int32 gtid, kmp_task_t *task,
+                              kmp_taskgroup_t *taskgroup,
+                              kmp_taskdata_t *parent_taskdata,
+                              bool serialize_immediate) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  taskdata->td_parent = parent_taskdata;
+
+  taskdata->td_flags.complete = 0;
+  taskdata->td_flags.started = 0;
+  taskdata->td_flags.freed = 0;
+  taskdata->td_flags.executing = 0;
+  taskdata->td_flags.task_serial =
+      (parent_taskdata->td_flags.final || taskdata->td_flags.team_serial ||
+       taskdata->td_flags.tasking_ser);
+
+  KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
+  KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
+  // Start at one because counter represents current task and children.
+  KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 1);
+  // NOTE(review): TASKGROUP is dereferenced below without a null check.
+  taskdata->td_taskgroup = taskgroup;
+  KMP_ATOMIC_INC(&taskgroup->count);
+  KMP_ATOMIC_INC(&parent_taskdata->td_incomplete_child_tasks);
+  if (parent_taskdata->td_flags.tasktype == TASK_EXPLICIT)
+    KMP_ATOMIC_INC(&parent_taskdata->td_allocated_child_tasks);
+
+  __kmp_omp_task(gtid, task, false);
+}
+
+struct kmp_taskred_input;
+template <typename T>
+void *__kmp_task_reduction_init(int gtid, int num, T *data);
+
+static void __kmp_taskgraph_exec_descr_start(kmp_int32 gtid, kmp_info_t *thread,
+                                             kmp_taskgraph_exec_descr_t *descr,
+                                             kmp_taskgroup_t *taskgroup) {
+  kmp_int32 npredecessors = KMP_ATOMIC_DEC(&descr->npredecessors) - 1;
+  if (npredecessors > 0)
+    return;
+
+  switch (descr->region->type) {
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT: {
+    kmp_taskgraph_exec_descr_t *lowest_descr = nullptr, *iter = descr;
+    do {
+      if (!lowest_descr || lowest_descr > iter)
+        lowest_descr = iter;
+      iter = iter->next_instance;
+    } while (iter != descr);
+    kmp_int32 nblocks = KMP_ATOMIC_DEC(&lowest_descr->nblocks);
+    if (nblocks <= 0) {
+      if (descr->region->type == TASKGRAPH_REGION_NODE) {
+        kmp_task_t *task = descr->region->task.node->task;
+        kmp_taskdata_t *current_taskdata = thread->th.th_current_task;
+        __kmp_omp_tg_task(gtid, task, taskgroup, current_taskdata, false);
+      } else {
+        // There's no task for a 'taskwait', so start successors immediately.
+        kmp_taskgraph_exec_descr_t *walk = descr;
+        do {
+          if (walk->successor) {
+            __kmp_taskgraph_exec_descr_start(gtid, thread, walk->successor,
+                                             taskgroup);
+          }
+          walk = walk->next_instance;
+        } while (walk != descr);
+      }
+    }
+    break;
+  }
+  case TASKGRAPH_REGION_PARALLEL: {
+    if (descr->region->reduce_input) {
+      // If there are reductions associated with this parallel region, we
+      // start a new taskgroup here.
+      __kmpc_taskgroup(/*loc=*/nullptr, gtid);
+      // Update variable to the newly-created taskgroup.
+      taskgroup = thread->th.th_current_task->td_taskgroup;
+      __kmp_task_reduction_init(
+          gtid, descr->region->reduce_input->reduce_num_data,
+          (struct kmp_taskred_input *)descr->region->reduce_input->reduce_data);
+    }
+    kmp_taskgraph_exec_descr_t *head = descr->successor;
+    kmp_taskgraph_exec_descr_t *item = head;
+    do {
+      __kmp_taskgraph_exec_descr_start(gtid, thread, item, taskgroup);
+      item = item->sibling;
+    } while (item != head);
+    if (descr->region->reduce_input)
+      __kmpc_end_taskgroup(/*loc=*/nullptr, gtid);
+    break;
+  }
+  case TASKGRAPH_REGION_EXCLUSIVE: {
+    kmp_taskgraph_exec_descr_t *head = descr->successor;
+    kmp_taskgraph_exec_descr_t *item = head;
+    do {
+      assert(item->region->type == TASKGRAPH_REGION_NODE);
+      kmp_task_t *task = item->region->task.node->task;
+      kmp_taskdata_t *current_taskdata = thread->th.th_current_task;
+      __kmp_omp_tg_task(gtid, task, taskgroup, current_taskdata, true);
+      item = item->sibling;
+    } while (item != head);
+    break;
+  }
+  default:;
+  }
+}
+
+static bool
+__kmp_taskgraph_exec_descr_finish(kmp_int32 gtid, kmp_info_t *thread,
+                                  kmp_taskgraph_exec_descr_t *descr) {
+  switch (descr->region->type) {
+  case TASKGRAPH_REGION_NODE: {
+    kmp_task_t *task = descr->region->task.node->task;
+    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+    taskdata->td_flags.started = 0;
+    taskdata->td_flags.executing = 0;
+    taskdata->td_flags.complete = 0;
+    taskdata->td_flags.freed = 0;
+    bool any_successors = false;
+    kmp_taskgraph_exec_descr_t *walk = descr;
+    do {
+      if (walk->successor) {
+        any_successors = true;
+        __kmp_taskgraph_exec_descr_start(gtid, thread, walk->successor,
+                                         taskdata->td_taskgroup);
+      }
+      walk = walk->next_instance;
+    } while (walk != descr);
+    return any_successors;
+  }
+  default:
+    fprintf(stderr, "unexpected exec descr type for finish? (%p)\n", descr);
+    exit(1);
+  }
+
+  return false;
+}
+
+static kmp_size_t __kmp_exec_descr_count(kmp_taskgraph_region_t *region) {
+  kmp_size_t sum = 0;
+
+  switch (region->type) {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+    return 0;
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT:
+    return 1;
+  case TASKGRAPH_REGION_PARALLEL:
+  case TASKGRAPH_REGION_EXCLUSIVE:
+    sum++;
+    KMP_FALLTHROUGH();
+  case TASKGRAPH_REGION_SEQUENTIAL:
+    for (kmp_int32 i = 0; i < region->inner.num_children; i++)
+      sum += __kmp_exec_descr_count(region->inner.children[i]);
+    break;
+  default:
+    fprintf(stderr, "unexpected region type\n");
+    exit(1);
+  }
+  return sum;
+}
+#endif
+
 // Task Reduction implementation
 //
 // Note: initial implementation didn't take into account the possibility
@@ -2266,6 +2702,48 @@ void *__kmpc_taskred_init(int gtid, int num, void *data) {
   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
 }
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+static kmp_taskgraph_record_t *
+__kmp_taskgraph_or_parent_recording(kmp_taskgroup_t *taskgroup) {
+  kmp_taskgraph_record_t *rec = nullptr;
+
+  for (; taskgroup; taskgroup = taskgroup->parent) {
+    rec = KMP_ATOMIC_LD_ACQ(&taskgroup->taskgraph.recording);
+    if (rec)
+      return rec;
+  }
+
+  return nullptr;
+}
+
+// While a taskgraph is recording for the current taskgroup or a parent, keep
+// a copy of the reduction input DATA on the taskgroup; then do normal init.
+void *__kmpc_taskgraph_taskred_init(kmp_int32 gtid, kmp_int32 num, void *data) {
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskgroup_t *taskgroup = thread->th.th_current_task->td_taskgroup;
+  kmp_taskgraph_record_t *rec = __kmp_taskgraph_or_parent_recording(taskgroup);
+
+  if (rec) {
+    kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&rec->status);
+    if (status == KMP_TDG_RECORDING) {
+      size_t nbytes = sizeof(kmp_taskred_input_t) * num;
+      kmp_taskgraph_reduce_input_data_t *input_data =
+          (kmp_taskgraph_reduce_input_data_t *)__kmp_fast_allocate(
+              thread, sizeof(kmp_taskgraph_reduce_input_data_t));
+      // The compiler might build the reduction input data on the stack, so
+      // we must make a copy.
+      input_data->reduce_data = __kmp_fast_allocate(thread, nbytes);
+      KMP_MEMCPY(input_data->reduce_data, data, nbytes);
+      input_data->reduce_num_data = num;
+      taskgroup->taskgraph.reduce_input = input_data;
+    } else if (status == KMP_TDG_READY)
+      assert(false &&
+             "unexpected __kmpc_taskgraph_taskred_init with ready taskgraph");
+  }
+  return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
+}
+#endif
+
 // Copy task reduction data (except for shared pointers).
 template <typename T>
 void __kmp_task_reduction_init_copy(kmp_info_t *thr, int num, T *data,
@@ -2302,7 +2780,11 @@ void *__kmpc_task_reduction_get_th_data(int gtid, void *tskgrp, void *data) {
     return data; // nothing to do
 
   kmp_taskgroup_t *tg = (kmp_taskgroup_t *)tskgrp;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  if (tg == NULL || thread->th.th_current_task->owning_taskgraph)
+#else
   if (tg == NULL)
+#endif
     tg = thread->th.th_current_task->td_taskgroup;
   KMP_ASSERT(tg != NULL);
   kmp_taskred_data_t *arr;
@@ -2513,6 +2995,10 @@ void __kmpc_taskgroup(ident_t *loc, int gtid) {
   tg_new->reduce_data = NULL;
   tg_new->reduce_num_data = 0;
   tg_new->gomp_data = NULL;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  tg_new->taskgraph.recording = nullptr;
+  tg_new->taskgraph.reduce_input = nullptr;
+#endif
   taskdata->td_taskgroup = tg_new;
 
 #if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -2668,6 +3154,13 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
       __kmp_task_reduction_fini(thread, taskgroup);
     }
   }
+
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  // This should have been moved to a task node within the group, else it will
+  // leak here.
+  assert(!taskgroup->taskgraph.reduce_input);
+#endif
+
   // Restore parent taskgroup for the current task
   taskdata->td_taskgroup = taskgroup->parent;
   __kmp_thread_free(thread, taskgroup);
@@ -2684,6 +3177,37 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
 #endif
 }
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+// Replay the recorded TASKGRAPH on thread GTID: refill its exec descrs and
+// start the head descr within TASKGROUP.  GRAPH_ID is not referenced here.
+void __kmp_replay_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
+                            kmp_taskgraph_record_t *taskgraph,
+                            kmp_uint32 graph_id, kmp_taskgroup_t *taskgroup) {
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskgraph_exec_descr_t *exec_descrs = taskgraph->exec_descrs;
+  if (!exec_descrs) { // not allocated yet: size it from the region tree
+    kmp_size_t exec_descr_count = __kmp_exec_descr_count(taskgraph->root);
+    exec_descrs = (kmp_taskgraph_exec_descr_t *)__kmp_thread_malloc(
+        thread, exec_descr_count * sizeof(kmp_taskgraph_exec_descr_t));
+    taskgraph->exec_descrs = exec_descrs;
+    taskgraph->exec_descr_size = exec_descr_count;
+  }
+
+  kmp_taskgraph_exec_descr_t *succs_to_fill = nullptr;
+  kmp_size_t next_idx = 0;
+  kmp_taskgraph_exec_descr_t *head = __kmp_fill_exec_descr(
+      gtid, thread, taskgraph, taskgraph->root, current_taskdata, exec_descrs,
+      next_idx, &succs_to_fill);
+  assert(next_idx == taskgraph->exec_descr_size);
+  __kmp_exec_descr_link_instances(exec_descrs, taskgraph->exec_descr_size);
+#ifdef DEBUG_TASKGRAPH
+  __kmp_debug_taskgraph_exec_descr(exec_descrs, taskgraph->exec_descr_size);
+#endif
+  if (head) // an empty region tree yields no head descr to start
+    __kmp_taskgraph_exec_descr_start(gtid, thread, head, taskgroup);
+}
+#endif
+
 static kmp_task_t *__kmp_get_priority_task(kmp_int32 gtid,
                                            kmp_task_team_t *task_team,
                                            kmp_int32 is_constrained) {
@@ -4276,7 +4800,12 @@ void __kmp_fulfill_event(kmp_event_t *event) {
 // taskloop_recur: used only when dealing with taskgraph,
 //      indicating whether we need to update task->td_task_id
 // returns:  a pointer to the allocated kmp_task_t structure (task).
-kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
+kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src
+#if OMP_TASKGRAPH_EXPERIMENTAL
+                                 ,
+                                 bool taskgraph
+#endif
+) {
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
   kmp_taskdata_t *taskdata_src = KMP_TASK_TO_TASKDATA(task_src);
@@ -4322,7 +4851,12 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
 
   // Only need to keep track of child task counts if team parallel and tasking
   // not serialized
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ||
+        taskgraph)) {
+#else
   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser)) {
+#endif
     KMP_ATOMIC_INC(&parent_task->td_incomplete_child_tasks);
     if (parent_task->td_taskgroup)
       KMP_ATOMIC_INC(&parent_task->td_taskgroup->count);
@@ -4455,6 +4989,47 @@ class kmp_taskloop_bounds_t {
   }
 };
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+// Append a node for TASK to REC's record map (growing it as needed) and
+// return it; INDEX_P, if non-null, receives the node's index in the map.
+kmp_taskgraph_node_t *
+__kmp_taskgraph_node_alloc(kmp_taskgraph_record_t *rec, kmp_task_t *task,
+                           kmp_size_t *index_p = nullptr) {
+  kmp_int32 gtid = rec->gtid;
+  kmp_info_t *thread = __kmp_threads[gtid];
+
+  __kmp_acquire_lock(&rec->map_lock, gtid);
+  if (!rec->record_map) {
+    rec->nodes_allocated = 4;
+    rec->record_map = (kmp_taskgraph_node_t *)__kmp_thread_malloc(
+        thread, rec->nodes_allocated * sizeof(kmp_taskgraph_node_t));
+  }
+
+  if (rec->num_tasks >= rec->nodes_allocated) {
+    rec->record_map = (kmp_taskgraph_node_t *)__kmp_thread_realloc(
+        thread, rec->record_map,
+        2 * rec->nodes_allocated * sizeof(kmp_taskgraph_node_t));
+    rec->nodes_allocated *= 2;
+  }
+
+  kmp_taskgraph_node_t *new_task = &rec->record_map[rec->num_tasks];
+  if (index_p)
+    *index_p = rec->num_tasks;
+  ++rec->num_tasks;
+
+  // Fill in the slot before dropping map_lock: a later call can realloc
+  // record_map, which may move the array this pointer refers to.
+  new_task->task = task;
+  new_task->taskloop_task = false;
+  new_task->reduce_input = nullptr;
+  new_task->u.unresolved.ndeps = 0;
+  new_task->u.unresolved.dep_list = nullptr;
+  new_task->u.unresolved.cfg_successor = -1;
+  __kmp_release_lock(&rec->map_lock, gtid);
+  return new_task;
+}
+#endif
+
 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
 //
 // loc        Source location information
@@ -4471,15 +5046,21 @@ class kmp_taskloop_bounds_t {
 // tc         Iterations count
 // task_dup   Tasks duplication routine
 // codeptr_ra Return address for OMPT events
-void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
-                           kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
-                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
-                           kmp_uint64 grainsize, kmp_uint64 extras,
-                           kmp_int64 last_chunk, kmp_uint64 tc,
+static void
+__kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, kmp_uint64 *lb,
+                      kmp_uint64 *ub, kmp_int64 st, kmp_int32 nogroup,
+                      kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                      kmp_uint64 grainsize, kmp_uint64 extras,
+                      kmp_int64 last_chunk, kmp_uint64 tc,
 #if OMPT_SUPPORT
-                           void *codeptr_ra,
+                      void *codeptr_ra,
 #endif
-                           void *task_dup) {
+                      void *task_dup
+#if OMP_TASKGRAPH_EXPERIMENTAL
+                      ,
+                      kmp_taskgraph_record_t *taskgraph_rec = nullptr
+#endif
+) {
   KMP_COUNT_BLOCK(OMP_TASKLOOP);
   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
@@ -4492,6 +5073,9 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
   kmp_taskdata_t *current_task = thread->th.th_current_task;
   kmp_task_t *next_task;
   kmp_int32 lastpriv = 0;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  kmp_int32 taskloop_prev_idx = -1, taskloop_first_idx = -1;
+#endif
 
   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize +
                              (last_chunk < 0 ? last_chunk : extras));
@@ -4532,7 +5116,11 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
       }
     }
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    next_task = __kmp_task_dup_alloc(thread, task, /*taskgraph=*/taskgraph_rec);
+#else
     next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+#endif
 
     kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task);
     kmp_taskloop_bounds_t next_task_bounds =
@@ -4554,24 +5142,78 @@ void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
               gtid, i, next_task, lower, upper, st,
               next_task_bounds.get_lower_offset(),
               next_task_bounds.get_upper_offset()));
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    if (taskgraph_rec) {
+      kmp_size_t rec_index = -1;
+      // Record the task in the taskgraph.
+      kmp_taskgraph_node_t *node =
+          __kmp_taskgraph_node_alloc(taskgraph_rec, next_task, &rec_index);
+      kmp_taskgroup_t *taskgroup = current_task->td_taskgroup;
+      if (taskgroup->taskgraph.reduce_input) {
+        node->reduce_input = taskgroup->taskgraph.reduce_input;
+        taskgroup->taskgraph.reduce_input = nullptr;
+      }
+      node->taskloop_task = true;
+      next_taskdata->owning_taskgraph = taskgraph_rec;
+      // FIXME: These dependency fields might be back-filled by the as-yet
+      // unimplemented task_iteration subsidiary directive.  We'll need a way
+      // to locate the correct task given the value of the iteration variable,
+      // or similar.
+      node->u.unresolved.ndeps = 0;
+      node->u.unresolved.dep_list = nullptr;
+      if (nogroup)
+        taskgraph_rec->record_map[rec_index].u.unresolved.cfg_successor = -1;
+      else if (taskloop_prev_idx != -1)
+        taskgraph_rec->record_map[taskloop_prev_idx]
+            .u.unresolved.cfg_successor = rec_index;
+      if (taskloop_first_idx == -1)
+        taskloop_first_idx = rec_index;
+      taskloop_prev_idx = rec_index;
+    } else {
+#endif
 #if OMPT_SUPPORT
-    __kmp_omp_taskloop_task(NULL, gtid, next_task,
-                            codeptr_ra); // schedule new task
+      __kmp_omp_taskloop_task(NULL, gtid, next_task,
+                              codeptr_ra); // schedule new task
 #if OMPT_OPTIONAL
-    if (ompt_enabled.ompt_callback_dispatch) {
-      OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
-                              lower, upper, st);
-    }
+      if (ompt_enabled.ompt_callback_dispatch) {
+        OMPT_GET_DISPATCH_CHUNK(next_taskdata->ompt_task_info.dispatch_chunk,
+                                lower, upper, st);
+      }
 #endif // OMPT_OPTIONAL
 #else
     __kmp_omp_task(gtid, next_task, true); // schedule new task
+#endif
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    }
 #endif
     lower = upper + st; // adjust lower bound for the next iteration
   }
-  // free the pattern task and exit
-  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
-  // do not execute the pattern task, just do internal bookkeeping
-  __kmp_task_finish<false>(gtid, task, current_task);
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  if (taskgraph_rec) {
+    if (taskloop_prev_idx != -1 && !nogroup) {
+      // Create a node to act as an "end group" marker.
+      kmp_size_t endgroup_idx = -1;
+      kmp_taskgraph_node_t *endgrpnode =
+          __kmp_taskgraph_node_alloc(taskgraph_rec, nullptr, &endgroup_idx);
+      endgrpnode->taskloop_task = true;
+      // Point all the cfg_successor indices to this node now.
+      for (kmp_int32 looptask = taskloop_first_idx; looptask != -1;) {
+        kmp_int32 next_task =
+            taskgraph_rec->record_map[looptask].u.unresolved.cfg_successor;
+        taskgraph_rec->record_map[looptask].u.unresolved.cfg_successor =
+            endgroup_idx;
+        looptask = next_task;
+      }
+    }
+  } else {
+#endif
+    // free the pattern task and exit
+    __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
+    // do not execute the pattern task, just do internal bookkeeping
+    __kmp_task_finish<false>(gtid, task, current_task);
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  }
+#endif
 }
 
 // Structure to keep taskloop parameters for auxiliary task
@@ -4641,8 +5283,8 @@ int __kmp_taskloop_task(int gtid, void *ptask) {
 #endif
                          task_dup);
   else
-    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
-                          grainsize, extras, last_chunk, tc,
+    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, /*nogroup=*/true,
+                          ub_glob, num_tasks, grainsize, extras, last_chunk, tc,
 #if OMPT_SUPPORT
                           codeptr_ra,
 #endif
@@ -4730,7 +5372,11 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
   lb1 = ub0 + st;
 
   // create pattern task for 2nd half of the loop
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  next_task = __kmp_task_dup_alloc(thread, task, /*taskgraph=*/false);
+#else
   next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+#endif
   // adjust lower bound (upper bound is not changed) for the 2nd half
   *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
   if (ptask_dup != NULL) // construct firstprivates, etc.
@@ -4763,6 +5409,11 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
   p->codeptr_ra = codeptr_ra;
 #endif
 
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
+  new_task_data->owning_taskgraph = nullptr;
+#endif
+
 #if OMPT_SUPPORT
   // schedule new task with correct return address for OMPT events
   __kmp_omp_taskloop_task(NULL, gtid, new_task, codeptr_ra);
@@ -4779,8 +5430,8 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
 #endif
                          task_dup);
   else
-    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
-                          gr_size0, ext0, last_chunk0, tc0,
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, /*nogroup=*/true,
+                          ub_glob, n_tsk0, gr_size0, ext0, last_chunk0, tc0,
 #if OMPT_SUPPORT
                           codeptr_ra,
 #endif
@@ -4792,14 +5443,22 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
 static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
                            int nogroup, int sched, kmp_uint64 grainsize,
-                           int modifier, void *task_dup) {
+                           int modifier, void *task_dup
+#if OMP_TASKGRAPH_EXPERIMENTAL
+                           ,
+                           kmp_taskgraph_record_t *taskgraph_rec = nullptr
+#endif
+) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   KMP_DEBUG_ASSERT(task != NULL);
   if (nogroup == 0) {
 #if OMPT_SUPPORT && OMPT_OPTIONAL
     OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
-    __kmpc_taskgroup(loc, gtid);
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    if (!taskgraph_rec)
+#endif
+      __kmpc_taskgroup(loc, gtid);
   }
   // =========================================================================
   // calculate loop parameters
@@ -4831,17 +5490,27 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
   }
   if (tc == 0) {
     KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid));
-    // free the pattern task and exit
-    __kmp_task_start(gtid, task, current_task);
-    // do not execute anything for zero-trip loop
-    __kmp_task_finish<false>(gtid, task, current_task);
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    if (!taskgraph_rec) {
+#endif
+      // free the pattern task and exit
+      __kmp_task_start(gtid, task, current_task);
+      // do not execute anything for zero-trip loop
+      __kmp_task_finish<false>(gtid, task, current_task);
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    }
+#endif
     return;
   }
 
 #if OMPT_SUPPORT && OMPT_OPTIONAL
   ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
   ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  if (ompt_enabled.ompt_callback_work && !taskgraph_rec) {
+#else
   if (ompt_enabled.ompt_callback_work) {
+#endif
     ompt_callbacks.ompt_callback(ompt_callback_work)(
         ompt_work_taskloop, ompt_scope_begin, &(team_info->parallel_data),
         &(task_info->task_data), tc, OMPT_GET_RETURN_ADDRESS(0));
@@ -4898,14 +5567,31 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
   KMP_DEBUG_ASSERT(num_tasks > 0);
   // =========================================================================
 
-  // check if clause value first
-  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
-  if (if_val == 0) { // if(0) specified, mark task as serial
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  // Handle taskgraph case first.  We just generate tasks and record them in
+  // the graph, but we do not execute them here.
+  if (taskgraph_rec) {
+    if (if_val == 0) {
+      taskdata->td_flags.task_serial = 1;
+      taskdata->td_flags.tiedness = TASK_TIED;
+    }
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, nogroup, ub_glob,
+                          num_tasks, grainsize, extras, last_chunk, tc,
+#if OMPT_SUPPORT
+                          OMPT_GET_RETURN_ADDRESS(0),
+#endif
+                          task_dup, taskgraph_rec);
+    // check if clause value next
+    // Also require GOMP_taskloop to reduce to linear
+    // (taskdata->td_flags.native)
+  } else
+#endif
+      if (if_val == 0) { // if(0) specified, mark task as serial
     taskdata->td_flags.task_serial = 1;
     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
     // always start serial tasks linearly
-    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
-                          grainsize, extras, last_chunk, tc,
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, nogroup, ub_glob,
+                          num_tasks, grainsize, extras, last_chunk, tc,
 #if OMPT_SUPPORT
                           OMPT_GET_RETURN_ADDRESS(0),
 #endif
@@ -4928,8 +5614,8 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                   "(%lld), grain %llu, extras %llu, last_chunk %lld\n",
                   gtid, tc, num_tasks, num_tasks_min, grainsize, extras,
                   last_chunk));
-    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
-                          grainsize, extras, last_chunk, tc,
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, nogroup, ub_glob,
+                          num_tasks, grainsize, extras, last_chunk, tc,
 #if OMPT_SUPPORT
                           OMPT_GET_RETURN_ADDRESS(0),
 #endif
@@ -4948,7 +5634,10 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
 #if OMPT_SUPPORT && OMPT_OPTIONAL
     OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
-    __kmpc_end_taskgroup(loc, gtid);
+#if OMP_TASKGRAPH_EXPERIMENTAL
+    if (!taskgraph_rec)
+#endif
+      __kmpc_end_taskgroup(loc, gtid);
   }
   KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid));
 }
@@ -5048,3 +5737,272 @@ bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
 
   return taskdata->td_task_team != NULL;
 }
+
+#if OMP_TASKGRAPH_EXPERIMENTAL
+
+// Allocate a new taskgraph record for thread GTID and graph GRAPH_ID.  The
+// record is returned empty (no nodes, no execution descriptors) and in the
+// KMP_TDG_RECORDING state, with its map lock initialised.
+static kmp_taskgraph_record_t *__kmp_taskgraph_alloc(kmp_int32 gtid,
+                                                     kmp_int32 graph_id) {
+  kmp_info_t *allocating_thread = __kmp_threads[gtid];
+  kmp_taskgraph_record_t *rec = (kmp_taskgraph_record_t *)__kmp_fast_allocate(
+      allocating_thread, sizeof(kmp_taskgraph_record_t));
+
+  // Identity and state.
+  rec->gtid = gtid;
+  rec->graph_id = graph_id;
+  rec->status = KMP_TDG_RECORDING;
+  rec->next = nullptr;
+
+  // Node storage, guarded by map_lock.
+  __kmp_init_lock(&rec->map_lock);
+  rec->record_map = nullptr;
+  rec->nodes_allocated = 0;
+  rec->num_tasks = 0;
+
+  // Everything else starts out empty.
+  rec->alloc_root = nullptr;
+  rec->recycled_deps = nullptr;
+  rec->num_mutexes = 0;
+  rec->exec_descrs = nullptr;
+  rec->exec_descr_size = 0;
+
+  return rec;
+}
+
+// Make a copy of ORIG, a freshly created task whose private and shared
+// variables have already been initialised, and mark the copy as belonging to
+// TASKGRAPH.  The copy covers the taskdata header, the task structure
+// (SIZEOF_KMP_TASK_T bytes) and the shareds block (SIZEOF_SHAREDS bytes).
+static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
+                                              kmp_taskgraph_record_t *taskgraph,
+                                              kmp_task_t *orig,
+                                              size_t sizeof_kmp_task_t,
+                                              size_t sizeof_shareds) {
+  // FIXME: Where private variables are not trivially copyable this should go
+  // through a "taskdup" function, as taskloops do.  A plain bitwise copy is
+  // used for now.
+  // FIXME 2: The copy is meant to be persistent so that it can be re-executed
+  // on taskgraph replay.  Make sure that works (for shared variables) if stack
+  // addresses change (i.e. a task-generating function is called from different
+  // call stack depths).
+  kmp_taskdata_t *orig_td = KMP_TASK_TO_TASKDATA(orig);
+
+  // The shareds block starts at the first kmp_uint64-aligned offset after the
+  // taskdata header and the task structure.
+  size_t shareds_offset = __kmp_round_up_to_val(
+      sizeof(kmp_taskdata_t) + sizeof_kmp_task_t, sizeof(kmp_uint64));
+  size_t clone_size = shareds_offset + sizeof_shareds;
+
+  kmp_taskdata_t *clone_td =
+      (kmp_taskdata_t *)__kmp_fast_allocate(thread, clone_size);
+  KMP_MEMCPY(clone_td, orig_td, clone_size);
+
+  // Tasks cloned for a taskgraph always have this field set.
+  clone_td->owning_taskgraph = taskgraph;
+
+  return KMP_TASKDATA_TO_TASK(clone_td);
+}
+
+// __kmpc_taskgraph: record or replay taskgraph
+// loc_ref:     Location of TDG; only passed on to the taskgroup begin/end
+//              calls at present.
+// gtid:        Global Thread ID of the encountering thread
+// tdg_handle:  Handle of taskgraph -- the address of a slot in the host
+//              program that we write the taskgraph (list) pointer back to.
+// graph_id:    Graph ID for the taskgraph.  Stored in a newly allocated record
+//              and passed to replay; lookup by ID is not implemented yet (see
+//              the FIXME below).
+// graph_reset: 1 to reset taskgraph for this taskgraph/graph_id, 0 to replay
+//              (or record, initially).  Not implemented yet: the value is
+//              currently ignored.
+// nogroup:     1 to omit implicit taskgroup, 0 to include it.  Only stored in
+//              the record for now; a taskgroup is always created here.
+// entry:       Pointer to the entry function
+// args:        Pointer to the function arguments
+//
+// If the handle holds no record yet, a new one is allocated and installed.
+// While the record is in the KMP_TDG_RECORDING state the region body is run
+// via entry(args); once it is KMP_TDG_READY the recorded graph is replayed
+// instead and the body is not run.
+void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
+                      std::atomic<void *> *tdg_handle, kmp_uint32 graph_id,
+                      kmp_int32 graph_reset, kmp_int32 nogroup,
+                      void (*entry)(void *), void *args) {
+  kmp_taskgraph_record_t *record =
+      (kmp_taskgraph_record_t *)KMP_ATOMIC_LD_ACQ(tdg_handle);
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskgroup_t *taskgroup;
+
+  // The taskgroup is opened unconditionally, for both recording and replay.
+  __kmpc_taskgroup(loc_ref, gtid);
+
+  taskgroup = thread->th.th_current_task->td_taskgroup;
+
+  // FIXME: Implement graph_id and graph_reset functionality.  For graph_id, we
+  // will form a singly-linked list of task records chained through their
+  // "next" pointers (per taskgraph construct handle).  Thread safety and
+  // locking need careful consideration.  We could use a "list header" node
+  // consisting of a lock and a pointer to the list proper, perhaps.  Ideally
+  // we'd want to avoid locking/unlocking in the common case (replay).
+
+  if (!record) {
+    record = __kmp_taskgraph_alloc(gtid, graph_id);
+    // Another thread may have allocated the taskgraph already.  Check that
+    // here.
+    // NOTE(review): a 64-bit compare-and-store is used on a pointer-sized
+    // slot -- confirm this is correct on 32-bit targets.
+    kmp_taskgraph_record_t *other =
+        (kmp_taskgraph_record_t *)KMP_COMPARE_AND_STORE_RET64(tdg_handle,
+                                                              nullptr, record);
+    if (other != nullptr) {
+      // We lost the race: discard our record and use the installed one.
+      // NOTE(review): the discarded record's map_lock was initialised by
+      // __kmp_taskgraph_alloc and is not destroyed here -- confirm that is
+      // harmless.
+      __kmp_fast_free(thread, record);
+      record = other;
+      // Should we stall here until the other thread has finished recording the
+      // taskgraph?  That might be safer.  Otherwise multiple threads will add
+      // tasks to the taskgraph simultaneously, which is unlikely to be what
+      // the user wants.  Unclear what to do here.  FIXME.
+    } else {
+      // We record 'nogroup' here.  We always create a group for recording the
+      // taskgraph, but we could avoid doing so for replay.  That's not done
+      // yet though.
+      record->nogroup_taskgroup = nogroup;
+      // Store our taskgraph record into the taskgraph directive's implicit
+      // taskgroup.  Only the thread that installed the record does this.
+      KMP_ATOMIC_ST_REL(&taskgroup->taskgraph.recording, record);
+    }
+  }
+
+  kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&record->status);
+  if (status == KMP_TDG_RECORDING)
+    // Run the region body; the task entry points it calls do the recording.
+    entry(args);
+  else if (status == KMP_TDG_READY) {
+    kmp_taskdata *current_taskdata = thread->th.th_current_task;
+    KG_TRACE(10, ("Replay taskgraph %p from task %p\n", record,
+                  KMP_TASKDATA_TO_TASK(current_taskdata)));
+    // The record's map_lock is held across the replay and across the end of
+    // the taskgroup.
+    __kmp_acquire_lock(&record->map_lock, gtid);
+    __kmp_replay_taskgraph(gtid, current_taskdata, record, graph_id, taskgroup);
+    __kmpc_end_taskgroup(loc_ref, gtid);
+    __kmp_release_lock(&record->map_lock, gtid);
+    return;
+  }
+
+  __kmpc_end_taskgroup(loc_ref, gtid);
+
+  // This could perhaps be spawned as a separate task in order to avoid
+  // blocking this thread.
+  // Only the thread that allocated the record builds the graph from it.
+  if (record->gtid == gtid) {
+    kmp_taskdata *current_taskdata = thread->th.th_current_task;
+    __kmp_build_taskgraph(gtid, current_taskdata, record);
+  }
+}
+
+// __kmpc_taskgraph_task: create a task, recording it first if the enclosing
+// taskgraph is being recorded
+// loc_ref:            Source location, passed on to the task-creation call
+// gtid:               Global Thread ID of the encountering thread
+// new_task:           The task to create
+// flags:              Not used by this entry point at present
+// sizeof_kmp_task_t:  Size of the task structure, including private data
+// shareds:            Not used by this entry point at present
+// sizeof_shareds:     Size of the task's shared-variable block
+// ndeps:              Number of entries in dep_list
+// dep_list:           Dependences of the task
+//
+// Returns the result of __kmpc_omp_task / __kmpc_omp_task_with_deps, or 0
+// without creating the task if the taskgraph has already been finalized.
+kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
+                                 kmp_task_t *new_task, kmp_int32 flags,
+                                 size_t sizeof_kmp_task_t, void *shareds,
+                                 size_t sizeof_shareds, kmp_int32 ndeps,
+                                 kmp_depend_info_t *dep_list) {
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskgroup_t *taskgroup = thread->th.th_current_task->td_taskgroup;
+  kmp_taskgraph_record_t *rec = __kmp_taskgraph_or_parent_recording(taskgroup);
+
+  if (rec) {
+    kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&rec->status);
+    if (status == KMP_TDG_RECORDING) {
+      // The record keeps its own copy of the task; the original is still
+      // created and run below.
+      kmp_task_t *cloned_task = __kmp_taskgraph_clone_task(
+          thread, rec, new_task, sizeof_kmp_task_t, sizeof_shareds);
+      kmp_taskgraph_node_t *node = __kmp_taskgraph_node_alloc(rec, cloned_task);
+      // Hand any pending reduction input over to this node.
+      if (taskgroup->taskgraph.reduce_input) {
+        node->reduce_input = taskgroup->taskgraph.reduce_input;
+        taskgroup->taskgraph.reduce_input = nullptr;
+      }
+#if defined(DEBUG_TASKGRAPH)
+      fprintf(stderr, "__kmpc_taskgraph_task: record task here!\n");
+      fprintf(stderr, "private size: %d, shared size: %d\n",
+              (int)(sizeof_kmp_task_t - sizeof(kmp_task_t)),
+              (int)sizeof_shareds);
+      fprintf(stderr, "ndeps: %d\n", (int)ndeps);
+      fprintf(stderr, "gtid: %d rec->gtid: %d\n", gtid, rec->gtid);
+      fprintf(stderr, "taskgroup: %p\n",
+              thread->th.th_current_task->td_taskgroup);
+      kmp_taskdata_t *parent = thread->th.th_current_task->td_parent;
+      while (parent) {
+        fprintf(stderr, "  parent: %p (taskgroup %p)\n", parent,
+                parent->td_taskgroup);
+        parent = parent->td_parent;
+      }
+#endif
+      // A freshly allocated node already has ndeps == 0 and a null dep_list,
+      // so only copy when there is something to copy.  This avoids a
+      // zero-sized allocation and a memcpy from a possibly-null dep_list.
+      if (ndeps > 0) {
+        node->u.unresolved.ndeps = ndeps;
+        node->u.unresolved.dep_list = (kmp_depend_info_t *)__kmp_thread_malloc(
+            thread, ndeps * sizeof(kmp_depend_info_t));
+        KMP_MEMCPY(node->u.unresolved.dep_list, dep_list,
+                   ndeps * sizeof(kmp_depend_info_t));
+      }
+    } else if (status == KMP_TDG_READY) {
+#ifdef DEBUG_TASKGRAPH
+      fprintf(stderr, "non-taskgraph task entry point for task in finalized "
+                      "taskgraph\n");
+#endif
+      return 0;
+    }
+  }
+
+  if (ndeps == 0)
+    return __kmpc_omp_task(loc_ref, gtid, new_task);
+
+  return __kmpc_omp_task_with_deps(loc_ref, gtid, new_task, ndeps, dep_list, 0,
+                                   nullptr);
+}
+
+// __kmpc_taskgraph_taskwait: perform a taskwait, recording it first if the
+// enclosing taskgraph is being recorded
+// loc_ref:      Source location, passed on to the taskwait call
+// gtid:         Global Thread ID of the encountering thread
+// ndeps:        Number of entries in dep_list
+// dep_list:     Dependences of the taskwait
+// has_no_wait:  Passed on to __kmpc_omp_taskwait_deps_51; not recorded yet
+//
+// If the taskgraph has already been finalized, nothing is done.
+void __kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid,
+                               kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                               kmp_int32 has_no_wait) {
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskgroup_t *taskgroup = thread->th.th_current_task->td_taskgroup;
+  kmp_taskgraph_record_t *rec = __kmp_taskgraph_or_parent_recording(taskgroup);
+
+  if (rec) {
+    kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&rec->status);
+    if (status == KMP_TDG_RECORDING) {
+      // A taskwait is recorded as a node with no task attached.
+      kmp_taskgraph_node_t *node = __kmp_taskgraph_node_alloc(rec, nullptr);
+#ifdef DEBUG_TASKGRAPH
+      fprintf(stderr, "__kmpc_taskgraph_taskwait: record taskwait here!\n");
+      fprintf(stderr, "ndeps: %d\n", (int)ndeps);
+#endif
+      // A freshly allocated node already has ndeps == 0 and a null dep_list,
+      // so only copy when there is something to copy.  This avoids a
+      // zero-sized allocation and a memcpy from a possibly-null dep_list.
+      if (ndeps > 0) {
+        node->u.unresolved.ndeps = ndeps;
+        node->u.unresolved.dep_list = (kmp_depend_info_t *)__kmp_thread_malloc(
+            thread, ndeps * sizeof(kmp_depend_info_t));
+        KMP_MEMCPY(node->u.unresolved.dep_list, dep_list,
+                   ndeps * sizeof(kmp_depend_info_t));
+      }
+      // TODO: Record has_no_wait somewhere?
+      // if (has_no_wait)
+      //  return;
+    } else if (status == KMP_TDG_READY) {
+#ifdef DEBUG_TASKGRAPH
+      fprintf(stderr, "non-taskgraph taskwait entry point for taskwait in "
+                      "finalized taskgraph\n");
+#endif
+      return;
+    }
+  }
+
+  __kmpc_omp_taskwait_deps_51(loc_ref, gtid, ndeps, dep_list, 0, nullptr,
+                              has_no_wait);
+}
+
+// __kmpc_taskgraph_taskloop: run a taskloop, recording its tasks first if the
+// enclosing taskgraph is being recorded.  The loop parameters (if_val, lb, ub,
+// st, sched, grainsize, modifier, task_dup) are forwarded to __kmp_taskloop;
+// flags, sizeof_kmp_task_t, shareds and sizeof_shareds are not used here at
+// present.  Always returns 0.
+kmp_uint32 __kmpc_taskgraph_taskloop(ident_t *loc_ref, kmp_int32 gtid,
+                                     kmp_task_t *new_task, kmp_int32 flags,
+                                     size_t sizeof_kmp_task_t, void *shareds,
+                                     size_t sizeof_shareds, kmp_int32 if_val,
+                                     kmp_uint64 *lb, kmp_uint64 *ub,
+                                     kmp_int64 st, kmp_int32 nogroup,
+                                     kmp_int32 sched, kmp_uint64 grainsize,
+                                     kmp_int32 modifier, void *task_dup) {
+  kmp_taskdata_t *encountering_task = __kmp_threads[gtid]->th.th_current_task;
+  kmp_taskgraph_record_t *rec =
+      __kmp_taskgraph_or_parent_recording(encountering_task->td_taskgroup);
+
+  if (rec != nullptr) {
+    kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&rec->status);
+
+    // Nothing to do here for a taskgraph that has already been finalized.
+    if (status == KMP_TDG_READY) {
+#ifdef DEBUG_TASKGRAPH
+      fprintf(stderr, "non-taskgraph taskloop entry point for taskloop in "
+                      "finalized taskgraph\n");
+#endif
+      return 0;
+    }
+
+    // Recording pass: this call is given the original 'nogroup' value.
+    if (status == KMP_TDG_RECORDING) {
+      __kmp_taskloop(loc_ref, gtid, new_task, if_val, lb, ub, st, nogroup,
+                     sched, grainsize, modifier, task_dup, rec);
+    }
+  }
+
+  // Execution pass.  'nogroup' is forced to TRUE here: this entry point needs
+  // to know whether the 'nogroup' clause was originally present (see the
+  // recording call above), but the group itself is created by separate API
+  // calls wrapping this one (or __kmpc_taskloop), so __kmp_taskloop must not
+  // create another taskgroup in any case.
+  __kmp_taskloop(loc_ref, gtid, new_task, if_val, lb, ub, st, /*nogroup=*/true,
+                 sched, grainsize, modifier, task_dup);
+
+  return 0;
+}
+
+#endif

>From 03c24c91455022c945ebb734f97f2f0d9c79f306 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Fri, 27 Mar 2026 09:40:01 -0500
Subject: [PATCH 03/24] [OpenMP] Add graph_id and graph_reset clause support
 (for taskgraph directive)

This patch adds support for the graph_id and graph_reset clauses for the
OpenMP 6.0 "taskgraph" directive.  Relative to previously-posted versions,
this one does not have the clauses inherit from "OMPClauseWithPreInit",
since AFAICT it shouldn't be necessary to do that.  It also avoids
casting the graph_id argument to a boolean, and allows the expression
to be omitted for the graph_reset clause (which seems of limited use,
but is allowed by the spec).

Co-Authored-By: Josep Pinot <jpinot at bsc.es>

commit-id:bef7e706

Pull Request: https://github.com/llvm/llvm-project/pull/194048
---
 clang/include/clang/AST/OpenMPClause.h        | 95 +++++++++++++++++++
 clang/include/clang/AST/RecursiveASTVisitor.h | 13 +++
 clang/include/clang/Sema/SemaOpenMP.h         |  9 ++
 clang/lib/AST/OpenMPClause.cpp                | 26 +++++
 clang/lib/AST/StmtProfile.cpp                 |  8 ++
 clang/lib/Basic/OpenMPKinds.cpp               |  4 +
 clang/lib/Parse/ParseOpenMP.cpp               | 15 +++
 clang/lib/Sema/SemaOpenMP.cpp                 | 60 ++++++++++++
 clang/lib/Sema/TreeTransform.h                | 43 +++++++++
 clang/lib/Serialization/ASTReader.cpp         | 16 ++++
 clang/lib/Serialization/ASTWriter.cpp         | 10 ++
 clang/tools/libclang/CIndex.cpp               |  6 ++
 llvm/include/llvm/Frontend/OpenMP/OMP.td      |  2 +
 13 files changed, 307 insertions(+)

diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index ccf2c40bc5efa..e18dea7d9bd47 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -8541,6 +8541,101 @@ class OMPIsDevicePtrClause final
   }
 };
 
+/// This represents clause 'graph_id' in the '#pragma omp taskgraph'
+/// directive.
+///
+/// \code
+/// #pragma omp taskgraph graph_id(a)
+/// \endcode
+class OMPGraphIdClause final
+    : public OMPOneStmtClause<llvm::omp::OMPC_graph_id, OMPClause> {
+  friend class OMPClauseReader;
+
+  /// Set the identifier expression.
+  void setId(Expr *Id) { setStmt(Id); }
+
+public:
+  /// Build 'graph_id' clause with identifier value \a Id.
+  ///
+  /// \param Id Id value for the clause.
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  OMPGraphIdClause(Expr *Id, SourceLocation StartLoc, SourceLocation LParenLoc,
+                   SourceLocation EndLoc)
+      : OMPOneStmtClause(Id, StartLoc, LParenLoc, EndLoc) {}
+
+  /// Build an empty clause.
+  OMPGraphIdClause() : OMPOneStmtClause() {}
+
+  /// Returns the identifier expression.
+  Expr *getId() const { return getStmtAs<Expr>(); }
+};
+
+/// This represents clause 'graph_reset' in the '#pragma omp taskgraph'
+/// directive.
+///
+/// \code
+/// #pragma omp taskgraph graph_reset(true)
+/// \endcode
+class OMPGraphResetClause final : public OMPClause {
+  friend class OMPClauseReader;
+
+  /// Location of '('.
+  SourceLocation LParenLoc;
+
+  /// Condition of the 'graph_reset' clause.  May be null: the accessors below
+  /// all handle a clause that carries no condition expression.
+  Stmt *Condition = nullptr;
+
+public:
+  /// Build 'graph_reset' clause with condition \a Cond.
+  ///
+  /// \param Cond Condition of the clause.
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  OMPGraphResetClause(Expr *Cond, SourceLocation StartLoc,
+                      SourceLocation LParenLoc, SourceLocation EndLoc)
+      : OMPClause(llvm::omp::OMPC_graph_reset, StartLoc, EndLoc),
+        LParenLoc(LParenLoc), Condition(Cond) {}
+
+  /// Build an empty clause.
+  OMPGraphResetClause()
+      : OMPClause(llvm::omp::OMPC_graph_reset, SourceLocation(),
+                  SourceLocation()) {}
+
+  /// Sets the location of '('.
+  void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
+
+  /// Returns the location of '('.
+  SourceLocation getLParenLoc() const { return LParenLoc; }
+
+  /// Set condition.
+  void setCondition(Expr *Cond) { Condition = Cond; }
+
+  /// Returns condition, or null if the clause has none.
+  Expr *getCondition() const { return cast_or_null<Expr>(Condition); }
+
+  /// Returns the condition as the only child, or an empty range if there is
+  /// no condition.
+  child_range children() {
+    if (Condition)
+      return child_range(&Condition, &Condition + 1);
+    return child_range(child_iterator(), child_iterator());
+  }
+
+  const_child_range children() const {
+    if (Condition)
+      return const_child_range(&Condition, &Condition + 1);
+    return const_child_range(const_child_iterator(), const_child_iterator());
+  }
+
+  child_range used_children();
+  const_child_range used_children() const {
+    return const_cast<OMPGraphResetClause *>(this)->used_children();
+  }
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == llvm::omp::OMPC_graph_reset;
+  }
+};
+
 /// This represents clause 'has_device_ptr' in the '#pragma omp ...'
 /// directives.
 ///
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index b5be0910194bd..b86f122f75460 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -4162,6 +4162,19 @@ bool RecursiveASTVisitor<Derived>::VisitOMPIsDevicePtrClause(
   return true;
 }
 
+// Traverse the identifier expression of a 'graph_id' clause.
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPGraphIdClause(OMPGraphIdClause *C) {
+  TRY_TO(TraverseStmt(C->getId()));
+  return true;
+}
+
+// Traverse the condition expression of a 'graph_reset' clause.
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPGraphResetClause(
+    OMPGraphResetClause *C) {
+  TRY_TO(TraverseStmt(C->getCondition()));
+  return true;
+}
+
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPHasDeviceAddrClause(
     OMPHasDeviceAddrClause *C) {
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 3621ce96b8724..480b18960fd67 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -951,6 +951,15 @@ class SemaOpenMP : public SemaBase {
   ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc,
                            SourceLocation LParenLoc = SourceLocation(),
                            Expr *NumForLoops = nullptr);
+  /// Called on well-formed 'graph_id' clause.
+  OMPClause *ActOnOpenMPGraphIdClause(Expr *Id, SourceLocation StartLoc,
+                                      SourceLocation LParenLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed 'graph_reset' clause.
+  OMPClause *ActOnOpenMPGraphResetClause(Expr *Condition,
+                                         SourceLocation StartLoc,
+                                         SourceLocation LParenLoc,
+                                         SourceLocation EndLoc);
   /// Called on well-formed 'grainsize' clause.
   OMPClause *ActOnOpenMPGrainsizeClause(OpenMPGrainsizeClauseModifier Modifier,
                                         Expr *Size, SourceLocation StartLoc,
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 3a35e17aff40b..db4d5519acb38 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -254,6 +254,8 @@ const OMPClauseWithPostUpdate *OMPClauseWithPostUpdate::get(const OMPClause *C)
   case OMPC_thread_limit:
   case OMPC_priority:
   case OMPC_grainsize:
+  case OMPC_graph_id:
+  case OMPC_graph_reset:
   case OMPC_nogroup:
   case OMPC_num_tasks:
   case OMPC_hint:
@@ -328,6 +330,12 @@ OMPClause::child_range OMPGrainsizeClause::used_children() {
   return child_range(&Grainsize, &Grainsize + 1);
 }
 
+OMPClause::child_range OMPGraphResetClause::used_children() {
+  if (Condition)
+    return child_range(&Condition, &Condition + 1);
+  return children();
+}
+
 OMPClause::child_range OMPNumTasksClause::used_children() {
   if (Stmt **C = getAddrOfExprAsWritten(getPreInitStmt()))
     return child_range(C, C + 1);
@@ -2369,6 +2377,24 @@ void OMPClausePrinter::VisitOMPGrainsizeClause(OMPGrainsizeClause *Node) {
   OS << ")";
 }
 
+void OMPClausePrinter::VisitOMPGraphIdClause(OMPGraphIdClause *Node) {
+  OS << "graph_id";
+  if (Expr *E = Node->getId()) {
+    OS << "(";
+    E->printPretty(OS, nullptr, Policy, 0);
+    OS << ")";
+  }
+}
+
+void OMPClausePrinter::VisitOMPGraphResetClause(OMPGraphResetClause *Node) {
+  OS << "graph_reset";
+  if (Expr *E = Node->getCondition()) {
+    OS << "(";
+    E->printPretty(OS, nullptr, Policy, 0);
+    OS << ")";
+  }
+}
+
 void OMPClausePrinter::VisitOMPNumTasksClause(OMPNumTasksClause *Node) {
   OS << "num_tasks(";
   OpenMPNumTasksClauseModifier Modifier = Node->getModifier();
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 8219e57644be6..2b991ff8e7b19 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -916,6 +916,14 @@ void OMPClauseProfiler::VisitOMPGrainsizeClause(const OMPGrainsizeClause *C) {
   if (C->getGrainsize())
     Profiler->VisitStmt(C->getGrainsize());
 }
+void OMPClauseProfiler::VisitOMPGraphIdClause(const OMPGraphIdClause *C) {
+  if (C->getId())
+    Profiler->VisitStmt(C->getId());
+}
+void OMPClauseProfiler::VisitOMPGraphResetClause(const OMPGraphResetClause *C) {
+  if (C->getCondition())
+    Profiler->VisitStmt(C->getCondition());
+}
 void OMPClauseProfiler::VisitOMPNumTasksClause(const OMPNumTasksClause *C) {
   VisitOMPClauseWithPreInit(C);
   if (C->getNumTasks())
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 287eb217ba458..72015a7224275 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -312,6 +312,8 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
   case OMPC_when:
   case OMPC_append_args:
   case OMPC_looprange:
+  case OMPC_graph_id:
+  case OMPC_graph_reset:
     break;
   default:
     break;
@@ -692,6 +694,8 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
   case OMPC_when:
   case OMPC_append_args:
   case OMPC_looprange:
+  case OMPC_graph_id:
+  case OMPC_graph_reset:
     break;
   default:
     break;
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 45a47ec797f01..5be99550f1ef6 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3241,6 +3241,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
   case OMPC_partial:
   case OMPC_align:
   case OMPC_message:
+  case OMPC_graph_id:
   case OMPC_ompx_dyn_cgroup_mem:
   case OMPC_dyn_groupprivate:
   case OMPC_transparent:
@@ -3391,6 +3392,20 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
     else
       Clause = ParseOpenMPClause(CKind, WrongDirective);
     break;
+  case OMPC_graph_reset:
+    if (!FirstClause) {
+      Diag(Tok, diag::err_omp_more_one_clause)
+          << getOpenMPDirectiveName(DKind, OMPVersion)
+          << getOpenMPClauseName(CKind) << 0;
+      ErrorFound = true;
+    }
+
+    if (PP.LookAhead(/*N=*/0).is(tok::l_paren)) {
+      Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective);
+    } else {
+      Clause = ParseOpenMPClause(CKind, WrongDirective);
+    }
+    break;
   case OMPC_self_maps:
     // OpenMP [6.0, self_maps clause]
     if (getLangOpts().OpenMP < 60) {
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 53ded7a5e177e..22ce0190a0907 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -6816,6 +6816,8 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
       case OMPC_final:
       case OMPC_priority:
       case OMPC_novariants:
+      case OMPC_graph_id:
+      case OMPC_graph_reset:
       case OMPC_nocontext:
         // Do not analyze if no parent parallel directive.
         if (isOpenMPParallelDirective(Kind))
@@ -16858,6 +16860,12 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
   case OMPC_detach:
     Res = ActOnOpenMPDetachClause(Expr, StartLoc, LParenLoc, EndLoc);
     break;
+  case OMPC_graph_id:
+    Res = ActOnOpenMPGraphIdClause(Expr, StartLoc, LParenLoc, EndLoc);
+    break;
+  case OMPC_graph_reset:
+    Res = ActOnOpenMPGraphResetClause(Expr, StartLoc, LParenLoc, EndLoc);
+    break;
   case OMPC_novariants:
     Res = ActOnOpenMPNovariantsClause(Expr, StartLoc, LParenLoc, EndLoc);
     break;
@@ -17646,6 +17654,8 @@ OMPClause *SemaOpenMP::ActOnOpenMPSimpleClause(
   case OMPC_match:
   case OMPC_nontemporal:
   case OMPC_destroy:
+  case OMPC_graph_id:
+  case OMPC_graph_reset:
   case OMPC_novariants:
   case OMPC_nocontext:
   case OMPC_detach:
@@ -18389,6 +18399,8 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause(
   case OMPC_severity:
   case OMPC_message:
   case OMPC_destroy:
+  case OMPC_graph_id:
+  case OMPC_graph_reset:
   case OMPC_novariants:
   case OMPC_nocontext:
   case OMPC_detach:
@@ -18608,6 +18620,10 @@ OMPClause *SemaOpenMP::ActOnOpenMPClause(OpenMPClauseKind Kind,
   case OMPC_ompx_bare:
     Res = ActOnOpenMPXBareClause(StartLoc, EndLoc);
     break;
+  case OMPC_graph_reset:
+    Res = ActOnOpenMPGraphResetClause(/*Condition=*/nullptr, StartLoc,
+                                      SourceLocation(), EndLoc);
+    break;
   case OMPC_if:
   case OMPC_final:
   case OMPC_num_threads:
@@ -18662,6 +18678,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPClause(OpenMPClauseKind Kind,
   case OMPC_at:
   case OMPC_severity:
   case OMPC_message:
+  case OMPC_graph_id:
   case OMPC_novariants:
   case OMPC_nocontext:
   case OMPC_detach:
@@ -19019,6 +19036,47 @@ OMPClause *SemaOpenMP::ActOnOpenMPDestroyClause(Expr *InteropVar,
       OMPDestroyClause(InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc);
 }
 
+OMPClause *SemaOpenMP::ActOnOpenMPGraphIdClause(Expr *Id,
+                                                SourceLocation StartLoc,
+                                                SourceLocation LParenLoc,
+                                                SourceLocation EndLoc) {
+  Expr *ValExpr = Id;
+
+  if (!Id->isValueDependent() && !Id->isTypeDependent() &&
+      !Id->isInstantiationDependent() &&
+      !Id->containsUnexpandedParameterPack()) {
+    ExprResult Val = PerformOpenMPImplicitIntegerConversion(LParenLoc, Id);
+    if (Val.isInvalid())
+      return nullptr;
+
+    ValExpr = Val.get();
+  }
+
+  return new (getASTContext())
+      OMPGraphIdClause(ValExpr, StartLoc, LParenLoc, EndLoc);
+}
+
+OMPClause *SemaOpenMP::ActOnOpenMPGraphResetClause(Expr *Condition,
+                                                   SourceLocation StartLoc,
+                                                   SourceLocation LParenLoc,
+                                                   SourceLocation EndLoc) {
+  Expr *ValExpr = Condition;
+  if (Condition && LParenLoc.isValid()) {
+    if (!Condition->isValueDependent() && !Condition->isTypeDependent() &&
+        !Condition->isInstantiationDependent() &&
+        !Condition->containsUnexpandedParameterPack()) {
+      ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition);
+      if (Val.isInvalid())
+        return nullptr;
+
+      ValExpr = Val.get();
+    }
+  }
+
+  return new (getASTContext())
+      OMPGraphResetClause(ValExpr, StartLoc, LParenLoc, EndLoc);
+}
+
 OMPClause *SemaOpenMP::ActOnOpenMPNovariantsClause(Expr *Condition,
                                                    SourceLocation StartLoc,
                                                    SourceLocation LParenLoc,
@@ -19319,6 +19377,8 @@ OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind,
   case OMPC_severity:
   case OMPC_message:
   case OMPC_destroy:
+  case OMPC_graph_id:
+  case OMPC_graph_reset:
   case OMPC_novariants:
   case OMPC_nocontext:
   case OMPC_detach:
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 40187f71231bd..f49b3eceace46 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -2174,6 +2174,29 @@ class TreeTransform {
                                                         LParenLoc, EndLoc);
   }
 
+  /// Build a new OpenMP 'graph_id' clause from the (transformed) id expression.
+  ///
+  /// By default, performs semantic analysis to build the new OpenMP clause.
+  /// Subclasses may override this routine to provide different behavior.
+  OMPClause *RebuildOMPGraphIdClause(Expr *Id, SourceLocation StartLoc,
+                                     SourceLocation LParenLoc,
+                                     SourceLocation EndLoc) {
+    return getSema().OpenMP().ActOnOpenMPGraphIdClause(Id, StartLoc,
+                                                       LParenLoc, EndLoc);
+  }
+
+  /// Build a new OpenMP 'graph_reset' clause.
+  ///
+  /// By default, performs semantic analysis to build the new OpenMP clause.
+  /// Subclasses may override this routine to provide different behavior.
+  OMPClause *RebuildOMPGraphResetClause(Expr *Condition,
+                                        SourceLocation StartLoc,
+                                        SourceLocation LParenLoc,
+                                        SourceLocation EndLoc) {
+    return getSema().OpenMP().ActOnOpenMPGraphResetClause(Condition, StartLoc,
+                                                          LParenLoc, EndLoc);
+  }
+
   /// Build a new OpenMP 'grainsize' clause.
   ///
   /// By default, performs semantic analysis to build the new statement.
@@ -11610,6 +11633,26 @@ TreeTransform<Derived>::TransformOMPPriorityClause(OMPPriorityClause *C) {
       E.get(), C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
 }
 
+template <typename Derived>
+OMPClause *
+TreeTransform<Derived>::TransformOMPGraphIdClause(OMPGraphIdClause *C) {
+  ExprResult Cond = getDerived().TransformExpr(C->getId());
+  if (Cond.isInvalid())
+    return nullptr;
+  return getDerived().RebuildOMPGraphIdClause(
+      Cond.get(), C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
+}
+
+template <typename Derived>
+OMPClause *
+TreeTransform<Derived>::TransformOMPGraphResetClause(OMPGraphResetClause *C) {
+  ExprResult Cond = getDerived().TransformExpr(C->getCondition());
+  if (Cond.isInvalid())
+    return nullptr;
+  return getDerived().RebuildOMPGraphResetClause(
+      Cond.get(), C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
+}
+
 template <typename Derived>
 OMPClause *
 TreeTransform<Derived>::TransformOMPGrainsizeClause(OMPGrainsizeClause *C) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 7e8bb6509e84b..d98a5a3f240ef 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11700,6 +11700,12 @@ OMPClause *OMPClauseReader::readClause() {
   case llvm::omp::OMPC_grainsize:
     C = new (Context) OMPGrainsizeClause();
     break;
+  case llvm::omp::OMPC_graph_id:
+    C = new (Context) OMPGraphIdClause();
+    break;
+  case llvm::omp::OMPC_graph_reset:
+    C = new (Context) OMPGraphResetClause();
+    break;
   case llvm::omp::OMPC_num_tasks:
     C = new (Context) OMPNumTasksClause();
     break;
@@ -12603,6 +12609,16 @@ void OMPClauseReader::VisitOMPPriorityClause(OMPPriorityClause *C) {
   C->setLParenLoc(Record.readSourceLocation());
 }
 
+void OMPClauseReader::VisitOMPGraphIdClause(OMPGraphIdClause *C) {
+  C->setId(Record.readSubExpr());
+  C->setLParenLoc(Record.readSourceLocation());
+}
+
+void OMPClauseReader::VisitOMPGraphResetClause(OMPGraphResetClause *C) {
+  C->setCondition(Record.readSubExpr());
+  C->setLParenLoc(Record.readSourceLocation());
+}
+
 void OMPClauseReader::VisitOMPGrainsizeClause(OMPGrainsizeClause *C) {
   VisitOMPClauseWithPreInit(C);
   C->setModifier(Record.readEnum<OpenMPGrainsizeClauseModifier>());
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index ba644fefc109a..df0d1a35ea715 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -8581,6 +8581,16 @@ void OMPClauseWriter::VisitOMPPriorityClause(OMPPriorityClause *C) {
   Record.AddSourceLocation(C->getLParenLoc());
 }
 
+void OMPClauseWriter::VisitOMPGraphIdClause(OMPGraphIdClause *C) {
+  Record.AddStmt(C->getId());
+  Record.AddSourceLocation(C->getLParenLoc());
+}
+
+void OMPClauseWriter::VisitOMPGraphResetClause(OMPGraphResetClause *C) {
+  Record.AddStmt(C->getCondition());
+  Record.AddSourceLocation(C->getLParenLoc());
+}
+
 void OMPClauseWriter::VisitOMPGrainsizeClause(OMPGrainsizeClause *C) {
   VisitOMPClauseWithPreInit(C);
   Record.writeEnum(C->getModifier());
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 350cd2135657d..1a92542c9595c 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2749,6 +2749,12 @@ void OMPClauseEnqueue::VisitOMPIsDevicePtrClause(
     const OMPIsDevicePtrClause *C) {
   VisitOMPClauseList(C);
 }
+void OMPClauseEnqueue::VisitOMPGraphIdClause(const OMPGraphIdClause *C) {
+  Visitor->AddStmt(C->getId());
+}
+void OMPClauseEnqueue::VisitOMPGraphResetClause(const OMPGraphResetClause *C) {
+  Visitor->AddStmt(C->getCondition());
+}
 void OMPClauseEnqueue::VisitOMPHasDeviceAddrClause(
     const OMPHasDeviceAddrClause *C) {
   VisitOMPClauseList(C);
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index e1e66df72dfc5..91ce4eac9e370 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -247,9 +247,11 @@ def OMPC_GrainSize : Clause<[Spelling<"grainsize">]> {
   ];
 }
 def OMPC_GraphId : Clause<[Spelling<"graph_id">]> {
+  let clangClass = "OMPGraphIdClause";
   let flangClass = "OmpGraphIdClause";
 }
 def OMPC_GraphReset : Clause<[Spelling<"graph_reset">]> {
+  let clangClass = "OMPGraphResetClause";
   let flangClass = "OmpGraphResetClause";
   let isValueOptional = true;
 }

>From 15b31621193249d6f6be0a09f2cc363af91f15e9 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Fri, 27 Mar 2026 12:54:04 -0500
Subject: [PATCH 04/24] [OpenMP] Add OpenMP 6.0 taskgraph parsing/trivial
 semantics

This patch adds OpenMP 6.0 'taskgraph' support as a no-op, i.e. the
contents of the taskgraph region are just executed as if the directive
were omitted.

Co-Authored-By: Adrian Munera <adrian.munera at bsc.es>
Co-Authored-By: Jose M Monsalve Diaz <JoseM.MonsalveDiaz at amd.com>

Pull Request: https://github.com/llvm/llvm-project/pull/194050
---
 clang/bindings/python/clang/cindex.py         |  3 ++
 clang/include/clang-c/Index.h                 |  4 ++
 clang/include/clang/AST/RecursiveASTVisitor.h |  3 ++
 clang/include/clang/AST/StmtOpenMP.h          | 49 +++++++++++++++++++
 clang/include/clang/Basic/StmtNodes.td        |  1 +
 clang/include/clang/Sema/SemaOpenMP.h         |  4 ++
 .../include/clang/Serialization/ASTBitCodes.h |  1 +
 clang/lib/AST/StmtOpenMP.cpp                  | 15 ++++++
 clang/lib/AST/StmtPrinter.cpp                 |  5 ++
 clang/lib/AST/StmtProfile.cpp                 |  4 ++
 clang/lib/Basic/OpenMPKinds.cpp               |  3 ++
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 40 +++++++++++++++
 clang/lib/CodeGen/CGOpenMPRuntime.h           | 13 +++++
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |  2 +
 clang/lib/CodeGen/CGStmt.cpp                  |  3 ++
 clang/lib/CodeGen/CGStmtOpenMP.cpp            | 15 ++++++
 clang/lib/CodeGen/CodeGenFunction.h           |  1 +
 clang/lib/Sema/SemaExceptionSpec.cpp          |  1 +
 clang/lib/Sema/SemaOpenMP.cpp                 | 36 ++++++++++++++
 clang/lib/Sema/TreeTransform.h                | 11 +++++
 clang/lib/Serialization/ASTReaderStmt.cpp     | 10 ++++
 clang/lib/Serialization/ASTWriterStmt.cpp     |  6 +++
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp  |  1 +
 clang/test/OpenMP/taskgraph_bad_clauses.cpp   | 19 +++++++
 clang/tools/libclang/CIndex.cpp               |  2 +
 clang/tools/libclang/CXCursor.cpp             |  3 ++
 26 files changed, 255 insertions(+)
 create mode 100644 clang/test/OpenMP/taskgraph_bad_clauses.cpp

diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py
index a90d48cf6d481..19d8f12a2c0c3 100644
--- a/clang/bindings/python/clang/cindex.py
+++ b/clang/bindings/python/clang/cindex.py
@@ -1456,6 +1456,9 @@ def is_unexposed(self):
     # OpenMP split directive.
     OMP_SPLIT_DIRECTIVE = 312
 
+    # OpenMP taskgraph directive.
+    OMP_TASKGRAPH_DIRECTIVE = 313
+
     # OpenACC Compute Construct.
     OPEN_ACC_COMPUTE_DIRECTIVE = 320
 
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index 119bd68ff9814..0ccae0d146142 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -2170,6 +2170,10 @@ enum CXCursorKind {
    */
   CXCursor_OMPSplitDirective = 312,
 
+  /** OpenMP taskgraph directive.
+   */
+  CXCursor_OMPTaskgraphDirective = 313,
+
   /** OpenACC Compute Construct.
    */
   CXCursor_OpenACCComputeConstruct = 320,
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index b86f122f75460..da18c040bf570 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3287,6 +3287,9 @@ DEF_TRAVERSE_STMT(OMPBarrierDirective,
 DEF_TRAVERSE_STMT(OMPTaskwaitDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
+DEF_TRAVERSE_STMT(OMPTaskgraphDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
 DEF_TRAVERSE_STMT(OMPTaskgroupDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
index dbc76e7df8ecd..4aa0baed7eb9e 100644
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -2760,6 +2760,55 @@ class OMPTaskwaitDirective : public OMPExecutableDirective {
   }
 };
 
+/// This represents '#pragma omp taskgraph' directive.
+/// Available with OpenMP 6.0.
+///
+/// \code
+/// #pragma omp taskgraph
+/// \endcode
+///
+class OMPTaskgraphDirective final : public OMPExecutableDirective {
+  friend class ASTStmtReader;
+  friend class OMPExecutableDirective;
+  /// Build directive with the given start and end location.
+  ///
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  ///
+  OMPTaskgraphDirective(SourceLocation StartLoc, SourceLocation EndLoc)
+      : OMPExecutableDirective(OMPTaskgraphDirectiveClass,
+                               llvm::omp::OMPD_taskgraph, StartLoc, EndLoc) {}
+
+  /// Build an empty directive.
+  ///
+  explicit OMPTaskgraphDirective()
+      : OMPExecutableDirective(OMPTaskgraphDirectiveClass,
+                               llvm::omp::OMPD_taskgraph, SourceLocation(),
+                               SourceLocation()) {}
+
+public:
+  /// Creates directive with a list of \a Clauses.
+  /// \param C AST context.
+  /// \param StartLoc Starting location of the directive kind.
+  /// \param EndLoc Ending location of the directive.
+  /// \param Clauses List of clauses.
+  /// \param AssociatedStmt Statement associated with the directive.
+  static OMPTaskgraphDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt);
+
+  /// Creates an empty directive.
+  ///
+  /// \param C AST context.
+  /// \param NumClauses Number of clauses.
+  static OMPTaskgraphDirective *CreateEmpty(const ASTContext &C,
+                                            unsigned NumClauses, EmptyShell);
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPTaskgraphDirectiveClass;
+  }
+};
+
 /// This represents '#pragma omp taskgroup' directive.
 ///
 /// \code
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
index e166894ea024b..20758b1357bef 100644
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -266,6 +266,7 @@ def OMPTaskDirective : StmtNode<OMPExecutableDirective>;
 def OMPTaskyieldDirective : StmtNode<OMPExecutableDirective>;
 def OMPBarrierDirective : StmtNode<OMPExecutableDirective>;
 def OMPTaskwaitDirective : StmtNode<OMPExecutableDirective>;
+def OMPTaskgraphDirective : StmtNode<OMPExecutableDirective>;
 def OMPTaskgroupDirective : StmtNode<OMPExecutableDirective>;
 def OMPFlushDirective : StmtNode<OMPExecutableDirective>;
 def OMPDepobjDirective : StmtNode<OMPExecutableDirective>;
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 480b18960fd67..7c500847881f0 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -563,6 +563,10 @@ class SemaOpenMP : public SemaBase {
   /// Called on well-formed '\#pragma omp barrier'.
   StmtResult ActOnOpenMPBarrierDirective(SourceLocation StartLoc,
                                          SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp taskgraph'.
+  StmtResult ActOnOpenMPTaskgraphDirective(ArrayRef<OMPClause *> Clauses,
+                                           Stmt *AStmt, SourceLocation StartLoc,
+                                           SourceLocation EndLoc);
   /// Called on well-formed '\#pragma omp taskwait'.
   StmtResult ActOnOpenMPTaskwaitDirective(ArrayRef<OMPClause *> Clauses,
                                           SourceLocation StartLoc,
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 9a41f9e89df98..f9c48f5bda553 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1988,6 +1988,7 @@ enum StmtCode {
   STMT_OMP_ERROR_DIRECTIVE,
   STMT_OMP_BARRIER_DIRECTIVE,
   STMT_OMP_TASKWAIT_DIRECTIVE,
+  STMT_OMP_TASKGRAPH_DIRECTIVE,
   STMT_OMP_FLUSH_DIRECTIVE,
   STMT_OMP_DEPOBJ_DIRECTIVE,
   STMT_OMP_SCAN_DIRECTIVE,
diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp
index 9d6b315effb41..4b56c2b649b4b 100644
--- a/clang/lib/AST/StmtOpenMP.cpp
+++ b/clang/lib/AST/StmtOpenMP.cpp
@@ -966,6 +966,21 @@ OMPTaskwaitDirective *OMPTaskwaitDirective::CreateEmpty(const ASTContext &C,
   return createEmptyDirective<OMPTaskwaitDirective>(C, NumClauses);
 }
 
+OMPTaskgraphDirective *OMPTaskgraphDirective::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+    ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt) {
+  auto *Dir = createDirective<OMPTaskgraphDirective>(
+      C, Clauses, AssociatedStmt, /*NumChildren=*/0, StartLoc, EndLoc);
+  return Dir;
+}
+
+OMPTaskgraphDirective *OMPTaskgraphDirective::CreateEmpty(const ASTContext &C,
+                                                          unsigned NumClauses,
+                                                          EmptyShell) {
+  return createEmptyDirective<OMPTaskgraphDirective>(
+      C, NumClauses, /*HasAssociatedStmt=*/true);
+}
+
 OMPTaskgroupDirective *OMPTaskgroupDirective::Create(
     const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
     ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt, Expr *ReductionRef) {
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 6c3294573e9d4..b78c60584d9ce 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -909,6 +909,11 @@ void StmtPrinter::VisitOMPAssumeDirective(OMPAssumeDirective *Node) {
   PrintOMPExecutableDirective(Node);
 }
 
+void StmtPrinter::VisitOMPTaskgraphDirective(OMPTaskgraphDirective *Node) {
+  Indent() << "#pragma omp taskgraph";
+  PrintOMPExecutableDirective(Node);
+}
+
 void StmtPrinter::VisitOMPErrorDirective(OMPErrorDirective *Node) {
   Indent() << "#pragma omp error";
   PrintOMPExecutableDirective(Node);
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index 2b991ff8e7b19..cde2ad8d8dc98 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -1156,6 +1156,10 @@ void StmtProfiler::VisitOMPAssumeDirective(const OMPAssumeDirective *S) {
   VisitOMPExecutableDirective(S);
 }
 
+void StmtProfiler::VisitOMPTaskgraphDirective(const OMPTaskgraphDirective *S) {
+  VisitOMPExecutableDirective(S);
+}
+
 void StmtProfiler::VisitOMPErrorDirective(const OMPErrorDirective *S) {
   VisitOMPExecutableDirective(S);
 }
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 72015a7224275..7cca74ffe711d 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -946,6 +946,9 @@ void clang::getOpenMPCaptureRegions(
     case OMPD_teams:
       CaptureRegions.push_back(OMPD_teams);
       break;
+    case OMPD_taskgraph:
+      CaptureRegions.push_back(OMPD_taskgraph);
+      break;
     case OMPD_taskloop:
       CaptureRegions.push_back(OMPD_taskloop);
       break;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index a99a257c14a2a..dfd9f2d25ac30 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -60,6 +60,8 @@ class CGOpenMPRegionInfo : public CodeGenFunction::CGCapturedStmtInfo {
     ParallelOutlinedRegion,
     /// Region with outlined function for standalone 'task' directive.
     TaskOutlinedRegion,
+    /// Region with outlined function for standalone 'taskgraph' directive.
+    TaskgraphOutlinedRegion,
     /// Region for constructs that do not require function outlining,
     /// like 'for', 'sections', 'atomic' etc. directives.
     InlinedRegion,
@@ -2218,6 +2220,33 @@ void CGOpenMPRuntime::emitTaskyieldCall(CodeGenFunction &CGF,
     Region->emitUntiedSwitch(CGF);
 }
 
+void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
+                                        SourceLocation Loc,
+                                        const OMPExecutableDirective &D,
+                                        const Expr *IfCond) {
+  if (!CGF.HaveInsertPoint())
+    return;
+  // Outline the captured region and call it directly; IfCond is unused here.
+  CodeGenFunction OutlinedCGF(CGM, /*suppressNewContext=*/true);
+
+  const auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
+  LValue CapStruct = CGF.InitCapturedStruct(*CS);
+
+  llvm::Function *OutlinedFn = OutlinedCGF.GenerateCapturedStmtFunction(*CS);
+  // The capture struct is built in the caller, so form its pointer with CGF.
+  llvm::Value *CapturedArgsPtr =
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+          CapStruct.getPointer(CGF), CGM.VoidPtrTy);
+
+  auto &&CodeGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall(CGF, Loc, OutlinedFn,
+                                                        CapturedArgsPtr);
+  };
+  RegionCodeGenTy RCG(CodeGen);
+  RCG(CGF);
+}
+
 void CGOpenMPRuntime::emitTaskgroupRegion(CodeGenFunction &CGF,
                                           const RegionCodeGenTy &TaskgroupOpGen,
                                           SourceLocation Loc) {
@@ -6538,6 +6567,7 @@ const Expr *CGOpenMPRuntime::getNumTeamsExprForTargetDirective(
   case OMPD_taskyield:
   case OMPD_barrier:
   case OMPD_taskwait:
+  case OMPD_taskgraph:
   case OMPD_taskgroup:
   case OMPD_atomic:
   case OMPD_flush:
@@ -10462,6 +10492,7 @@ getNestedDistributeDirective(ASTContext &Ctx, const OMPExecutableDirective &D) {
     case OMPD_taskyield:
     case OMPD_barrier:
     case OMPD_taskwait:
+    case OMPD_taskgraph:
     case OMPD_taskgroup:
     case OMPD_atomic:
     case OMPD_flush:
@@ -11197,6 +11228,7 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S,
     case OMPD_taskyield:
     case OMPD_barrier:
     case OMPD_taskwait:
+    case OMPD_taskgraph:
     case OMPD_taskgroup:
     case OMPD_atomic:
     case OMPD_flush:
@@ -11767,6 +11799,7 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
     case OMPD_taskyield:
     case OMPD_barrier:
     case OMPD_taskwait:
+    case OMPD_taskgraph:
     case OMPD_taskgroup:
     case OMPD_atomic:
     case OMPD_flush:
@@ -13121,6 +13154,13 @@ void CGOpenMPSIMDRuntime::emitTaskyieldCall(CodeGenFunction &CGF,
   llvm_unreachable("Not supported in SIMD-only mode");
 }
 
+void CGOpenMPSIMDRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
+                                            SourceLocation Loc,
+                                            const OMPExecutableDirective &D,
+                                            const Expr *IfCond) {
+  llvm_unreachable("Not supported in SIMD-only mode");
+}
+
 void CGOpenMPSIMDRuntime::emitTaskgroupRegion(
     CodeGenFunction &CGF, const RegionCodeGenTy &TaskgroupOpGen,
     SourceLocation Loc) {
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index a81d3830a8035..451f08e8a32e6 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1380,6 +1380,11 @@ class CGOpenMPRuntime {
   virtual void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
                                 const OMPTaskDataTy &Data);
 
+  /// Emit code for 'taskgraph' directive.
+  virtual void emitTaskgraphCall(CodeGenFunction &CGF, SourceLocation Loc,
+                                 const OMPExecutableDirective &D,
+                                 const Expr *IfCond);
+
   /// Emit code for 'cancellation point' construct.
   /// \param CancelRegion Region kind for which the cancellation point must be
   /// emitted.
@@ -2208,6 +2213,14 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime {
   void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
                         const OMPTaskDataTy &Data) override;
 
+  /// Emit code for 'taskgraph' directive.
+  /// \param D Directive to emit.
+  /// \param IfCond Expression evaluated in if clause associated with the
+  /// taskgraph.
+  void emitTaskgraphCall(CodeGenFunction &CGF, SourceLocation Loc,
+                         const OMPExecutableDirective &D,
+                         const Expr *IfCond) override;
+
   /// Emit code for 'cancellation point' construct.
   /// \param CancelRegion Region kind for which the cancellation point must be
   /// emitted.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 52537a725bbc2..8e52d19ed4247 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -572,6 +572,7 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx,
     case OMPD_taskyield:
     case OMPD_barrier:
     case OMPD_taskwait:
+    case OMPD_taskgraph:
     case OMPD_taskgroup:
     case OMPD_atomic:
     case OMPD_flush:
@@ -660,6 +661,7 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx,
   case OMPD_taskyield:
   case OMPD_barrier:
   case OMPD_taskwait:
+  case OMPD_taskgraph:
   case OMPD_taskgroup:
   case OMPD_atomic:
   case OMPD_flush:
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 7b6035a6968b1..40df35c168fcc 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -287,6 +287,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef<const Attr *> Attrs) {
   case Stmt::OMPTaskwaitDirectiveClass:
     EmitOMPTaskwaitDirective(cast<OMPTaskwaitDirective>(*S));
     break;
+  case Stmt::OMPTaskgraphDirectiveClass:
+    EmitOMPTaskgraphDirective(cast<OMPTaskgraphDirective>(*S));
+    break;
   case Stmt::OMPTaskgroupDirectiveClass:
     EmitOMPTaskgroupDirective(cast<OMPTaskgroupDirective>(*S));
     break;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 82307d3a064c6..a2ae5bcfe1160 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1676,6 +1676,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit(
     case OMPD_error:
     case OMPD_barrier:
     case OMPD_taskwait:
+    case OMPD_taskgraph:
     case OMPD_taskgroup:
     case OMPD_flush:
     case OMPD_depobj:
@@ -5879,6 +5880,20 @@ void CodeGenFunction::EmitOMPTaskwaitDirective(const OMPTaskwaitDirective &S) {
   CGM.getOpenMPRuntime().emitTaskwaitCall(*this, S.getBeginLoc(), Data);
 }
 
+void CodeGenFunction::EmitOMPTaskgraphDirective(
+    const OMPTaskgraphDirective &S) {
+  // Use the first 'if' clause whose modifier is OMPD_unknown or OMPD_taskgraph.
+  const Expr *IfCond = nullptr;
+  for (const auto *C : S.getClausesOfKind<OMPIfClause>()) {
+    const OpenMPDirectiveKind Modifier = C->getNameModifier();
+    if (Modifier != OMPD_unknown && Modifier != OMPD_taskgraph)
+      continue;
+    IfCond = C->getCondition();
+    break;
+  }
+  CGM.getOpenMPRuntime().emitTaskgraphCall(*this, S.getBeginLoc(), S, IfCond);
+}
+
 static bool isSupportedByOpenMPIRBuilder(const OMPTaskgroupDirective &T) {
   return T.clauses().empty();
 }
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 29b87a0616992..05585aad8467f 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3951,6 +3951,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   void EmitOMPErrorDirective(const OMPErrorDirective &S);
   void EmitOMPBarrierDirective(const OMPBarrierDirective &S);
   void EmitOMPTaskwaitDirective(const OMPTaskwaitDirective &S);
+  void EmitOMPTaskgraphDirective(const OMPTaskgraphDirective &S);
   void EmitOMPTaskgroupDirective(const OMPTaskgroupDirective &S);
   void EmitOMPFlushDirective(const OMPFlushDirective &S);
   void EmitOMPDepobjDirective(const OMPDepobjDirective &S);
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
index 40d530a1f3925..5609d076bb637 100644
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1528,6 +1528,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) {
   case Stmt::OMPScopeDirectiveClass:
   case Stmt::OMPTaskDirectiveClass:
   case Stmt::OMPTaskgroupDirectiveClass:
+  case Stmt::OMPTaskgraphDirectiveClass:
   case Stmt::OMPTaskLoopDirectiveClass:
   case Stmt::OMPTaskLoopSimdDirectiveClass:
   case Stmt::OMPTaskwaitDirectiveClass:
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 22ce0190a0907..8bb928e70e6a5 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -4526,6 +4526,14 @@ getUnknownRegionParams(Sema &SemaRef) {
   return Params;
 }
 
+static SmallVector<SemaOpenMP::CapturedParamNameType>
+getTaskgraphRegionParams(Sema &SemaRef) {
+  SmallVector<SemaOpenMP::CapturedParamNameType> Params;
+  // The only entry is unnamed and untyped: __context with shared vars.
+  Params.push_back(std::make_pair(StringRef(), QualType()));
+  return Params;
+}
+
 static SmallVector<SemaOpenMP::CapturedParamNameType>
 getTaskloopRegionParams(Sema &SemaRef) {
   ASTContext &Context = SemaRef.getASTContext();
@@ -4599,6 +4607,10 @@ static void processCapturedRegions(Sema &SemaRef, OpenMPDirectiveKind DKind,
       // function directly.
       MarkAsInlined(SemaRef.getCurCapturedRegion());
       break;
+    case OMPD_taskgraph:
+      SemaRef.ActOnCapturedRegionStart(
+          Loc, CurScope, CR_OpenMP, getTaskgraphRegionParams(SemaRef), Level);
+      break;
     case OMPD_target:
       SemaRef.ActOnCapturedRegionStart(Loc, CurScope, CR_OpenMP,
                                        getTargetRegionParams(SemaRef), Level);
@@ -6565,6 +6577,12 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
            "No associated statement allowed for 'omp taskwait' directive");
     Res = ActOnOpenMPTaskwaitDirective(ClausesWithImplicit, StartLoc, EndLoc);
     break;
+  case OMPD_taskgraph:
+    assert(AStmt &&
+           "Associated statement required for 'omp taskgraph' directive");
+    Res = ActOnOpenMPTaskgraphDirective(ClausesWithImplicit, AStmt, StartLoc,
+                                        EndLoc);
+    break;
   case OMPD_taskgroup:
     Res = ActOnOpenMPTaskgroupDirective(ClausesWithImplicit, AStmt, StartLoc,
                                         EndLoc);
@@ -11455,6 +11473,24 @@ SemaOpenMP::ActOnOpenMPTaskwaitDirective(ArrayRef<OMPClause *> Clauses,
                                       Clauses);
 }
 
+StmtResult
+SemaOpenMP::ActOnOpenMPTaskgraphDirective(ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc) {
+  // Reject 'taskgraph' unless the OpenMP version is at least 60 (i.e. 6.0).
+  const unsigned OMPVersion = getLangOpts().OpenMP;
+  if (OMPVersion < 60) {
+    Diag(StartLoc, diag::err_omp_unexpected_directive)
+        << 1 << getOpenMPDirectiveName(OMPD_taskgraph, OMPVersion);
+    return StmtError();
+  }
+  if (!AStmt)
+    return StmtError();
+  assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
+
+  return OMPTaskgraphDirective::Create(getASTContext(), StartLoc, EndLoc,
+                                       Clauses, AStmt);
+}
+
 StmtResult
 SemaOpenMP::ActOnOpenMPTaskgroupDirective(ArrayRef<OMPClause *> Clauses,
                                           Stmt *AStmt, SourceLocation StartLoc,
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index f49b3eceace46..91c5314886692 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -10012,6 +10012,17 @@ TreeTransform<Derived>::TransformOMPAssumeDirective(OMPAssumeDirective *D) {
   return Res;
 }
 
+template <typename Derived>
+StmtResult TreeTransform<Derived>::TransformOMPTaskgraphDirective(
+    OMPTaskgraphDirective *D) {
+  DeclarationNameInfo DirName; // Left default-constructed.
+  getDerived().getSema().OpenMP().StartOpenMPDSABlock(
+      OMPD_taskgraph, DirName, nullptr, D->getBeginLoc());
+  StmtResult Res = getDerived().TransformOMPExecutableDirective(D);
+  getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); // Close block.
+  return Res;
+}
+
 template <typename Derived>
 StmtResult
 TreeTransform<Derived>::TransformOMPErrorDirective(OMPErrorDirective *D) {
diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp
index 4ada1dc58042d..41c6cc3bc49ed 100644
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@@ -2637,6 +2637,11 @@ void ASTStmtReader::VisitOMPAssumeDirective(OMPAssumeDirective *D) {
   VisitOMPExecutableDirective(D);
 }
 
+void ASTStmtReader::VisitOMPTaskgraphDirective(OMPTaskgraphDirective *D) {
+  VisitStmt(D);
+  VisitOMPExecutableDirective(D); // No taskgraph-specific fields are read.
+}
+
 void ASTStmtReader::VisitOMPErrorDirective(OMPErrorDirective *D) {
   VisitStmt(D);
   // The NumClauses field was read in ReadStmtFromStream.
@@ -3805,6 +3810,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
           Context, Record[ASTStmtReader::NumStmtFields], Empty);
       break;
 
+    case STMT_OMP_TASKGRAPH_DIRECTIVE:
+      S = OMPTaskgraphDirective::CreateEmpty(
+          Context, Record[ASTStmtReader::NumStmtFields], Empty);
+      break;
+
     case STMT_OMP_ERROR_DIRECTIVE:
       S = OMPErrorDirective::CreateEmpty(
           Context, Record[ASTStmtReader::NumStmtFields], Empty);
diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp
index 47cbef06c3cc4..10aea326920c7 100644
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@@ -2730,6 +2730,12 @@ void ASTStmtWriter::VisitOMPAssumeDirective(OMPAssumeDirective *D) {
   Code = serialization::STMT_OMP_ASSUME_DIRECTIVE;
 }
 
+void ASTStmtWriter::VisitOMPTaskgraphDirective(OMPTaskgraphDirective *D) {
+  VisitStmt(D);
+  VisitOMPExecutableDirective(D); // No taskgraph-specific fields are written.
+  Code = serialization::STMT_OMP_TASKGRAPH_DIRECTIVE;
+}
+
 void ASTStmtWriter::VisitOMPErrorDirective(OMPErrorDirective *D) {
   VisitStmt(D);
   Record.push_back(D->getNumClauses());
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 1efe7e6f84b23..e34100f17e83d 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -1749,6 +1749,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
     case Stmt::OMPTaskyieldDirectiveClass:
     case Stmt::OMPBarrierDirectiveClass:
     case Stmt::OMPTaskwaitDirectiveClass:
+    case Stmt::OMPTaskgraphDirectiveClass:
     case Stmt::OMPErrorDirectiveClass:
     case Stmt::OMPTaskgroupDirectiveClass:
     case Stmt::OMPFlushDirectiveClass:
diff --git a/clang/test/OpenMP/taskgraph_bad_clauses.cpp b/clang/test/OpenMP/taskgraph_bad_clauses.cpp
new file mode 100644
index 0000000000000..16b38a35fb4bd
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_bad_clauses.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -verify=expected,omp51 -fopenmp -fopenmp-version=51 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify=expected,omp60 -fopenmp -fopenmp-version=60 -ferror-limit 100 -o - %s
+
+int main() {
+  int data[10];
+#pragma omp taskgraph map(tofrom: data[0:10]) // expected-error {{unexpected OpenMP clause 'map' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}}
+  {
+  }
+#pragma omp taskgraph depend(inout: data) // expected-error {{unexpected OpenMP clause 'depend' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}}
+  {
+  }
+#pragma omp taskgraph if(taskgraph: 1) // omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}}
+  {
+  }
+#pragma omp taskgraph if(cancel: 1) // omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}} expected-error {{directive name modifier 'cancel' is not allowed for '#pragma omp taskgraph'}}
+  {
+  }
+  return 0;
+}
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 1a92542c9595c..94c7501ff7887 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -6369,6 +6369,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) {
     return cxstring::createRef("OMPTaskwaitDirective");
   case CXCursor_OMPAssumeDirective:
     return cxstring::createRef("OMPAssumeDirective");
+  case CXCursor_OMPTaskgraphDirective:
+    return cxstring::createRef("OMPTaskgraphDirective");
   case CXCursor_OMPErrorDirective:
     return cxstring::createRef("OMPErrorDirective");
   case CXCursor_OMPTaskgroupDirective:
diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp
index 242380c68c667..2c07cb0c73ca4 100644
--- a/clang/tools/libclang/CXCursor.cpp
+++ b/clang/tools/libclang/CXCursor.cpp
@@ -757,6 +757,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent,
   case Stmt::OMPTaskwaitDirectiveClass:
     K = CXCursor_OMPTaskwaitDirective;
     break;
+  case Stmt::OMPTaskgraphDirectiveClass:
+    K = CXCursor_OMPTaskgraphDirective;
+    break;
   case Stmt::OMPErrorDirectiveClass:
     K = CXCursor_OMPErrorDirective;
     break;

>From 10f72c814a5556d72fa1f47910e38d7718e89a06 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Tue, 21 Apr 2026 17:01:24 -0500
Subject: [PATCH 05/24] [OpenMP] Add graph_id/graph_reset tests

Assisted-By: Codex with gpt-5.3

commit-id:9545b560

Pull Request: https://github.com/llvm/llvm-project/pull/194049
---
 .../OpenMP/taskgraph_clauses_ast_print.cpp    | 94 +++++++++++++++++++
 .../OpenMP/taskgraph_clauses_messages.cpp     | 36 +++++++
 2 files changed, 130 insertions(+)
 create mode 100644 clang/test/OpenMP/taskgraph_clauses_ast_print.cpp
 create mode 100644 clang/test/OpenMP/taskgraph_clauses_messages.cpp

diff --git a/clang/test/OpenMP/taskgraph_clauses_ast_print.cpp b/clang/test/OpenMP/taskgraph_clauses_ast_print.cpp
new file mode 100644
index 0000000000000..8ac721634380a
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_clauses_ast_print.cpp
@@ -0,0 +1,94 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check AST and unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -ast-dump  %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-print    %s | FileCheck %s --check-prefix=PRINT
+
+#ifndef HEADER
+#define HEADER
+
+void taskgraph_clauses() {
+  int A = 1;
+
+  // --- graph_id clause ---
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_idClause
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 0
+  // PRINT: #pragma omp taskgraph graph_id(0)
+  #pragma omp taskgraph graph_id(0)
+  {}
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_idClause
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 42
+  // PRINT: #pragma omp taskgraph graph_id(42)
+  #pragma omp taskgraph graph_id(42)
+  {}
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_idClause
+  // DUMP: BinaryOperator {{.*}} 'int' '+'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 10
+  // PRINT: #pragma omp taskgraph graph_id(A + 10)
+  #pragma omp taskgraph graph_id(A + 10)
+  {}
+
+  // --- graph_reset clause ---
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_resetClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+  // PRINT: #pragma omp taskgraph graph_reset(false)
+  #pragma omp taskgraph graph_reset(false)
+  {}
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_resetClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+  // PRINT: #pragma omp taskgraph graph_reset(true)
+  #pragma omp taskgraph graph_reset(true)
+  {}
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_resetClause
+  // DUMP-NEXT: BinaryOperator {{.*}} 'bool' '>'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 5
+  // PRINT: #pragma omp taskgraph graph_reset(A > 5)
+  #pragma omp taskgraph graph_reset(A > 5)
+  {}
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_resetClause
+  // PRINT: #pragma omp taskgraph graph_reset
+  #pragma omp taskgraph graph_reset
+  {}
+
+  // --- Combinations using both clauses ---
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_idClause
+  // DUMP: OMPGraph_resetClause
+  // PRINT: #pragma omp taskgraph graph_id(1) graph_reset(true)
+  #pragma omp taskgraph graph_id(1) graph_reset(true)
+  {}
+
+  // DUMP: OMPTaskgraphDirective
+  // DUMP: OMPGraph_resetClause
+  // DUMP: OMPGraph_idClause
+  // PRINT: #pragma omp taskgraph graph_reset(false) graph_id(2)
+  #pragma omp taskgraph graph_reset(false) graph_id(2)
+  {}
+}
+#endif
diff --git a/clang/test/OpenMP/taskgraph_clauses_messages.cpp b/clang/test/OpenMP/taskgraph_clauses_messages.cpp
new file mode 100644
index 0000000000000..0327f52475d98
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_clauses_messages.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -verify=expected,omp51 -fsyntax-only %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -verify=expected,omp60 -fsyntax-only %s
+
+// Tests that the 'graph_id' and 'graph_reset' clauses are accepted in OpenMP 6.0
+// and rejected in prior versions on the 'taskgraph' directive. Also tests
+// duplicate-clause and invalid-condition diagnostics.
+
+void foo() {}
+
+void taskgraph_clauses_messages() {
+  int A = 1;
+
+  // Basic version error tests.
+  #pragma omp taskgraph graph_id(0) // omp51-error {{unexpected OpenMP clause 'graph_id' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}}
+  {}
+
+  #pragma omp taskgraph graph_reset(true) // omp51-error {{unexpected OpenMP clause 'graph_reset' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}}
+  {}
+
+  // Same, without argument.
+  #pragma omp taskgraph graph_reset // omp51-error {{unexpected OpenMP clause 'graph_reset' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}}
+  {}
+
+  // Duplicate clause tests (OMP 6.0 only; in OMP 5.1 both are unexpected).
+  #pragma omp taskgraph graph_id(0) graph_id(1) // omp51-error {{unexpected OpenMP clause 'graph_id' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP clause 'graph_id' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}} expected-error {{directive '#pragma omp taskgraph' cannot contain more than one 'graph_id' clause}}
+  {}
+
+  #pragma omp taskgraph graph_reset(true) graph_reset(false) // omp51-error {{unexpected OpenMP clause 'graph_reset' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP clause 'graph_reset' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}} expected-error {{directive '#pragma omp taskgraph' cannot contain more than one 'graph_reset' clause}}
+  {}
+
+  #pragma omp taskgraph graph_id(foo()) // omp51-error {{unexpected OpenMP clause 'graph_id' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}} omp60-error {{expression must have integral or unscoped enumeration type, not 'void'}}
+  {}
+
+  #pragma omp taskgraph graph_reset(foo()) // omp51-error {{unexpected OpenMP clause 'graph_reset' in directive '#pragma omp taskgraph'}} omp51-error {{unexpected OpenMP directive '#pragma omp taskgraph'}} omp60-error {{value of type 'void' is not contextually convertible to 'bool'}}
+  {}
+}

>From bb5b2560ffc93c252cb00e2716785393a68e8ce6 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Tue, 5 May 2026 08:31:54 -0500
Subject: [PATCH 06/24] [OpenMP] Support 'taskgraph' semantics via new libomp
 API entry points

This patch supports the OpenMP 6.0 'taskgraph' directive via several
new API entry points for libomp.

New entry points are used for several reasons.  The first reason
is to pass extra information from the compiler relevant to the
taskgraph-recording case -- e.g. the __kmpc_taskgraph_task entry point
has extra arguments relating to shared data.  The second reason is to
reduce the potential overhead of the taskgraph implementation on the
rest of the runtime.  A third intention is to capture OpenMP semantics
at a slightly higher level: in particular when we come to add offload
target tasks to this implementation, those will also use new API entry
points to hopefully allow dependencies to be handled entirely on the GPU,
rather than by being wrapped in a host task.

Pull Request: https://github.com/llvm/llvm-project/pull/194051
---
 .../clang/Basic/DiagnosticSemaKinds.td        |   2 +
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 718 +++++++++++++-----
 clang/lib/CodeGen/CGOpenMPRuntime.h           |   4 +-
 clang/lib/CodeGen/CodeGenFunction.h           |  15 +
 clang/lib/Sema/SemaOpenMP.cpp                 |   4 +
 .../test/OpenMP/taskgraph_taskwait_nodeps.cpp |  12 +
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |  11 +
 7 files changed, 555 insertions(+), 211 deletions(-)
 create mode 100644 clang/test/OpenMP/taskgraph_taskwait_nodeps.cpp

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index c69b2ce3648f8..fd442a7b93492 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12341,6 +12341,8 @@ def err_omp_single_copyprivate_with_nowait : Error<
   "the 'copyprivate' clause must not be used with the 'nowait' clause">;
 def err_omp_nowait_clause_without_depend: Error<
   "directive '#pragma omp taskwait' cannot use 'nowait' clause without 'depend' clause">;
+def err_omp_taskgraph_taskwait_without_depend: Error<
+  "directive '#pragma omp taskwait' within '#pragma omp taskgraph' must use 'depend' clause to be task-generating">;
 def note_omp_nowait_clause_here : Note<
   "'nowait' clause is here">;
 def err_omp_single_decl_in_declare_simd_variant : Error<
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index dfd9f2d25ac30..30514facd0546 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -33,6 +33,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Frontend/OpenMP/OMP.h.inc"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalValue.h"
@@ -348,6 +349,26 @@ class CGOpenMPTargetRegionInfo final : public CGOpenMPRegionInfo {
   StringRef HelperName;
 };
 
+/// API for captured statement code generation in OpenMP taskgraphs.
+class CGOpenMPTaskgraphRegionInfo final : public CGOpenMPRegionInfo {
+public:
+  CGOpenMPTaskgraphRegionInfo(const CapturedStmt &CS,
+                              const RegionCodeGenTy &CodeGen)
+      : CGOpenMPRegionInfo(CS, TaskgraphOutlinedRegion, CodeGen,
+                           llvm::omp::OMPD_taskgraph, false) {}
+  /// A taskgraph region reports no thread ID variable.
+  const VarDecl *getThreadIDVariable() const override { return nullptr; }
+
+  /// Get the name of the capture helper.
+  StringRef getHelperName() const override { return "taskgraph.omp_outlined."; }
+  /// True for region infos whose region kind is TaskgraphOutlinedRegion.
+  static bool classof(const CGCapturedStmtInfo *Info) {
+    return CGOpenMPRegionInfo::classof(Info) &&
+           cast<CGOpenMPRegionInfo>(Info)->getRegionKind() ==
+               TaskgraphOutlinedRegion;
+  }
+};
+
 static void EmptyCodeGen(CodeGenFunction &, PrePostActionTy &) {
   llvm_unreachable("No codegen for expressions");
 }
@@ -2227,24 +2248,98 @@ void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
   if (!CGF.HaveInsertPoint())
     return;
 
+  // The nogroup clause doesn't support an argument yet.  FIXME.
+  const OMPNogroupClause *NoGroupClause = D.getSingleClause<OMPNogroupClause>();
+  llvm::Value *NoGroup;
+  if (NoGroupClause) {
+    NoGroup = CGF.Builder.getInt32(1);
+  } else {
+    NoGroup = CGF.Builder.getInt32(0);
+  }
+
+  // Lower 'graph_reset' to an i32 flag.  The i1 condition is zero-extended so
+  // 'true' is passed as 1 (sign extension would pass -1).  NOTE(review): this
+  // assumes an argument-less 'graph_reset' still has a condition -- confirm.
+  const OMPGraphResetClause *GraphResetClause =
+      D.getSingleClause<OMPGraphResetClause>();
+  llvm::Value *GraphReset = CGF.Builder.getInt32(0);
+  if (GraphResetClause) {
+    const Expr *Cond = GraphResetClause->getCondition();
+    llvm::Value *CondVal = CGF.EvaluateExprAsBool(Cond);
+    GraphReset = CGF.Builder.CreateZExt(CondVal, CGF.Int32Ty);
+  }
+
+  llvm::Value *GraphId = CGF.Builder.getInt32(0);
+  const OMPGraphIdClause *GraphIdClause = D.getSingleClause<OMPGraphIdClause>();
+  if (GraphIdClause) {
+    const auto *E = GraphIdClause->getId();
+    auto *GraphIdVal = CGF.EmitScalarExpr(E);
+    GraphId =
+        CGF.Builder.CreateIntCast(GraphIdVal, CGM.Int32Ty, /*isSigned=*/false);
+  }
+
   CodeGenFunction OutlinedCGF(CGM, /*suppressNewContext=*/true);
 
   const auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
+
+  auto BodyGen = [CS](CodeGenFunction &CGF, PrePostActionTy &) {
+    CodeGenFunction::OMPWithinTaskgraphRAII WithinTaskgraph(CGF);
+    CGF.EmitStmt(CS->getCapturedStmt());
+  };
+
   LValue CapStruct = CGF.InitCapturedStruct(*CS);
+  CGOpenMPTaskgraphRegionInfo TaskgraphRegion(*CS, BodyGen);
+  CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(OutlinedCGF,
+                                                  &TaskgraphRegion);
 
   llvm::Function *OutlinedFn = OutlinedCGF.GenerateCapturedStmtFunction(*CS);
 
-  llvm::Value *CapturedArgsPtr =
+  // Create an internal-linkage global variable to hold the taskgraph handle.
+  std::string GraphHandleName = getName({"omp", "taskgraph", "handle"});
+  auto *GraphHandle = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy,
+      /*IsConstant=*/false, llvm::GlobalValue::InternalLinkage,
+      llvm::Constant::getNullValue(CGM.VoidPtrTy), GraphHandleName);
+
+  std::array<llvm::Value *, 8> Args{
+      emitUpdateLocation(CGF, Loc),
+      getThreadID(CGF, Loc),
+      GraphHandle,
+      GraphId,
+      GraphReset,
+      NoGroup,
+      CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(OutlinedFn,
+                                                      CGM.VoidPtrTy),
       CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-          CapStruct.getPointer(OutlinedCGF), CGM.VoidPtrTy);
+          CapStruct.getPointer(OutlinedCGF), CGM.VoidPtrTy)};
 
-  auto &&CodeGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
-    Action.Enter(CGF);
-    CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall(CGF, Loc, OutlinedFn,
-                                                        CapturedArgsPtr);
+  auto &&ThenGen = [&CGF, this, &Args](CodeGenFunction &, PrePostActionTy &) {
+    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                            CGM.getModule(), OMPRTL___kmpc_taskgraph),
+                        Args);
   };
-  RegionCodeGenTy RCG(CodeGen);
-  RCG(CGF);
+  auto &&ElseGen = [&CGF, this, &OutlinedFn, &CapStruct, &Loc,
+                    &OutlinedCGF](CodeGenFunction &, PrePostActionTy &) {
+    llvm::Value *CapturedArgsPtr =
+        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+            CapStruct.getPointer(OutlinedCGF), CGM.VoidPtrTy);
+
+    auto &&CodeGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+      Action.Enter(CGF);
+      CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall(CGF, Loc, OutlinedFn,
+                                                          CapturedArgsPtr);
+    };
+    RegionCodeGenTy RCG(CodeGen);
+    RCG(CGF);
+  };
+
+  if (IfCond) {
+    emitIfClause(CGF, IfCond, ThenGen, ElseGen);
+  } else {
+    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                            CGM.getModule(), OMPRTL___kmpc_taskgraph),
+                        Args);
+  }
 }
 
 void CGOpenMPRuntime::emitTaskgroupRegion(CodeGenFunction &CGF,
@@ -3776,11 +3871,11 @@ static void getKmpAffinityType(ASTContext &C, QualType &KmpTaskAffinityInfoTy) {
   }
 }
 
-CGOpenMPRuntime::TaskResultTy
-CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
-                              const OMPExecutableDirective &D,
-                              llvm::Function *TaskFunction, QualType SharedsTy,
-                              Address Shareds, const OMPTaskDataTy &Data) {
+CGOpenMPRuntime::TaskResultTy CGOpenMPRuntime::emitTaskInit(
+    CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D,
+    llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
+    const OMPTaskDataTy &Data, bool ForTaskgraph,
+    std::array<llvm::Value *, 3> &TaskAllocArgs) {
   ASTContext &C = CGM.getContext();
   llvm::SmallVector<PrivateDataTy, 4> Privates;
   // Aggregate privates and sort them by the alignment.
@@ -3927,6 +4022,11 @@ CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
       SharedsSize, CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
           TaskEntry, KmpRoutineEntryPtrTy)};
   llvm::Value *NewTask;
+  if (ForTaskgraph) {
+    TaskAllocArgs[0] = TaskFlags;
+    TaskAllocArgs[1] = KmpTaskTWithPrivatesTySize;
+    TaskAllocArgs[2] = SharedsSize;
+  }
   if (D.hasClausesOfKind<OMPNowaitClause>()) {
     // Check if we have any device clause associated with the directive.
     const Expr *Device = nullptr;
@@ -4684,114 +4784,169 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
   if (!CGF.HaveInsertPoint())
     return;
 
-  TaskResultTy Result =
-      emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
-  llvm::Value *NewTask = Result.NewTask;
-  llvm::Function *TaskEntry = Result.TaskEntry;
-  llvm::Value *NewTaskNewTaskTTy = Result.NewTaskNewTaskTTy;
-  LValue TDBase = Result.TDBase;
-  const RecordDecl *KmpTaskTQTyRD = Result.KmpTaskTQTyRD;
-  // Process list of dependences.
-  Address DependenciesArray = Address::invalid();
-  llvm::Value *NumOfElements;
-  std::tie(NumOfElements, DependenciesArray) =
-      emitDependClause(CGF, Data.Dependences, Loc);
-
-  // NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
-  // libcall.
-  // Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
-  // kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
-  // kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence
-  // list is not empty
-  llvm::Value *ThreadID = getThreadID(CGF, Loc);
-  llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
-  llvm::Value *TaskArgs[] = { UpLoc, ThreadID, NewTask };
-  llvm::Value *DepTaskArgs[7];
-  if (!Data.Dependences.empty()) {
-    DepTaskArgs[0] = UpLoc;
-    DepTaskArgs[1] = ThreadID;
-    DepTaskArgs[2] = NewTask;
-    DepTaskArgs[3] = NumOfElements;
-    DepTaskArgs[4] = DependenciesArray.emitRawPointer(CGF);
-    DepTaskArgs[5] = CGF.Builder.getInt32(0);
-    DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
-  }
-  auto &&ThenCodeGen = [this, &Data, TDBase, KmpTaskTQTyRD, &TaskArgs,
-                        &DepTaskArgs](CodeGenFunction &CGF, PrePostActionTy &) {
-    if (!Data.Tied) {
-      auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
-      LValue PartIdLVal = CGF.EmitLValueForField(TDBase, *PartIdFI);
-      CGF.EmitStoreOfScalar(CGF.Builder.getInt32(0), PartIdLVal);
+  auto &&TaskgraphTaskCodeGen = [this, &Loc, &D, TaskFunction, &SharedsTy,
+                                 &Shareds, &Data](CodeGenFunction &CGF,
+                                                  PrePostActionTy &) {
+    llvm::Value *ThreadId = getThreadID(CGF, Loc);
+    llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
+    std::array<llvm::Value *, 9> TGTaskArgs;
+    std::array<llvm::Value *, 3> TaskAllocArgs;
+    TaskResultTy Result = emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy,
+                                       Shareds, Data, true, TaskAllocArgs);
+    Address DependenciesArray = Address::invalid();
+    llvm::Value *NumOfElements;
+    std::tie(NumOfElements, DependenciesArray) =
+        emitDependClause(CGF, Data.Dependences, Loc);
+    TGTaskArgs[0] = UpLoc;
+    TGTaskArgs[1] = ThreadId;
+    TGTaskArgs[2] = Result.NewTask;
+    TGTaskArgs[3] = TaskAllocArgs[0]; // TaskFlags
+    TGTaskArgs[4] = TaskAllocArgs[1]; // KmpTaskTWithPrivatesTySize
+    TGTaskArgs[5] = Shareds.emitRawPointer(CGF);
+    TGTaskArgs[6] = TaskAllocArgs[2]; // SharedsSize
+    if (auto RecType = dyn_cast<RecordType>(SharedsTy)) {
+      auto *RD = RecType->getAsRecordDecl();
+      if (RD->fields().empty()) {
+        // FIXME: The condition might not be precisely correct here.
+        TGTaskArgs[6] = CGF.Builder.getSize(0);
+      }
     }
-    if (!Data.Dependences.empty()) {
-      CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_omp_task_with_deps),
-          DepTaskArgs);
+    if (Data.Dependences.size() == 0) {
+      TGTaskArgs[7] = CGF.Builder.getInt32(0);
+      TGTaskArgs[8] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
     } else {
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                              CGM.getModule(), OMPRTL___kmpc_omp_task),
-                          TaskArgs);
+      TGTaskArgs[7] = NumOfElements;
+      TGTaskArgs[8] = DependenciesArray.emitRawPointer(CGF);
     }
-    // Check if parent region is untied and build return for untied task;
-    if (auto *Region =
-            dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
-      Region->emitUntiedSwitch(CGF);
+    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                            CGM.getModule(), OMPRTL___kmpc_taskgraph_task),
+                        TGTaskArgs);
   };
 
-  llvm::Value *DepWaitTaskArgs[7];
-  if (!Data.Dependences.empty()) {
-    DepWaitTaskArgs[0] = UpLoc;
-    DepWaitTaskArgs[1] = ThreadID;
-    DepWaitTaskArgs[2] = NumOfElements;
-    DepWaitTaskArgs[3] = DependenciesArray.emitRawPointer(CGF);
-    DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
-    DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
-    DepWaitTaskArgs[6] =
-        llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
-  }
-  auto &M = CGM.getModule();
-  auto &&ElseCodeGen = [this, &M, &TaskArgs, ThreadID, NewTaskNewTaskTTy,
-                        TaskEntry, &Data, &DepWaitTaskArgs,
-                        Loc](CodeGenFunction &CGF, PrePostActionTy &) {
-    CodeGenFunction::RunCleanupsScope LocalScope(CGF);
-    // Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
-    // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
-    // ndeps_noalias, kmp_depend_info_t *noalias_dep_list); if dependence info
-    // is specified.
-    if (!Data.Dependences.empty())
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                              M, OMPRTL___kmpc_omp_taskwait_deps_51),
-                          DepWaitTaskArgs);
-    // Call proxy_task_entry(gtid, new_task);
-    auto &&CodeGen = [TaskEntry, ThreadID, NewTaskNewTaskTTy,
-                      Loc](CodeGenFunction &CGF, PrePostActionTy &Action) {
-      Action.Enter(CGF);
-      llvm::Value *OutlinedFnArgs[] = {ThreadID, NewTaskNewTaskTTy};
-      CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall(CGF, Loc, TaskEntry,
-                                                          OutlinedFnArgs);
+  auto &&NonTaskgraphTaskCodeGen = [this, &Loc, &D, TaskFunction, &SharedsTy,
+                                    &Shareds, IfCond,
+                                    &Data](CodeGenFunction &CGF,
+                                           PrePostActionTy &) {
+    std::array<llvm::Value *, 3> DummyArray;
+    TaskResultTy Result = emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy,
+                                       Shareds, Data, false, DummyArray);
+    llvm::Value *NewTask = Result.NewTask;
+    llvm::Function *TaskEntry = Result.TaskEntry;
+    llvm::Value *NewTaskNewTaskTTy = Result.NewTaskNewTaskTTy;
+    LValue TDBase = Result.TDBase;
+    const RecordDecl *KmpTaskTQTyRD = Result.KmpTaskTQTyRD;
+    // Process list of dependences.
+    Address DependenciesArray = Address::invalid();
+    llvm::Value *NumOfElements;
+    std::tie(NumOfElements, DependenciesArray) =
+        emitDependClause(CGF, Data.Dependences, Loc);
+
+    // NOTE: routine and part_id fields are initialized by
+    // __kmpc_omp_task_alloc() libcall. Build kmp_int32
+    // __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid, kmp_task_t
+    // *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
+    // ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence list is
+    // not empty
+    llvm::Value *ThreadID = getThreadID(CGF, Loc);
+    llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
+    llvm::Value *TaskArgs[] = {UpLoc, ThreadID, NewTask};
+    llvm::Value *DepTaskArgs[7];
+    if (!Data.Dependences.empty()) {
+      DepTaskArgs[0] = UpLoc;
+      DepTaskArgs[1] = ThreadID;
+      DepTaskArgs[2] = NewTask;
+      DepTaskArgs[3] = NumOfElements;
+      DepTaskArgs[4] = DependenciesArray.emitRawPointer(CGF);
+      DepTaskArgs[5] = CGF.Builder.getInt32(0);
+      DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+    }
+    auto &&ThenCodeGen = [this, &Data, TDBase, KmpTaskTQTyRD, &TaskArgs,
+                          &DepTaskArgs](CodeGenFunction &CGF,
+                                        PrePostActionTy &) {
+      if (!Data.Tied) {
+        auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
+        LValue PartIdLVal = CGF.EmitLValueForField(TDBase, *PartIdFI);
+        CGF.EmitStoreOfScalar(CGF.Builder.getInt32(0), PartIdLVal);
+      }
+      if (!Data.Dependences.empty()) {
+        CGF.EmitRuntimeCall(
+            OMPBuilder.getOrCreateRuntimeFunction(
+                CGM.getModule(), OMPRTL___kmpc_omp_task_with_deps),
+            DepTaskArgs);
+      } else {
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                CGM.getModule(), OMPRTL___kmpc_omp_task),
+                            TaskArgs);
+      }
+      // Check if parent region is untied and build return for untied task;
+      if (auto *Region =
+              dyn_cast_or_null<CGOpenMPRegionInfo>(CGF.CapturedStmtInfo))
+        Region->emitUntiedSwitch(CGF);
     };
 
-    // Build void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
-    // kmp_task_t *new_task);
-    // Build void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
-    // kmp_task_t *new_task);
-    RegionCodeGenTy RCG(CodeGen);
-    CommonActionTy Action(OMPBuilder.getOrCreateRuntimeFunction(
-                              M, OMPRTL___kmpc_omp_task_begin_if0),
-                          TaskArgs,
-                          OMPBuilder.getOrCreateRuntimeFunction(
-                              M, OMPRTL___kmpc_omp_task_complete_if0),
-                          TaskArgs);
-    RCG.setAction(Action);
-    RCG(CGF);
+    llvm::Value *DepWaitTaskArgs[7];
+    if (!Data.Dependences.empty()) {
+      DepWaitTaskArgs[0] = UpLoc;
+      DepWaitTaskArgs[1] = ThreadID;
+      DepWaitTaskArgs[2] = NumOfElements;
+      DepWaitTaskArgs[3] = DependenciesArray.emitRawPointer(CGF);
+      DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
+      DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+      DepWaitTaskArgs[6] =
+          llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
+    }
+    auto &M = CGM.getModule();
+    auto &&ElseCodeGen = [this, &M, &TaskArgs, ThreadID, NewTaskNewTaskTTy,
+                          TaskEntry, &Data, &DepWaitTaskArgs,
+                          Loc](CodeGenFunction &CGF, PrePostActionTy &) {
+      CodeGenFunction::RunCleanupsScope LocalScope(CGF);
+      // Build void __kmpc_omp_wait_deps(ident_t *, kmp_int32 gtid,
+      // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
+      // ndeps_noalias, kmp_depend_info_t *noalias_dep_list); if dependence info
+      // is specified.
+      if (!Data.Dependences.empty())
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                M, OMPRTL___kmpc_omp_taskwait_deps_51),
+                            DepWaitTaskArgs);
+      // Call proxy_task_entry(gtid, new_task);
+      auto &&CodeGen = [TaskEntry, ThreadID, NewTaskNewTaskTTy,
+                        Loc](CodeGenFunction &CGF, PrePostActionTy &Action) {
+        Action.Enter(CGF);
+        llvm::Value *OutlinedFnArgs[] = {ThreadID, NewTaskNewTaskTTy};
+        CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall(CGF, Loc, TaskEntry,
+                                                            OutlinedFnArgs);
+      };
+
+      // Build void __kmpc_omp_task_begin_if0(ident_t *, kmp_int32 gtid,
+      // kmp_task_t *new_task);
+      // Build void __kmpc_omp_task_complete_if0(ident_t *, kmp_int32 gtid,
+      // kmp_task_t *new_task);
+      RegionCodeGenTy RCG(CodeGen);
+      CommonActionTy Action(OMPBuilder.getOrCreateRuntimeFunction(
+                                M, OMPRTL___kmpc_omp_task_begin_if0),
+                            TaskArgs,
+                            OMPBuilder.getOrCreateRuntimeFunction(
+                                M, OMPRTL___kmpc_omp_task_complete_if0),
+                            TaskArgs);
+      RCG.setAction(Action);
+      RCG(CGF);
+    };
+
+    if (IfCond) {
+      emitIfClause(CGF, IfCond, ThenCodeGen, ElseCodeGen);
+    } else {
+      RegionCodeGenTy ThenRCG(ThenCodeGen);
+      ThenRCG(CGF);
+    }
   };
 
-  if (IfCond) {
-    emitIfClause(CGF, IfCond, ThenCodeGen, ElseCodeGen);
+  if (CGF.getOMPWithinTaskgraph()) {
+    // Lexically within taskgraph, always replayable.
+    RegionCodeGenTy TaskgraphRCG(TaskgraphTaskCodeGen);
+    TaskgraphRCG(CGF);
   } else {
-    RegionCodeGenTy ThenRCG(ThenCodeGen);
-    ThenRCG(CGF);
+    RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskCodeGen);
+    NonTaskgraphRCG(CGF);
   }
 }
 
@@ -4803,15 +4958,11 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
                                        const OMPTaskDataTy &Data) {
   if (!CGF.HaveInsertPoint())
     return;
-  TaskResultTy Result =
-      emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data);
-  // NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
-  // libcall.
-  // Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
-  // if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
-  // sched, kmp_uint64 grainsize, void *task_dup);
-  llvm::Value *ThreadID = getThreadID(CGF, Loc);
-  llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
+
+  std::array<llvm::Value *, 3> TaskAllocArgs;
+  TaskResultTy TaskInitResult = emitTaskInit(
+      CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data, true, TaskAllocArgs);
+
   llvm::Value *IfVal;
   if (IfCond) {
     IfVal = CGF.Builder.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.IntTy,
@@ -4820,68 +4971,177 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
     IfVal = llvm::ConstantInt::getSigned(CGF.IntTy, /*V=*/1);
   }
 
-  LValue LBLVal = CGF.EmitLValueForField(
-      Result.TDBase,
-      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound));
-  const auto *LBVar =
-      cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
-  CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(), LBLVal.getQuals(),
-                       /*IsInitializer=*/true);
-  LValue UBLVal = CGF.EmitLValueForField(
-      Result.TDBase,
-      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound));
-  const auto *UBVar =
-      cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
-  CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(), UBLVal.getQuals(),
-                       /*IsInitializer=*/true);
-  LValue StLVal = CGF.EmitLValueForField(
-      Result.TDBase,
-      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTStride));
-  const auto *StVar =
-      cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
-  CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(),
-                       /*IsInitializer=*/true);
-  // Store reductions address.
-  LValue RedLVal = CGF.EmitLValueForField(
-      Result.TDBase,
-      *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTReductions));
-  if (Data.Reductions) {
-    CGF.EmitStoreOfScalar(Data.Reductions, RedLVal);
+  enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 };
+
+  auto &&TaskgraphTaskloopCodeGen = [this, &Loc, &D, &TaskInitResult, &Shareds,
+                                     IfVal, &Data,
+                                     &TaskAllocArgs](CodeGenFunction &CGF,
+                                                     PrePostActionTy &) {
+    llvm::Value *ThreadId = getThreadID(CGF, Loc);
+    llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
+    std::array<llvm::Value *, 16> TGTaskLoopArgs;
+
+    // FIXME: Duplicated in NonTaskgraphTaskloopCodeGen below; share a helper.
+    LValue LBLVal = CGF.EmitLValueForField(
+        TaskInitResult.TDBase,
+        *std::next(TaskInitResult.KmpTaskTQTyRD->field_begin(),
+                   KmpTaskTLowerBound));
+    const auto *LBVar =
+        cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
+    CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(),
+                         LBLVal.getQuals(),
+                         /*IsInitializer=*/true);
+    LValue UBLVal = CGF.EmitLValueForField(
+        TaskInitResult.TDBase,
+        *std::next(TaskInitResult.KmpTaskTQTyRD->field_begin(),
+                   KmpTaskTUpperBound));
+    const auto *UBVar =
+        cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
+    CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(),
+                         UBLVal.getQuals(),
+                         /*IsInitializer=*/true);
+    LValue StLVal = CGF.EmitLValueForField(
+        TaskInitResult.TDBase,
+        *std::next(TaskInitResult.KmpTaskTQTyRD->field_begin(),
+                   KmpTaskTStride));
+    const auto *StVar =
+        cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
+    CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(),
+                         StLVal.getQuals(), /*IsInitializer=*/true);
+    // Store reductions address.
+    LValue RedLVal = CGF.EmitLValueForField(
+        TaskInitResult.TDBase,
+        *std::next(TaskInitResult.KmpTaskTQTyRD->field_begin(),
+                   KmpTaskTReductions));
+    if (Data.Reductions) {
+      CGF.EmitStoreOfScalar(Data.Reductions, RedLVal);
+    } else {
+      CGF.EmitNullInitialization(RedLVal.getAddress(),
+                                 CGF.getContext().VoidPtrTy);
+    }
+
+    TGTaskLoopArgs[0] = UpLoc;
+    TGTaskLoopArgs[1] = ThreadId;
+    TGTaskLoopArgs[2] = TaskInitResult.NewTask;
+    TGTaskLoopArgs[3] = TaskAllocArgs[0]; // TaskFlags
+    TGTaskLoopArgs[4] = TaskAllocArgs[1]; // KmpTaskTWithPrivatesTySize
+    TGTaskLoopArgs[5] = Shareds.emitRawPointer(CGF);
+    TGTaskLoopArgs[6] = TaskAllocArgs[2]; // SharedsSize
+    TGTaskLoopArgs[7] = IfVal;
+    TGTaskLoopArgs[8] = LBLVal.getPointer(CGF);
+    TGTaskLoopArgs[9] = UBLVal.getPointer(CGF);
+    TGTaskLoopArgs[10] = CGF.EmitLoadOfScalar(StLVal, Loc);
+    TGTaskLoopArgs[11] =
+        llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0);
+    TGTaskLoopArgs[12] = llvm::ConstantInt::getSigned(
+        CGF.IntTy, Data.Schedule.getPointer()
+                       ? Data.Schedule.getInt() ? NumTasks : Grainsize
+                       : NoSchedule);
+    TGTaskLoopArgs[13] =
+        Data.Schedule.getPointer()
+            ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
+                                        /*isSigned=*/false)
+            : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0);
+    TGTaskLoopArgs[14] =
+        llvm::ConstantInt::getSigned(CGF.IntTy, Data.HasModifier ? 1 : 0);
+    TGTaskLoopArgs[15] = TaskInitResult.TaskDupFn
+                             ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+                                   TaskInitResult.TaskDupFn, CGF.VoidPtrTy)
+                             : llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                            CGM.getModule(), OMPRTL___kmpc_taskgraph_taskloop),
+                        TGTaskLoopArgs);
+  };
+
+  auto &&NonTaskgraphTaskloopCodeGen = [this, &Loc, &D, &TaskInitResult, IfVal,
+                                        &Data](CodeGenFunction &CGF,
+                                               PrePostActionTy &) {
+    // NOTE: routine and part_id fields are initialized by
+    // __kmpc_omp_task_alloc() libcall. Call to void __kmpc_taskloop(ident_t
+    // *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64
+    // *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, void
+    // *task_dup);
+    llvm::Value *ThreadID = getThreadID(CGF, Loc);
+    llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
+
+    LValue LBLVal = CGF.EmitLValueForField(
+        TaskInitResult.TDBase,
+        *std::next(TaskInitResult.KmpTaskTQTyRD->field_begin(),
+                   KmpTaskTLowerBound));
+    const auto *LBVar =
+        cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
+    CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(),
+                         LBLVal.getQuals(),
+                         /*IsInitializer=*/true);
+    LValue UBLVal = CGF.EmitLValueForField(
+        TaskInitResult.TDBase,
+        *std::next(TaskInitResult.KmpTaskTQTyRD->field_begin(),
+                   KmpTaskTUpperBound));
+    const auto *UBVar =
+        cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
+    CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(),
+                         UBLVal.getQuals(),
+                         /*IsInitializer=*/true);
+    LValue StLVal = CGF.EmitLValueForField(
+        TaskInitResult.TDBase,
+        *std::next(TaskInitResult.KmpTaskTQTyRD->field_begin(),
+                   KmpTaskTStride));
+    const auto *StVar =
+        cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
+    CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(),
+                         StLVal.getQuals(),
+                         /*IsInitializer=*/true);
+    // Store reductions address.
+    LValue RedLVal = CGF.EmitLValueForField(
+        TaskInitResult.TDBase,
+        *std::next(TaskInitResult.KmpTaskTQTyRD->field_begin(),
+                   KmpTaskTReductions));
+    if (Data.Reductions) {
+      CGF.EmitStoreOfScalar(Data.Reductions, RedLVal);
+    } else {
+      CGF.EmitNullInitialization(RedLVal.getAddress(),
+                                 CGF.getContext().VoidPtrTy);
+    }
+    llvm::SmallVector<llvm::Value *, 12> TaskArgs{
+        UpLoc,
+        ThreadID,
+        TaskInitResult.NewTask,
+        IfVal,
+        LBLVal.getPointer(CGF),
+        UBLVal.getPointer(CGF),
+        CGF.EmitLoadOfScalar(StLVal, Loc),
+        llvm::ConstantInt::getSigned(
+            CGF.IntTy, 1), // Always 1 because taskgroup emitted by the compiler
+        llvm::ConstantInt::getSigned(
+            CGF.IntTy, Data.Schedule.getPointer()
+                           ? Data.Schedule.getInt() ? NumTasks : Grainsize
+                           : NoSchedule),
+        Data.Schedule.getPointer()
+            ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
+                                        /*isSigned=*/false)
+            : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0)};
+    if (Data.HasModifier)
+      TaskArgs.push_back(llvm::ConstantInt::get(CGF.Int32Ty, 1));
+
+    TaskArgs.push_back(TaskInitResult.TaskDupFn
+                           ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+                                 TaskInitResult.TaskDupFn, CGF.VoidPtrTy)
+                           : llvm::ConstantPointerNull::get(CGF.VoidPtrTy));
+    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                            CGM.getModule(), Data.HasModifier
+                                                 ? OMPRTL___kmpc_taskloop_5
+                                                 : OMPRTL___kmpc_taskloop),
+                        TaskArgs);
+  };
+
+  if (CGF.getOMPWithinTaskgraph()) {
+    // Lexically within taskgraph, always replayable.
+    RegionCodeGenTy TaskgraphRCG(TaskgraphTaskloopCodeGen);
+    TaskgraphRCG(CGF);
   } else {
-    CGF.EmitNullInitialization(RedLVal.getAddress(),
-                               CGF.getContext().VoidPtrTy);
+    RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskloopCodeGen);
+    NonTaskgraphRCG(CGF);
   }
-  enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 };
-  llvm::SmallVector<llvm::Value *, 12> TaskArgs{
-      UpLoc,
-      ThreadID,
-      Result.NewTask,
-      IfVal,
-      LBLVal.getPointer(CGF),
-      UBLVal.getPointer(CGF),
-      CGF.EmitLoadOfScalar(StLVal, Loc),
-      llvm::ConstantInt::getSigned(
-          CGF.IntTy, 1), // Always 1 because taskgroup emitted by the compiler
-      llvm::ConstantInt::getSigned(
-          CGF.IntTy, Data.Schedule.getPointer()
-                         ? Data.Schedule.getInt() ? NumTasks : Grainsize
-                         : NoSchedule),
-      Data.Schedule.getPointer()
-          ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
-                                      /*isSigned=*/false)
-          : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0)};
-  if (Data.HasModifier)
-    TaskArgs.push_back(llvm::ConstantInt::get(CGF.Int32Ty, 1));
-
-  TaskArgs.push_back(Result.TaskDupFn
-                         ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-                               Result.TaskDupFn, CGF.VoidPtrTy)
-                         : llvm::ConstantPointerNull::get(CGF.VoidPtrTy));
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                          CGM.getModule(), Data.HasModifier
-                                               ? OMPRTL___kmpc_taskloop_5
-                                               : OMPRTL___kmpc_taskloop),
-                      TaskArgs);
 }
 
 /// Emit reduction operation for each element of array (required for
@@ -6016,9 +6276,15 @@ llvm::Value *CGOpenMPRuntime::emitTaskReductionInit(
       llvm::ConstantInt::get(CGM.IntTy, Size, /*isSigned=*/true),
       CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TaskRedInput.getPointer(),
                                                       CGM.VoidPtrTy)};
-  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                                 CGM.getModule(), OMPRTL___kmpc_taskred_init),
-                             Args);
+  if (CGF.getOMPWithinTaskgraph())
+    return CGF.EmitRuntimeCall(
+        OMPBuilder.getOrCreateRuntimeFunction(
+            CGM.getModule(), OMPRTL___kmpc_taskgraph_taskred_init),
+        Args);
+  else
+    return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                   CGM.getModule(), OMPRTL___kmpc_taskred_init),
+                               Args);
 }
 
 void CGOpenMPRuntime::emitTaskReductionFini(CodeGenFunction &CGF,
@@ -6092,36 +6358,68 @@ void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
     llvm::Value *NumOfElements;
     std::tie(NumOfElements, DependenciesArray) =
         emitDependClause(CGF, Data.Dependences, Loc);
-    if (!Data.Dependences.empty()) {
-      llvm::Value *DepWaitTaskArgs[7];
-      DepWaitTaskArgs[0] = UpLoc;
-      DepWaitTaskArgs[1] = ThreadID;
-      DepWaitTaskArgs[2] = NumOfElements;
-      DepWaitTaskArgs[3] = DependenciesArray.emitRawPointer(CGF);
-      DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
-      DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
-      DepWaitTaskArgs[6] =
-          llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
-
-      CodeGenFunction::RunCleanupsScope LocalScope(CGF);
 
-      // Build void __kmpc_omp_taskwait_deps_51(ident_t *, kmp_int32 gtid,
-      // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
-      // ndeps_noalias, kmp_depend_info_t *noalias_dep_list,
-      // kmp_int32 has_no_wait); if dependence info is specified.
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                              M, OMPRTL___kmpc_omp_taskwait_deps_51),
-                          DepWaitTaskArgs);
+    auto &&TaskgraphTaskwaitCodeGen =
+        [this, UpLoc, ThreadID, NumOfElements, &DependenciesArray,
+         &Data](CodeGenFunction &CGF, PrePostActionTy &) {
+          llvm::Value *TGTaskWaitArgs[5];
+          TGTaskWaitArgs[0] = UpLoc;
+          TGTaskWaitArgs[1] = ThreadID;
+          if (Data.Dependences.empty()) {
+            TGTaskWaitArgs[2] = CGF.Builder.getInt32(0);
+            TGTaskWaitArgs[3] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+          } else {
+            TGTaskWaitArgs[2] = NumOfElements;
+            TGTaskWaitArgs[3] = DependenciesArray.emitRawPointer(CGF);
+          }
+          TGTaskWaitArgs[4] =
+              llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
+          CGF.EmitRuntimeCall(
+              OMPBuilder.getOrCreateRuntimeFunction(
+                  CGM.getModule(), OMPRTL___kmpc_taskgraph_taskwait),
+              TGTaskWaitArgs);
+        };
+    auto &&NonTaskgraphTaskwaitCodeGen =
+        [this, UpLoc, ThreadID, NumOfElements, &DependenciesArray, &M,
+         &Data](CodeGenFunction &CGF, PrePostActionTy &) {
+          if (!Data.Dependences.empty()) {
+            llvm::Value *DepWaitTaskArgs[7];
+            DepWaitTaskArgs[0] = UpLoc;
+            DepWaitTaskArgs[1] = ThreadID;
+            DepWaitTaskArgs[2] = NumOfElements;
+            DepWaitTaskArgs[3] = DependenciesArray.emitRawPointer(CGF);
+            DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
+            DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+            DepWaitTaskArgs[6] =
+                llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
+
+            CodeGenFunction::RunCleanupsScope LocalScope(CGF);
+
+            // Build void __kmpc_omp_taskwait_deps_51(ident_t *, kmp_int32 gtid,
+            // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
+            // ndeps_noalias, kmp_depend_info_t *noalias_dep_list,
+            // kmp_int32 has_no_wait); if dependence info is specified.
+            CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                    M, OMPRTL___kmpc_omp_taskwait_deps_51),
+                                DepWaitTaskArgs);
+          } else {
+            // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
+            // global_tid);
+            llvm::Value *Args[] = {UpLoc, ThreadID};
+            // Ignore return result until untied tasks are supported.
+            CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                    M, OMPRTL___kmpc_omp_taskwait),
+                                Args);
+          }
+        };
 
+    if (CGF.getOMPWithinTaskgraph()) {
+      // Lexically within taskgraph, always replayable.
+      RegionCodeGenTy TaskgraphRCG(TaskgraphTaskwaitCodeGen);
+      TaskgraphRCG(CGF);
     } else {
-
-      // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
-      // global_tid);
-      llvm::Value *Args[] = {UpLoc, ThreadID};
-      // Ignore return result until untied tasks are supported.
-      CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_omp_taskwait),
-          Args);
+      RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskwaitCodeGen);
+      NonTaskgraphRCG(CGF);
     }
   }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 451f08e8a32e6..d274d287939b6 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -582,7 +582,9 @@ class CGOpenMPRuntime {
   TaskResultTy emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
                             const OMPExecutableDirective &D,
                             llvm::Function *TaskFunction, QualType SharedsTy,
-                            Address Shareds, const OMPTaskDataTy &Data);
+                            Address Shareds, const OMPTaskDataTy &Data,
+                            bool ForTaskgraph,
+                            std::array<llvm::Value *, 3> &TaskAllocArgs);
 
   /// Emit update for lastprivate conditional data.
   void emitLastprivateConditionalUpdate(CodeGenFunction &CGF, LValue IVLVal,
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 05585aad8467f..a312de9e8a812 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -812,6 +812,30 @@ class CodeGenFunction : public CodeGenTypeCache {
     }
   };
 
+  /// Flag queried by OpenMP codegen to detect that the code being emitted is
+  /// lexically within a taskgraph region.  See OMPWithinTaskgraphRAII.
+  bool OMPWithinTaskgraph = false;
+
+  bool getOMPWithinTaskgraph() const { return OMPWithinTaskgraph; }
+  void setOMPWithinTaskgraph(bool In) { OMPWithinTaskgraph = In; }
+
+  /// RAII object that sets the OMPWithinTaskgraph flag for its lifetime and
+  /// reinstates the flag's previous value on destruction, so that leaving an
+  /// inner scope does not clear a flag that an enclosing scope had set.
+  class OMPWithinTaskgraphRAII {
+    CodeGenFunction &CGF;
+    /// Flag value observed on entry; written back by the destructor.
+    bool SavedWithinTaskgraph;
+
+  public:
+    OMPWithinTaskgraphRAII(CodeGenFunction &CGF_)
+        : CGF(CGF_), SavedWithinTaskgraph(CGF_.getOMPWithinTaskgraph()) {
+      CGF.setOMPWithinTaskgraph(true);
+    }
+    ~OMPWithinTaskgraphRAII() {
+      CGF.setOMPWithinTaskgraph(SavedWithinTaskgraph);
+    }
+  };
+
   template <class T>
   typename DominatingValue<T>::saved_type saveValueInCond(T value) {
     return DominatingValue<T>::save(*this, value);
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 8bb928e70e6a5..cd90c482f4a03 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -11468,6 +11468,10 @@ SemaOpenMP::ActOnOpenMPTaskwaitDirective(ArrayRef<OMPClause *> Clauses,
     Diag(StartLoc, diag::err_omp_nowait_clause_without_depend);
     return StmtError();
   }
+  if (DSAStack->getParentDirective() == OMPD_taskgraph && !HasDependC) {
+    Diag(StartLoc, diag::err_omp_taskgraph_taskwait_without_depend);
+    return StmtError();
+  }
 
   return OMPTaskwaitDirective::Create(getASTContext(), StartLoc, EndLoc,
                                       Clauses);
diff --git a/clang/test/OpenMP/taskgraph_taskwait_nodeps.cpp b/clang/test/OpenMP/taskgraph_taskwait_nodeps.cpp
new file mode 100644
index 0000000000000..9f088ed523f26
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_taskwait_nodeps.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -verify=expected -fsyntax-only %s
+
+int main() {
+#pragma omp taskgraph
+  {
+#pragma omp taskwait // expected-error {{directive '#pragma omp taskwait' within '#pragma omp taskgraph' must use 'depend' clause to be task-generating}}
+  }
+
+#pragma omp taskwait
+
+  return 0;
+}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 15fbfdaf549d6..49aeecd426a32 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -357,6 +357,17 @@ __OMP_RTL(__kmpc_omp_task, false, Int32, IdentPtr, Int32,
           /* kmp_task_t */ VoidPtr)
 __OMP_RTL(__kmpc_end_taskgroup, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_taskgroup, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_taskgraph, false, Void, IdentPtr, Int32, VoidPtrPtr, Int32,
+          Int32, Int32, VoidPtr, VoidPtr)
+__OMP_RTL(__kmpc_taskgraph_task, false, Int32, IdentPtr, Int32, VoidPtr, Int32,
+          SizeTy, VoidPtr, SizeTy, Int32, VoidPtr)
+__OMP_RTL(__kmpc_taskgraph_taskloop, false, Int32, IdentPtr, Int32, VoidPtr,
+          Int32, SizeTy, VoidPtr, SizeTy, Int32, Int64Ptr, Int64Ptr, Int64,
+          Int32, Int32, Int64, Int32, VoidPtr)
+__OMP_RTL(__kmpc_taskgraph_taskwait, false, Void, IdentPtr, Int32, Int32,
+          VoidPtr, Int32)
+__OMP_RTL(__kmpc_taskgraph_taskred_init, false, /* kmp_taskgroup */ VoidPtr,
+          Int32, Int32, VoidPtr)
 __OMP_RTL(__kmpc_omp_task_begin_if0, false, Void, IdentPtr, Int32,
           /* kmp_task_t */ VoidPtr)
 __OMP_RTL(__kmpc_omp_task_complete_if0, false, Void, IdentPtr, Int32,

>From ca9d827862a4e195acc9dd49b565d39919526a58 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Thu, 23 Apr 2026 14:49:14 -0500
Subject: [PATCH 07/24] [OpenMP] New Clang tests for 'taskgraph' directive

This patch adds two simple tests for parsing/serialization/deserialization
and codegen for the OpenMP 'taskgraph' directive.

commit-id:e783e747

Pull Request: https://github.com/llvm/llvm-project/pull/194052
---
 clang/test/OpenMP/taskgraph_ast_print.cpp | 31 +++++++++++++++
 clang/test/OpenMP/taskgraph_codegen.cpp   | 47 +++++++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 clang/test/OpenMP/taskgraph_ast_print.cpp
 create mode 100644 clang/test/OpenMP/taskgraph_codegen.cpp

diff --git a/clang/test/OpenMP/taskgraph_ast_print.cpp b/clang/test/OpenMP/taskgraph_ast_print.cpp
new file mode 100644
index 0000000000000..bfbe829fc9f31
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_ast_print.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -std=c++11 -include-pch %t -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+int main() {
+  int x = 0, y = 0;
+
+#pragma omp taskgraph
+// CHECK: #pragma omp taskgraph
+  {
+#pragma omp task depend(in: x) depend(out: y)
+// CHECK: #pragma omp task depend(in : x) depend(out : y)
+    {
+      y = x;
+    }
+#pragma omp task depend(inout: x, y)
+// CHECK: #pragma omp task depend(inout : x,y)
+    {
+      x++;
+      y++;
+    }
+  }
+
+  return 0;
+}
+
+#endif
diff --git a/clang/test/OpenMP/taskgraph_codegen.cpp b/clang/test/OpenMP/taskgraph_codegen.cpp
new file mode 100644
index 0000000000000..3f661e6bfe3d5
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_codegen.cpp
@@ -0,0 +1,47 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --functions "main" --prefix-filecheck-ir-name _
+
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: @main(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[X:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[Y:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
+// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[X]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[Y]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0
+// CHECK-NEXT:    store ptr [[X]], ptr [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 1
+// CHECK-NEXT:    store ptr [[Y]], ptr [[TMP2]], align 8
+// CHECK-NEXT:    call void @__kmpc_taskgraph(ptr @[[GLOB1]], i32 [[TMP0]], ptr @.omp.taskgraph.handle, i32 0, i32 0, i32 0, ptr @taskgraph.omp_outlined., ptr [[AGG_CAPTURED]])
+// CHECK-NEXT:    ret i32 0
+//
+int main() {
+  int x = 0, y = 0;
+
+#pragma omp taskgraph
+  {
+#pragma omp task depend(in: x) depend(out: y)
+    {
+      y = x;
+    }
+#pragma omp task depend(inout: x, y)
+    {
+      x++;
+      y++;
+    }
+  }
+
+  return 0;
+}
+
+#endif

>From 59f293fcd1e164afb7903bdfcfdaa6dd5dd979ea Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Mon, 27 Apr 2026 13:55:47 -0500
Subject: [PATCH 08/24] [OpenMP] Add 'replayable' clause (for 'taskgraph'
 support)

This patch adds support for the 'replayable' clause to Clang.  The clause
marks the directive it appears on as participating in taskgraph recording
and replay: when the clause is present (and its optional argument is
omitted or evaluates to true), the new API routines introduced in a
previous patch are emitted.

Pull Request: https://github.com/llvm/llvm-project/pull/194053
---
 clang/include/clang/AST/OpenMPClause.h        | 69 +++++++++++++++
 clang/include/clang/AST/RecursiveASTVisitor.h |  7 ++
 clang/include/clang/Sema/SemaOpenMP.h         |  6 ++
 clang/lib/AST/OpenMPClause.cpp                | 15 ++++
 clang/lib/AST/StmtProfile.cpp                 |  5 ++
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 87 ++++++++++++-------
 clang/lib/CodeGen/CGOpenMPRuntime.h           |  9 +-
 clang/lib/CodeGen/CGStmtOpenMP.cpp            | 76 +++++++++++++---
 clang/lib/Parse/ParseOpenMP.cpp               |  1 +
 clang/lib/Sema/SemaOpenMP.cpp                 | 28 ++++++
 clang/lib/Sema/TreeTransform.h                | 25 ++++++
 clang/lib/Serialization/ASTReader.cpp         |  8 ++
 clang/lib/Serialization/ASTWriter.cpp         |  5 ++
 clang/tools/libclang/CIndex.cpp               |  4 +
 llvm/include/llvm/Frontend/OpenMP/OMP.td      |  1 +
 15 files changed, 303 insertions(+), 43 deletions(-)

diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index e18dea7d9bd47..34c0c47bfe710 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -2040,6 +2040,75 @@ class OMPSelfMapsClause final : public OMPClause {
   }
 };
 
+/// This represents a 'replayable' clause in the '#pragma omp target',
+/// '#pragma omp target enter data', '#pragma omp target exit data',
+/// '#pragma omp target update', '#pragma omp task', '#pragma omp taskloop' or
+/// '#pragma omp taskwait' directive.
+///
+/// \code
+/// #pragma omp task replayable(1)
+/// \endcode
+/// In this example directive '#pragma omp task' has the 'replayable' clause.
+class OMPReplayableClause final : public OMPClause {
+  friend class OMPClauseReader;
+
+  /// Location of '('.
+  SourceLocation LParenLoc;
+
+  /// Condition of the 'replayable' clause.
+  Stmt *Condition = nullptr;
+
+public:
+  /// Build 'replayable' clause.
+  ///
+  /// \param Cond Condition of the clause.
+  /// \param StartLoc Starting location of the clause.
+  /// \param LParenLoc Location of '('.
+  /// \param EndLoc Ending location of the clause.
+  OMPReplayableClause(Expr *Cond, SourceLocation StartLoc,
+                      SourceLocation LParenLoc, SourceLocation EndLoc)
+      : OMPClause(llvm::omp::OMPC_replayable, StartLoc, EndLoc),
+        LParenLoc(LParenLoc), Condition(Cond) {}
+
+  /// Build an empty clause.
+  OMPReplayableClause()
+      : OMPClause(llvm::omp::OMPC_replayable, SourceLocation(),
+                  SourceLocation()) {}
+
+  /// Sets the location of '('.
+  void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; }
+
+  /// Returns the location of '('.
+  SourceLocation getLParenLoc() const { return LParenLoc; }
+
+  /// Set condition.
+  void setCondition(Expr *Cond) { Condition = Cond; }
+
+  /// Returns condition.
+  Expr *getCondition() const { return cast_or_null<Expr>(Condition); }
+
+  child_range children() {
+    if (Condition)
+      return child_range(&Condition, &Condition + 1);
+    return child_range(child_iterator(), child_iterator());
+  }
+
+  const_child_range children() const {
+    if (Condition)
+      return const_child_range(&Condition, &Condition + 1);
+    return const_child_range(const_child_iterator(), const_child_iterator());
+  }
+
+  child_range used_children();
+  const_child_range used_children() const {
+    return const_cast<OMPReplayableClause *>(this)->used_children();
+  }
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == llvm::omp::OMPC_replayable;
+  }
+};
+
 /// This represents 'at' clause in the '#pragma omp error' directive
 ///
 /// \code
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index da18c040bf570..ea0547f61109e 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3673,6 +3673,13 @@ bool RecursiveASTVisitor<Derived>::VisitOMPNowaitClause(OMPNowaitClause *C) {
   return true;
 }
 
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPReplayableClause(
+    OMPReplayableClause *C) {
+  TRY_TO(TraverseStmt(C->getCondition()));
+  return true;
+}
+
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPUntiedClause(OMPUntiedClause *) {
   return true;
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 7c500847881f0..98405b871641a 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -1173,6 +1173,12 @@ class SemaOpenMP : public SemaBase {
   OMPClause *ActOnOpenMPSelfMapsClause(SourceLocation StartLoc,
                                        SourceLocation EndLoc);
 
+  /// Called on well-formed 'replayable' clause.
+  OMPClause *ActOnOpenMPReplayableClause(SourceLocation StartLoc,
+                                         SourceLocation EndLoc,
+                                         SourceLocation LParenLoc,
+                                         Expr *Condition);
+
   /// Called on well-formed 'at' clause.
   OMPClause *ActOnOpenMPAtClause(OpenMPAtClauseKind Kind,
                                  SourceLocation KindLoc,
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index db4d5519acb38..7d2b4aa64a1df 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -324,6 +324,12 @@ OMPClause::child_range OMPNowaitClause::used_children() {
   return children();
 }
 
+OMPClause::child_range OMPReplayableClause::used_children() {
+  if (Condition)
+    return child_range(&Condition, &Condition + 1);
+  return children();
+}
+
 OMPClause::child_range OMPGrainsizeClause::used_children() {
   if (Stmt **C = getAddrOfExprAsWritten(getPreInitStmt()))
     return child_range(C, C + 1);
@@ -2201,6 +2207,15 @@ void OMPClausePrinter::VisitOMPNowaitClause(OMPNowaitClause *Node) {
   }
 }
 
+void OMPClausePrinter::VisitOMPReplayableClause(OMPReplayableClause *Node) {
+  OS << "replayable";
+  if (auto *Cond = Node->getCondition()) {
+    OS << "(";
+    Cond->printPretty(OS, nullptr, Policy, 0);
+    OS << ")";
+  }
+}
+
 void OMPClausePrinter::VisitOMPUntiedClause(OMPUntiedClause *) {
   OS << "untied";
 }
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp
index cde2ad8d8dc98..d6785f6435efc 100644
--- a/clang/lib/AST/StmtProfile.cpp
+++ b/clang/lib/AST/StmtProfile.cpp
@@ -606,6 +606,11 @@ void OMPClauseProfiler::VisitOMPNowaitClause(const OMPNowaitClause *C) {
     Profiler->VisitStmt(C->getCondition());
 }
 
+void OMPClauseProfiler::VisitOMPReplayableClause(const OMPReplayableClause *C) {
+  if (C->getCondition())
+    Profiler->VisitStmt(C->getCondition());
+}
+
 void OMPClauseProfiler::VisitOMPUntiedClause(const OMPUntiedClause *) {}
 
 void OMPClauseProfiler::VisitOMPMergeableClause(const OMPMergeableClause *) {}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 30514facd0546..bc6827784737f 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4775,12 +4775,10 @@ void CGOpenMPRuntime::emitUpdateClause(CodeGenFunction &CGF, LValue DepobjLVal,
   CGF.EmitBlock(DoneBB, /*IsFinished=*/true);
 }
 
-void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
-                                   const OMPExecutableDirective &D,
-                                   llvm::Function *TaskFunction,
-                                   QualType SharedsTy, Address Shareds,
-                                   const Expr *IfCond,
-                                   const OMPTaskDataTy &Data) {
+void CGOpenMPRuntime::emitTaskCall(
+    CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D,
+    llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
+    const Expr *IfCond, const Expr *ReplayableCond, const OMPTaskDataTy &Data) {
   if (!CGF.HaveInsertPoint())
     return;
 
@@ -4945,17 +4943,23 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
     RegionCodeGenTy TaskgraphRCG(TaskgraphTaskCodeGen);
     TaskgraphRCG(CGF);
   } else {
-    RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskCodeGen);
-    NonTaskgraphRCG(CGF);
+    if (ReplayableCond) {
+      // We have a replayable clause.  Task is replayable if its argument is
+      // omitted or evaluates to TRUE.
+      emitIfClause(CGF, ReplayableCond, TaskgraphTaskCodeGen,
+                   NonTaskgraphTaskCodeGen);
+    } else {
+      // Not taskgraph, not replayable.
+      RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskCodeGen);
+      NonTaskgraphRCG(CGF);
+    }
   }
 }
 
-void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
-                                       const OMPLoopDirective &D,
-                                       llvm::Function *TaskFunction,
-                                       QualType SharedsTy, Address Shareds,
-                                       const Expr *IfCond,
-                                       const OMPTaskDataTy &Data) {
+void CGOpenMPRuntime::emitTaskLoopCall(
+    CodeGenFunction &CGF, SourceLocation Loc, const OMPLoopDirective &D,
+    llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
+    const Expr *IfCond, const Expr *ReplayableCond, const OMPTaskDataTy &Data) {
   if (!CGF.HaveInsertPoint())
     return;
 
@@ -5139,8 +5143,16 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
     RegionCodeGenTy TaskgraphRCG(TaskgraphTaskloopCodeGen);
     TaskgraphRCG(CGF);
   } else {
-    RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskloopCodeGen);
-    NonTaskgraphRCG(CGF);
+    if (ReplayableCond) {
+      // We have a replayable clause.  Taskloop is replayable if its argument
+      // is omitted or evaluates to TRUE.
+      emitIfClause(CGF, ReplayableCond, TaskgraphTaskloopCodeGen,
+                   NonTaskgraphTaskloopCodeGen);
+    } else {
+      // Not taskgraph, not replayable.
+      RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskloopCodeGen);
+      NonTaskgraphRCG(CGF);
+    }
   }
 }
 
@@ -6276,15 +6288,22 @@ llvm::Value *CGOpenMPRuntime::emitTaskReductionInit(
       llvm::ConstantInt::get(CGM.IntTy, Size, /*isSigned=*/true),
       CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TaskRedInput.getPointer(),
                                                       CGM.VoidPtrTy)};
-  if (CGF.getOMPWithinTaskgraph())
+  // A task/taskloop participates in taskgraph replay either when it is
+  // lexically nested inside a #pragma omp taskgraph region or when it carries
+  // a `replayable` clause (which may also fire dynamically inside a taskgraph
+  // recording).  In both cases route through the taskgraph-aware entry point
+  // so the runtime can stash the reduction input for later replay.  The
+  // taskgraph entry point degrades to the regular init when no taskgraph
+  // recording is active, so this is safe even when the `replayable`
+  // condition evaluates to false at runtime.
+  if (CGF.getOMPWithinTaskgraph() || Data.ReplayableCond)
     return CGF.EmitRuntimeCall(
         OMPBuilder.getOrCreateRuntimeFunction(
             CGM.getModule(), OMPRTL___kmpc_taskgraph_taskred_init),
         Args);
-  else
-    return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                                   CGM.getModule(), OMPRTL___kmpc_taskred_init),
-                               Args);
+  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                 CGM.getModule(), OMPRTL___kmpc_taskred_init),
+                             Args);
 }
 
 void CGOpenMPRuntime::emitTaskReductionFini(CodeGenFunction &CGF,
@@ -6343,6 +6362,7 @@ Address CGOpenMPRuntime::getTaskReductionItem(CodeGenFunction &CGF,
 }
 
 void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
+                                       const Expr *ReplayableCond,
                                        const OMPTaskDataTy &Data) {
   if (!CGF.HaveInsertPoint())
     return;
@@ -6418,8 +6438,16 @@ void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
       RegionCodeGenTy TaskgraphRCG(TaskgraphTaskwaitCodeGen);
       TaskgraphRCG(CGF);
     } else {
-      RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskwaitCodeGen);
-      NonTaskgraphRCG(CGF);
+      if (ReplayableCond) {
+        // We have a replayable clause.  Taskwait is replayable if its argument
+        // is omitted or evaluates to TRUE.
+        emitIfClause(CGF, ReplayableCond, TaskgraphTaskwaitCodeGen,
+                     NonTaskgraphTaskwaitCodeGen);
+      } else {
+        // Not taskgraph, not replayable.
+        RegionCodeGenTy NonTaskgraphRCG(NonTaskgraphTaskwaitCodeGen);
+        NonTaskgraphRCG(CGF);
+      }
     }
   }
 
@@ -13572,19 +13600,17 @@ void CGOpenMPSIMDRuntime::emitFlush(CodeGenFunction &CGF,
   llvm_unreachable("Not supported in SIMD-only mode");
 }
 
-void CGOpenMPSIMDRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
-                                       const OMPExecutableDirective &D,
-                                       llvm::Function *TaskFunction,
-                                       QualType SharedsTy, Address Shareds,
-                                       const Expr *IfCond,
-                                       const OMPTaskDataTy &Data) {
+void CGOpenMPSIMDRuntime::emitTaskCall(
+    CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D,
+    llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
+    const Expr *IfCond, const Expr *ReplayableCond, const OMPTaskDataTy &Data) {
   llvm_unreachable("Not supported in SIMD-only mode");
 }
 
 void CGOpenMPSIMDRuntime::emitTaskLoopCall(
     CodeGenFunction &CGF, SourceLocation Loc, const OMPLoopDirective &D,
     llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
-    const Expr *IfCond, const OMPTaskDataTy &Data) {
+    const Expr *IfCond, const Expr *ReplayableCond, const OMPTaskDataTy &Data) {
   llvm_unreachable("Not supported in SIMD-only mode");
 }
 
@@ -13625,6 +13651,7 @@ Address CGOpenMPSIMDRuntime::getTaskReductionItem(CodeGenFunction &CGF,
 
 void CGOpenMPSIMDRuntime::emitTaskwaitCall(CodeGenFunction &CGF,
                                            SourceLocation Loc,
+                                           const Expr *ReplayableCond,
                                            const OMPTaskDataTy &Data) {
   llvm_unreachable("Not supported in SIMD-only mode");
 }
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index d274d287939b6..d4dbbef5745a5 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -123,6 +123,7 @@ struct OMPTaskDataTy final {
   bool IsWorksharingReduction = false;
   bool HasNowaitClause = false;
   bool HasModifier = false;
+  const Expr *ReplayableCond = nullptr;
 };
 
 /// Class intended to support codegen of all kind of the reduction clauses.
@@ -1177,6 +1178,7 @@ class CGOpenMPRuntime {
                             const OMPExecutableDirective &D,
                             llvm::Function *TaskFunction, QualType SharedsTy,
                             Address Shareds, const Expr *IfCond,
+                            const Expr *ReplayableCond,
                             const OMPTaskDataTy &Data);
 
   /// Emit task region for the taskloop directive. The taskloop region is
@@ -1212,7 +1214,8 @@ class CGOpenMPRuntime {
                                 const OMPLoopDirective &D,
                                 llvm::Function *TaskFunction,
                                 QualType SharedsTy, Address Shareds,
-                                const Expr *IfCond, const OMPTaskDataTy &Data);
+                                const Expr *IfCond, const Expr *ReplayableCond,
+                                const OMPTaskDataTy &Data);
 
   /// Emit code for the directive that does not require outlining.
   ///
@@ -1380,6 +1383,7 @@ class CGOpenMPRuntime {
 
   /// Emit code for 'taskwait' directive.
   virtual void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
+                                const Expr *ReplayableCond,
                                 const OMPTaskDataTy &Data);
 
   /// Emit code for 'taskgraph' directive.
@@ -2059,6 +2063,7 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime {
                     const OMPExecutableDirective &D,
                     llvm::Function *TaskFunction, QualType SharedsTy,
                     Address Shareds, const Expr *IfCond,
+                    const Expr *ReplayableCond,
                     const OMPTaskDataTy &Data) override;
 
   /// Emit task region for the taskloop directive. The taskloop region is
@@ -2093,6 +2098,7 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime {
   void emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
                         const OMPLoopDirective &D, llvm::Function *TaskFunction,
                         QualType SharedsTy, Address Shareds, const Expr *IfCond,
+                        const Expr *ReplayableCond,
                         const OMPTaskDataTy &Data) override;
 
   /// Emit a code for reduction clause. Next code should be emitted for
@@ -2213,6 +2219,7 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime {
 
   /// Emit code for 'taskwait' directive.
   void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
+                        const Expr *ReplayableCond,
                         const OMPTaskDataTy &Data) override;
 
   /// Emit code for 'taskgraph' directive.
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index a2ae5bcfe1160..c44168bd6490f 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5728,8 +5728,19 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
   IntegerLiteral IfCond(getContext(), TrueOrFalse,
                         getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
                         SourceLocation());
+  const Expr *ReplayableCond = nullptr;
+  if (auto *RC = S.getSingleClause<OMPReplayableClause>()) {
+    ReplayableCond = RC->getCondition();
+    if (!ReplayableCond) {
+      ReplayableCond = IntegerLiteral::Create(
+          getContext(), llvm::APInt(32, 1),
+          getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
+          SourceLocation());
+    }
+  }
   CGM.getOpenMPRuntime().emitTaskCall(*this, S.getBeginLoc(), S, OutlinedFn,
-                                      SharedsTy, CapturedStruct, &IfCond, Data);
+                                      SharedsTy, CapturedStruct, &IfCond,
+                                      ReplayableCond, Data);
 }
 
 void CodeGenFunction::processInReduction(const OMPExecutableDirective &S,
@@ -5838,15 +5849,29 @@ void CodeGenFunction::EmitOMPTaskDirective(const OMPTaskDirective &S) {
   OMPTaskDataTy Data;
   // Check if we should emit tied or untied task.
   Data.Tied = !S.getSingleClause<OMPUntiedClause>();
+  const Expr *ReplayableCond = nullptr;
+  if (auto *RC = S.getSingleClause<OMPReplayableClause>()) {
+    ReplayableCond = RC->getCondition();
+    if (!ReplayableCond) {
+      ReplayableCond = IntegerLiteral::Create(
+          getContext(), llvm::APInt(32, 1),
+          getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
+          SourceLocation());
+    }
+  }
+  // Propagate the replayable signal into Data so that reduction init can
+  // route to the taskgraph-aware runtime entry point when the task may
+  // participate in taskgraph replay.
+  Data.ReplayableCond = ReplayableCond;
   auto &&BodyGen = [CS](CodeGenFunction &CGF, PrePostActionTy &) {
     CGF.EmitStmt(CS->getCapturedStmt());
   };
-  auto &&TaskGen = [&S, SharedsTy, CapturedStruct,
-                    IfCond](CodeGenFunction &CGF, llvm::Function *OutlinedFn,
-                            const OMPTaskDataTy &Data) {
+  auto &&TaskGen = [&S, SharedsTy, CapturedStruct, IfCond, ReplayableCond](
+                       CodeGenFunction &CGF, llvm::Function *OutlinedFn,
+                       const OMPTaskDataTy &Data) {
     CGF.CGM.getOpenMPRuntime().emitTaskCall(CGF, S.getBeginLoc(), S, OutlinedFn,
                                             SharedsTy, CapturedStruct, IfCond,
-                                            Data);
+                                            ReplayableCond, Data);
   };
   auto LPCRegion =
       CGOpenMPRuntime::LastprivateConditionalRAII::disable(*this, S);
@@ -5877,7 +5902,18 @@ void CodeGenFunction::EmitOMPTaskwaitDirective(const OMPTaskwaitDirective &S) {
   // Build list of dependences
   buildDependences(S, Data);
   Data.HasNowaitClause = S.hasClausesOfKind<OMPNowaitClause>();
-  CGM.getOpenMPRuntime().emitTaskwaitCall(*this, S.getBeginLoc(), Data);
+  const Expr *ReplayableCond = nullptr;
+  if (auto *RC = S.getSingleClause<OMPReplayableClause>()) {
+    ReplayableCond = RC->getCondition();
+    if (!ReplayableCond) {
+      ReplayableCond = IntegerLiteral::Create(
+          getContext(), llvm::APInt(32, 1),
+          getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
+          SourceLocation());
+    }
+  }
+  CGM.getOpenMPRuntime().emitTaskwaitCall(*this, S.getBeginLoc(),
+                                          ReplayableCond, Data);
 }
 
 void CodeGenFunction::EmitOMPTaskgraphDirective(
@@ -8234,11 +8270,26 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) {
     }
   }
 
+  const Expr *ReplayableCond = nullptr;
+  if (auto *RC = S.getSingleClause<OMPReplayableClause>()) {
+    ReplayableCond = RC->getCondition();
+    if (!ReplayableCond) {
+      ReplayableCond = IntegerLiteral::Create(
+          getContext(), llvm::APInt(32, 1),
+          getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
+          SourceLocation());
+    }
+  }
+
   OMPTaskDataTy Data;
   // Check if taskloop must be emitted without taskgroup.
   Data.Nogroup = S.getSingleClause<OMPNogroupClause>();
   // TODO: Check if we should emit tied or untied task.
   Data.Tied = true;
+  // Propagate the replayable signal into Data so that reduction init can
+  // route to the taskgraph-aware runtime entry point when the taskloop may
+  // participate in taskgraph replay.
+  Data.ReplayableCond = ReplayableCond;
   // Set scheduling for taskloop
   if (const auto *Clause = S.getSingleClause<OMPGrainsizeClause>()) {
     // grainsize clause
@@ -8353,15 +8404,16 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) {
                                (*LIP)->getType(), S.getBeginLoc()));
     });
   };
-  auto &&TaskGen = [&S, SharedsTy, CapturedStruct,
-                    IfCond](CodeGenFunction &CGF, llvm::Function *OutlinedFn,
-                            const OMPTaskDataTy &Data) {
+  auto &&TaskGen = [&S, SharedsTy, CapturedStruct, IfCond, ReplayableCond](
+                       CodeGenFunction &CGF, llvm::Function *OutlinedFn,
+                       const OMPTaskDataTy &Data) {
     auto &&CodeGen = [&S, OutlinedFn, SharedsTy, CapturedStruct, IfCond,
+                      ReplayableCond,
                       &Data](CodeGenFunction &CGF, PrePostActionTy &) {
       OMPLoopScope PreInitScope(CGF, S);
-      CGF.CGM.getOpenMPRuntime().emitTaskLoopCall(CGF, S.getBeginLoc(), S,
-                                                  OutlinedFn, SharedsTy,
-                                                  CapturedStruct, IfCond, Data);
+      CGF.CGM.getOpenMPRuntime().emitTaskLoopCall(
+          CGF, S.getBeginLoc(), S, OutlinedFn, SharedsTy, CapturedStruct,
+          IfCond, ReplayableCond, Data);
     };
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_taskloop,
                                                     CodeGen);
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 5be99550f1ef6..1338001b73b30 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3393,6 +3393,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
       Clause = ParseOpenMPClause(CKind, WrongDirective);
     break;
   case OMPC_graph_reset:
+  case OMPC_replayable:
     if (!FirstClause) {
       Diag(Tok, diag::err_omp_more_one_clause)
           << getOpenMPDirectiveName(DKind, OMPVersion)
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index cd90c482f4a03..c1bf6671cfdec 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -16906,6 +16906,9 @@ OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind,
   case OMPC_graph_reset:
     Res = ActOnOpenMPGraphResetClause(Expr, StartLoc, LParenLoc, EndLoc);
     break;
+  case OMPC_replayable:
+    Res = ActOnOpenMPReplayableClause(StartLoc, EndLoc, LParenLoc, Expr);
+    break;
   case OMPC_novariants:
     Res = ActOnOpenMPNovariantsClause(Expr, StartLoc, LParenLoc, EndLoc);
     break;
@@ -18646,6 +18649,11 @@ OMPClause *SemaOpenMP::ActOnOpenMPClause(OpenMPClauseKind Kind,
   case OMPC_self_maps:
     Res = ActOnOpenMPSelfMapsClause(StartLoc, EndLoc);
     break;
+  case OMPC_replayable:
+    Res = ActOnOpenMPReplayableClause(StartLoc, EndLoc,
+                                      /*LParenLoc=*/SourceLocation(),
+                                      /*Condition=*/nullptr);
+    break;
   case OMPC_destroy:
     Res = ActOnOpenMPDestroyClause(/*InteropVar=*/nullptr, StartLoc,
                                    /*LParenLoc=*/SourceLocation(),
@@ -18883,6 +18891,26 @@ OMPClause *SemaOpenMP::ActOnOpenMPSelfMapsClause(SourceLocation StartLoc,
   return new (getASTContext()) OMPSelfMapsClause(StartLoc, EndLoc);
 }
 
+OMPClause *SemaOpenMP::ActOnOpenMPReplayableClause(SourceLocation StartLoc,
+                                                   SourceLocation EndLoc,
+                                                   SourceLocation LParenLoc,
+                                                   Expr *Condition) {
+  // Convert the argument to a boolean condition only when one was written in
+  // parentheses and nothing in it is dependent; otherwise store it unchanged.
+  const bool NeedsCheck =
+      Condition && LParenLoc.isValid() && !Condition->isValueDependent() &&
+      !Condition->isTypeDependent() && !Condition->isInstantiationDependent() &&
+      !Condition->containsUnexpandedParameterPack();
+  if (NeedsCheck) {
+    ExprResult Checked = SemaRef.CheckBooleanCondition(StartLoc, Condition);
+    if (Checked.isInvalid())
+      return nullptr;
+    Condition = Checked.get();
+  }
+  return new (getASTContext())
+      OMPReplayableClause(Condition, StartLoc, LParenLoc, EndLoc);
+}
+
 StmtResult
 SemaOpenMP::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses,
                                         SourceLocation StartLoc,
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 91c5314886692..07812bd8074f9 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -1903,6 +1903,18 @@ class TreeTransform {
                                                       LParenLoc, Condition);
   }
 
+  /// Build a new OpenMP 'replayable' clause.
+  ///
+  /// By default, performs semantic analysis to build the new OpenMP clause.
+  /// Subclasses may override this routine to provide different behavior.
+  OMPClause *RebuildOMPReplayableClause(Expr *Condition,
+                                        SourceLocation StartLoc,
+                                        SourceLocation LParenLoc,
+                                        SourceLocation EndLoc) {
+    return getSema().OpenMP().ActOnOpenMPReplayableClause(StartLoc, EndLoc,
+                                                          LParenLoc, Condition);
+  }
+
   /// Build a new OpenMP 'private' clause.
   ///
   /// By default, performs semantic analysis to build the new OpenMP clause.
@@ -10866,6 +10878,19 @@ TreeTransform<Derived>::TransformOMPNowaitClause(OMPNowaitClause *C) {
                                              C->getLParenLoc(), C->getEndLoc());
 }
 
+template <typename Derived>
+OMPClause *
+TreeTransform<Derived>::TransformOMPReplayableClause(OMPReplayableClause *C) {
+  // A clause written without an argument has no condition: rebuild with null.
+  auto *OldCond = C->getCondition();
+  ExprResult NewCond =
+      OldCond ? getDerived().TransformExpr(OldCond) : ExprResult();
+  if (NewCond.isInvalid())
+    return nullptr;
+  return getDerived().RebuildOMPReplayableClause(
+      NewCond.get(), C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
+}
+
 template <typename Derived>
 OMPClause *
 TreeTransform<Derived>::TransformOMPUntiedClause(OMPUntiedClause *C) {
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index d98a5a3f240ef..a1f53a2c742c2 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -11706,6 +11706,9 @@ OMPClause *OMPClauseReader::readClause() {
   case llvm::omp::OMPC_graph_reset:
     C = new (Context) OMPGraphResetClause();
     break;
+  case llvm::omp::OMPC_replayable:
+    C = new (Context) OMPReplayableClause();
+    break;
   case llvm::omp::OMPC_num_tasks:
     C = new (Context) OMPNumTasksClause();
     break;
@@ -12010,6 +12013,11 @@ void OMPClauseReader::VisitOMPNowaitClause(OMPNowaitClause *C) {
   C->setLParenLoc(Record.readSourceLocation());
 }
 
+void OMPClauseReader::VisitOMPReplayableClause(OMPReplayableClause *C) {
+  C->setCondition(Record.readSubExpr()); // same order as OMPClauseWriter
+  C->setLParenLoc(Record.readSourceLocation());
+}
+
 void OMPClauseReader::VisitOMPUntiedClause(OMPUntiedClause *) {}
 
 void OMPClauseReader::VisitOMPMergeableClause(OMPMergeableClause *) {}
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index df0d1a35ea715..20e981b9da3b5 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -8166,6 +8166,11 @@ void OMPClauseWriter::VisitOMPNowaitClause(OMPNowaitClause *C) {
   Record.AddSourceLocation(C->getLParenLoc());
 }
 
+void OMPClauseWriter::VisitOMPReplayableClause(OMPReplayableClause *C) {
+  Record.AddStmt(C->getCondition()); // OMPClauseReader reads in this order
+  Record.AddSourceLocation(C->getLParenLoc());
+}
+
 void OMPClauseWriter::VisitOMPUntiedClause(OMPUntiedClause *) {}
 
 void OMPClauseWriter::VisitOMPMergeableClause(OMPMergeableClause *) {}
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 94c7501ff7887..1fa69e6ec876d 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2407,6 +2407,10 @@ void OMPClauseEnqueue::VisitOMPNowaitClause(const OMPNowaitClause *C) {
   Visitor->AddStmt(C->getCondition());
 }
 
+void OMPClauseEnqueue::VisitOMPReplayableClause(const OMPReplayableClause *C) {
+  Visitor->AddStmt(C->getCondition());
+}
+
 void OMPClauseEnqueue::VisitOMPUntiedClause(const OMPUntiedClause *) {}
 
 void OMPClauseEnqueue::VisitOMPMergeableClause(const OMPMergeableClause *) {}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 91ce4eac9e370..190b4de035980 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -499,6 +499,7 @@ def OMPC_Release : Clause<[Spelling<"release">]> {
   let clangClass = "OMPReleaseClause";
 }
 def OMPC_Replayable : Clause<[Spelling<"replayable">]> {
+  let clangClass = "OMPReplayableClause";
   let flangClass = "OmpReplayableClause";
   let isValueOptional = true;
 }

>From d8f7479286f6e1496df88fa74b18e36db4723384 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Mon, 20 Apr 2026 06:05:36 -0500
Subject: [PATCH 09/24] [OpenMP] Add tests for 'replayable' clause

Parsing and diagnostics tests for the replayable clause added by
the previous patch, plus two libomp runtime tests
(taskgraph_replayable_nonlexical_reduction_minimal_taskloop.cpp
and taskgraph_replayable_nonlexical_recursive_reduction.cpp) for
reductions on non-lexically-nested replayable taskloops, which
exercise the Clang dispatch through __kmpc_taskgraph_taskred_init
that stashes the reduction input for later replay.

Assisted-By: Codex with gpt-5.3
Assisted-By: Claude Opus 4.7

commit-id:6a552a7c

Pull Request: https://github.com/llvm/llvm-project/pull/194054
---
 clang/test/OpenMP/replayable_ast_print.cpp    | 233 ++++++++++++++++++
 clang/test/OpenMP/replayable_messages.cpp     |  51 ++++
 ...layable_nonlexical_recursive_reduction.cpp | 147 +++++++++++
 ..._nonlexical_reduction_minimal_taskloop.cpp | 105 ++++++++
 4 files changed, 536 insertions(+)
 create mode 100644 clang/test/OpenMP/replayable_ast_print.cpp
 create mode 100644 clang/test/OpenMP/replayable_messages.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_recursive_reduction.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_reduction_minimal_taskloop.cpp

diff --git a/clang/test/OpenMP/replayable_ast_print.cpp b/clang/test/OpenMP/replayable_ast_print.cpp
new file mode 100644
index 0000000000000..cea97b8b1cd73
--- /dev/null
+++ b/clang/test/OpenMP/replayable_ast_print.cpp
@@ -0,0 +1,233 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check AST and unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -ast-dump  %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s --check-prefix=PRINT
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-print    %s | FileCheck %s --check-prefix=PRINT
+
+#ifndef HEADER
+#define HEADER
+
+void replayable_clauses() { // per directive: bare, (false), (true), (A > 5)
+  int A = 1;
+  int B[10];
+
+  // --- omp task ---
+
+  // DUMP: OMPTaskDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // PRINT: #pragma omp task replayable
+  #pragma omp task replayable
+  {}
+
+  // DUMP: OMPTaskDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+  // PRINT: #pragma omp task replayable(false)
+  #pragma omp task replayable(false)
+  {}
+
+  // DUMP: OMPTaskDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+  // PRINT: #pragma omp task replayable(true)
+  #pragma omp task replayable(true)
+  {}
+
+  // DUMP: OMPTaskDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: BinaryOperator {{.*}} 'bool' '>'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 5
+  // PRINT: #pragma omp task replayable(A > 5)
+  #pragma omp task replayable(A > 5)
+  {}
+
+  // --- omp taskloop ---
+
+  // DUMP: OMPTaskLoopDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // PRINT: #pragma omp taskloop replayable
+  #pragma omp taskloop replayable
+  for (int i = 0; i < 10; ++i)
+    {}
+
+  // DUMP: OMPTaskLoopDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+  // PRINT: #pragma omp taskloop replayable(false)
+  #pragma omp taskloop replayable(false)
+  for (int i = 0; i < 10; ++i)
+    {}
+
+  // DUMP: OMPTaskLoopDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+  // PRINT: #pragma omp taskloop replayable(true)
+  #pragma omp taskloop replayable(true)
+  for (int i = 0; i < 10; ++i)
+    {}
+
+  // DUMP: OMPTaskLoopDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: BinaryOperator {{.*}} 'bool' '>'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 5
+  // PRINT: #pragma omp taskloop replayable(A > 5)
+  #pragma omp taskloop replayable(A > 5)
+  for (int i = 0; i < 10; ++i)
+    {}
+
+  // --- omp taskwait ---
+
+  // DUMP: OMPTaskwaitDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // PRINT: #pragma omp taskwait replayable
+  #pragma omp taskwait replayable
+
+  // DUMP: OMPTaskwaitDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+  // PRINT: #pragma omp taskwait replayable(false)
+  #pragma omp taskwait replayable(false)
+
+  // DUMP: OMPTaskwaitDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+  // PRINT: #pragma omp taskwait replayable(true)
+  #pragma omp taskwait replayable(true)
+
+  // DUMP: OMPTaskwaitDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: BinaryOperator {{.*}} 'bool' '>'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 5
+  // PRINT: #pragma omp taskwait replayable(A > 5)
+  #pragma omp taskwait replayable(A > 5)
+
+  // --- omp target ---
+
+  // DUMP: OMPTargetDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // PRINT: #pragma omp target replayable
+  #pragma omp target replayable
+  {}
+
+  // DUMP: OMPTargetDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+  // PRINT: #pragma omp target replayable(false)
+  #pragma omp target replayable(false)
+  {}
+
+  // DUMP: OMPTargetDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+  // PRINT: #pragma omp target replayable(true)
+  #pragma omp target replayable(true)
+  {}
+
+  // DUMP: OMPTargetDirective
+  // DUMP-NEXT: OMPReplayableClause
+  // DUMP-NEXT: BinaryOperator {{.*}} 'bool' '>'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 5
+  // PRINT: #pragma omp target replayable(A > 5)
+  #pragma omp target replayable(A > 5)
+  {}
+
+  // --- omp target enter data ---
+
+  // DUMP: OMPTargetEnterDataDirective
+  // DUMP: OMPReplayableClause
+  // PRINT: #pragma omp target enter data map(to: A) replayable
+  #pragma omp target enter data map(to: A) replayable
+
+  // DUMP: OMPTargetEnterDataDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+  // PRINT: #pragma omp target enter data map(to: A) replayable(false)
+  #pragma omp target enter data map(to: A) replayable(false)
+
+  // DUMP: OMPTargetEnterDataDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+  // PRINT: #pragma omp target enter data map(to: A) replayable(true)
+  #pragma omp target enter data map(to: A) replayable(true)
+
+  // DUMP: OMPTargetEnterDataDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: BinaryOperator {{.*}} 'bool' '>'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 5
+  // PRINT: #pragma omp target enter data map(to: A) replayable(A > 5)
+  #pragma omp target enter data map(to: A) replayable(A > 5)
+
+  // --- omp target exit data ---
+
+  // DUMP: OMPTargetExitDataDirective
+  // DUMP: OMPReplayableClause
+  // PRINT: #pragma omp target exit data map(from: A) replayable
+  #pragma omp target exit data map(from: A) replayable
+
+  // DUMP: OMPTargetExitDataDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+  // PRINT: #pragma omp target exit data map(from: A) replayable(false)
+  #pragma omp target exit data map(from: A) replayable(false)
+
+  // DUMP: OMPTargetExitDataDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+  // PRINT: #pragma omp target exit data map(from: A) replayable(true)
+  #pragma omp target exit data map(from: A) replayable(true)
+
+  // DUMP: OMPTargetExitDataDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: BinaryOperator {{.*}} 'bool' '>'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 5
+  // PRINT: #pragma omp target exit data map(from: A) replayable(A > 5)
+  #pragma omp target exit data map(from: A) replayable(A > 5)
+
+  // --- omp target update ---
+
+  // DUMP: OMPTargetUpdateDirective
+  // DUMP: OMPReplayableClause
+  // PRINT: #pragma omp target update to(A) replayable
+  #pragma omp target update to(A) replayable
+
+  // DUMP: OMPTargetUpdateDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' false
+  // PRINT: #pragma omp target update to(A) replayable(false)
+  #pragma omp target update to(A) replayable(false)
+
+  // DUMP: OMPTargetUpdateDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: CXXBoolLiteralExpr {{.*}} 'bool' true
+  // PRINT: #pragma omp target update to(A) replayable(true)
+  #pragma omp target update to(A) replayable(true)
+
+  // DUMP: OMPTargetUpdateDirective
+  // DUMP: OMPReplayableClause
+  // DUMP-NEXT: BinaryOperator {{.*}} 'bool' '>'
+  // DUMP-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
+  // DUMP-NEXT: DeclRefExpr {{.*}} 'int' lvalue Var {{.*}} 'A' 'int'
+  // DUMP-NEXT: IntegerLiteral {{.*}} 'int' 5
+  // PRINT: #pragma omp target update to(A) replayable(A > 5)
+  #pragma omp target update to(A) replayable(A > 5)
+}
+#endif
diff --git a/clang/test/OpenMP/replayable_messages.cpp b/clang/test/OpenMP/replayable_messages.cpp
new file mode 100644
index 0000000000000..10172f50162ad
--- /dev/null
+++ b/clang/test/OpenMP/replayable_messages.cpp
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -verify=expected,omp51 -fsyntax-only %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -verify=expected,omp60 -fsyntax-only %s
+
+// Tests that the 'replayable' clause is accepted in OpenMP 6.0 and rejected in
+// prior versions on all seven supported directives. Also tests duplicate-clause
+// and invalid-condition diagnostics.
+
+void foo() {}
+
+void replayable_messages() {
+  int A = 1;
+
+  #pragma omp task replayable // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp task'}}
+  {}
+
+  #pragma omp taskloop replayable // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp taskloop'}}
+  for (int i = 0; i < 10; ++i)
+    {}
+
+  #pragma omp taskwait replayable // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp taskwait'}}
+
+  #pragma omp target replayable // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp target'}}
+  {}
+
+  #pragma omp target enter data map(to: A) replayable // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp target enter data'}}
+
+  #pragma omp target exit data map(from: A) replayable // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp target exit data'}}
+
+  #pragma omp target update to(A) replayable // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp target update'}}
+
+  #pragma omp task replayable replayable // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp task'}} omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp task'}} expected-error {{directive '#pragma omp task' cannot contain more than one 'replayable' clause}}
+  {}
+
+  #pragma omp task replayable(foo()) // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp task'}} omp60-error {{value of type 'void' is not contextually convertible to 'bool'}}
+  {}
+
+  #pragma omp taskloop replayable(A > 0) // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp taskloop'}}
+  for (int i = 0; i < 10; ++i)
+    {}
+
+  #pragma omp taskwait replayable(A > 0) // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp taskwait'}}
+
+  #pragma omp target replayable(A > 0) // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp target'}}
+  {}
+
+  #pragma omp target enter data map(to: A) replayable(A > 0) // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp target enter data'}}
+
+  #pragma omp target exit data map(from: A) replayable(A > 0) // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp target exit data'}}
+
+  #pragma omp target update to(A) replayable(A > 0) // omp51-error {{unexpected OpenMP clause 'replayable' in directive '#pragma omp target update'}}
+}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_recursive_reduction.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_recursive_reduction.cpp
new file mode 100644
index 0000000000000..c349bd2f34146
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_recursive_reduction.cpp
@@ -0,0 +1,147 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+// Exercises reduction(+: ...) on dynamically (non-lexically) nested
+// replayable constructs reached through a recursive driver:
+//
+//   - emit_reduction_taskloop():  replayable taskloop performing a reduction
+//                                 into a file-scope accumulator (Sum).  The
+//                                 'seed' parameter is captured as
+//                                 firstprivate(saved:) so the snapshot
+//                                 taken at recording is reused unchanged
+//                                 on every replay (OpenMP 6.0 [7.5.4] /
+//                                 [14.3]).
+//   - emit_publish_task():        replayable task that reads Sum and stores
+//                                 it into a distinct slot of a file-scope
+//                                 Snapshots[] array; the slot index is
+//                                 carried into the task as a
+//                                 firstprivate(saved:) capture so that
+//                                 every recursion-level publish lands in
+//                                 its own slot and no two publishes race
+//                                 for the same destination
+//   - driver():                   recursive function that calls
+//                                 emit_reduction_taskloop() on descent and
+//                                 emit_publish_task() on ascent, so on
+//                                 replay these constructs are dispatched
+//                                 from differing recursive stack frames
+//   - run_taskgraph():            wraps the recursive driver() in
+//                                 #pragma omp taskgraph and returns Sum
+//                                 (the cumulative reduction, which is the
+//                                 deterministic test signal; Snapshots[]
+//                                 is written purely as a side-effect to
+//                                 exercise the replayable publish tasks)
+//
+// Per the saved-snapshot semantics, every replay reproduces the recording
+// run's reduction value, independent of the seed argument passed to the
+// replay invocation.  The test therefore (i) verifies that the recording
+// matches expected_result(Seeds[0]) and (ii) verifies that every replay
+// matches the recording.  Historically this exercise hit two consecutive
+// crashes on replay: first OMP: Error #302 from the relocate helper, because
+// the (static) reduction target's shareds slot couldn't be re-projected from a
+// non-OpenMP outer scope, then a taskred-lookup assertion because the
+// recording-time taskgroup_t holding the reduction state had been torn
+// down.  Both gaps are now fixed in the compiler and runtime.
+
+#include <cstdio>
+
+static constexpr int MaxDepth = 4;
+
+static volatile int Sum = 0;
+static volatile int Snapshots[MaxDepth] = {0, 0, 0, 0};
+
+__attribute__((noinline)) static void emit_reduction_taskloop(int seed) {
+#pragma omp taskloop replayable num_tasks(8) reduction(+ : Sum)                \
+    firstprivate(saved : seed)
+  for (int i = 0; i < 16; ++i)
+    Sum += seed + i;
+}
+
+__attribute__((noinline)) static void emit_publish_task(int slot) {
+#pragma omp task replayable firstprivate(saved : slot)
+  {
+    Snapshots[slot] = Sum;
+  }
+}
+
+__attribute__((noinline)) static void driver(int depth, int seed) {
+  emit_reduction_taskloop(seed + depth);
+  if (depth == 0) {
+    emit_publish_task(depth);
+    return;
+  }
+  driver(depth - 1, seed);
+  emit_publish_task(depth);
+}
+
+__attribute__((noinline)) static int run_taskgraph(int seed) {
+  Sum = 0;
+  for (int i = 0; i < MaxDepth; ++i)
+    Snapshots[i] = 0;
+
+#pragma omp taskgraph graph_id(917)
+  {
+    driver(MaxDepth - 1, seed);
+  }
+
+  // Sum is the deterministic test signal: after the taskgraph's implicit
+  // taskwait it holds the cumulative reduction across all MaxDepth
+  // taskloops.  The per-slot Snapshots[] entries are written by the
+  // replayable publish tasks but race with the reduction taskloops, so
+  // their individual values are non-deterministic and are intentionally
+  // not compared here.  The array exists only so that each replayable
+  // publish task targets a distinct destination instead of clobbering a
+  // shared scalar.
+  return Sum;
+}
+
+__attribute__((noinline)) static int expected_result(int seed) {
+  int total = 0; // serial oracle: one 16-iteration pass per recursion level
+  for (int level = MaxDepth; level-- > 0;)
+    for (int iter = 0; iter != 16; ++iter)
+      total += seed + level + iter;
+  return total;
+}
+
+int main() {
+  constexpr int NumRuns = 4;
+  constexpr int Seeds[NumRuns] = {1, 5, 11, 23};
+
+  int recorded = -1;
+  bool failed = false;
+
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp single
+    {
+      recorded = run_taskgraph(Seeds[0]);
+      const int exp0 = expected_result(Seeds[0]);
+      if (recorded != exp0) {
+        std::fprintf(stderr, "FAIL initial record got=%d expected=%d\n",
+                     recorded, exp0);
+        failed = true;
+      }
+
+      // Saved-snapshot semantics: every replay should reproduce the
+      // recording's reduction value, regardless of the live argument.
+      for (int i = 1; i < NumRuns; ++i) {
+        const int replayed = run_taskgraph(Seeds[i]);
+        if (replayed != recorded) {
+          std::fprintf(stderr, "FAIL replay %d seed=%d got=%d recorded=%d\n", i,
+                       Seeds[i], replayed, recorded);
+          failed = true;
+        }
+      }
+    }
+  }
+
+  if (failed)
+    return 1;
+
+  std::fprintf(stderr, "PASS non-lexical recursive reduction result=%d\n",
+               recorded);
+  return 0;
+}
+
+// CHECK: PASS non-lexical recursive reduction result=
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_reduction_minimal_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_reduction_minimal_taskloop.cpp
new file mode 100644
index 0000000000000..17b36f16a02ef
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_reduction_minimal_taskloop.cpp
@@ -0,0 +1,105 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+// Minimal end-to-end exercise of reduction(+: ...) on a replayable taskloop
+// that is dynamically (non-lexically) nested inside a taskgraph: the
+// taskloop directive lives in a helper called from inside the taskgraph
+// region, not in the taskgraph's lexical body.
+//
+//   - Sum is a file-scope static, so the reduction's accumulator address is
+//     stable across runs and the taskgraph relocate helper has no shareds
+//     slot it must refresh (the static capture is link-time-fixed).
+//   - 'seed' is captured into the taskloop's per-task '.kmp_privates.t'
+//     snapshot as firstprivate(saved: ...); per OpenMP 6.0 [7.5.4] / [14.3]
+//     the snapshot is taken at recording time and reused unchanged on every
+//     replay.
+//   - The recording run therefore produces the expected reduction for
+//     Seeds[0], and every subsequent replay (each invoked with a different
+//     seed) produces exactly the recorded value because the saved snapshot
+//     of 'seed' is what the body sees.
+//
+// Historically this test crashed on replay with OMP: Error #302 because
+// the taskgraph relocate helper refused to handle the (static) reduction
+// capture, and even after that was fixed the reduction body still
+// asserted because the recording-time taskgroup_t had been torn down and
+// the taskred state was unreachable.  Both gaps are addressed: the
+// relocate helper now treats captures of static-storage variables as
+// no-op-safe, and the runtime now stashes reduction-init input into the
+// surrounding taskgraph at recording so the replay machinery can re-create
+// the taskred state on every replay.
+
+#include <cstdio>
+
+static volatile int Sum = 0;
+
+__attribute__((noinline)) static void emit_reduction_taskloop(int seed) {
+#pragma omp taskloop replayable num_tasks(8) reduction(+ : Sum)                \
+    firstprivate(saved : seed)
+  for (int i = 0; i < 16; ++i)
+    Sum += seed + i;
+}
+
+__attribute__((noinline)) static int run_taskgraph(int seed) {
+  Sum = 0;
+
+#pragma omp taskgraph graph_id(921)
+  {
+    emit_reduction_taskloop(seed);
+  }
+
+  return Sum;
+}
+
+__attribute__((noinline)) static int expected_result(int seed) {
+  int total = 0; // serial oracle for one pass of the 16-iteration loop
+  for (int iter = 0; iter != 16; ++iter)
+    total += seed + iter;
+  return total;
+}
+
+int main() {
+  constexpr int NumRuns = 4;
+  constexpr int Seeds[NumRuns] = {1, 5, 11, 23};
+
+  int recorded = -1;
+  bool failed = false;
+
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp single
+    {
+      recorded = run_taskgraph(Seeds[0]);
+      const int exp0 = expected_result(Seeds[0]);
+      if (recorded != exp0) {
+        std::fprintf(stderr, "FAIL initial record got=%d expected=%d\n",
+                     recorded, exp0);
+        failed = true;
+      }
+
+      // 'seed' is firstprivate(saved:), so every replay's body sees the
+      // snapshot taken at recording time.  The reduction therefore yields
+      // the same value as the recording regardless of the live argument
+      // passed on the replay.
+      for (int i = 1; i < NumRuns; ++i) {
+        const int replayed = run_taskgraph(Seeds[i]);
+        if (replayed != recorded) {
+          std::fprintf(stderr, "FAIL replay %d seed=%d got=%d recorded=%d\n", i,
+                       Seeds[i], replayed, recorded);
+          failed = true;
+        }
+      }
+    }
+  }
+
+  if (failed)
+    return 1;
+
+  std::fprintf(stderr,
+               "PASS non-lexical replayable taskloop reduction result=%d\n",
+               recorded);
+  return 0;
+}
+
+// CHECK: PASS non-lexical replayable taskloop reduction result=

>From f69c79f84f0e0befc504317537b48d46229b8df2 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Thu, 23 Apr 2026 14:47:24 -0500
Subject: [PATCH 10/24] [OpenMP] OpenMP 6.0 "taskgraph" support, additional
 tests from previous implementation

This patch adds tests carried over from previous iterations of taskgraph
support, with some changes to adapt them to the new implementation.

Co-Authored-By: Josep Pinot <josep at bsc.es>

Pull Request: https://github.com/llvm/llvm-project/pull/194055
---
 openmp/runtime/test/tasking/omp_taskgraph.cpp | 37 ++++++++++
 .../test/tasking/omp_taskgraph_deps.cpp       | 54 +++++++++++++++
 .../test/tasking/omp_taskgraph_multiTDGs.cpp  | 68 +++++++++++++++++++
 .../test/tasking/omp_taskgraph_taskloop.cpp   | 41 +++++++++++
 4 files changed, 200 insertions(+)
 create mode 100644 openmp/runtime/test/tasking/omp_taskgraph.cpp
 create mode 100644 openmp/runtime/test/tasking/omp_taskgraph_deps.cpp
 create mode 100644 openmp/runtime/test/tasking/omp_taskgraph_multiTDGs.cpp
 create mode 100644 openmp/runtime/test/tasking/omp_taskgraph_taskloop.cpp

diff --git a/openmp/runtime/test/tasking/omp_taskgraph.cpp b/openmp/runtime/test/tasking/omp_taskgraph.cpp
new file mode 100644
index 0000000000000..67567a9b31870
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_taskgraph.cpp
@@ -0,0 +1,37 @@
+// clang-format off
+// REQUIRES: omp_taskgraph_experimental
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run | FileCheck %s
+// clang-format on
+#include <iostream>
+#include <cassert>
+#define NT 100
+
+// NOTE(review): emulation stub; ident_t is never referenced in this test.
+typedef struct ident {
+  void *dummy;
+} ident_t;
+
+void func(int *num_exec) { (*num_exec)++; } // Bumps the caller's counter.
+
+int main() {
+  int num_exec = 0; // incremented by the task body, func()
+  int num_tasks = 0; // incremented by the non-task code in the region
+  int x = 0; // NOTE(review): unused
+#pragma omp parallel
+#pragma omp single
+  for (int iter = 0; iter < NT; ++iter) {
+#pragma omp taskgraph
+    {
+      num_tasks++;
+#pragma omp task
+      func(&num_exec);
+    }
+  }
+
+  assert(num_tasks == 1); // region body is expected to run only once
+  assert(num_exec == NT); // the task is expected to run on every iteration
+
+  std::cout << "Passed" << std::endl;
+  return 0;
+}
+// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_taskgraph_deps.cpp b/openmp/runtime/test/tasking/omp_taskgraph_deps.cpp
new file mode 100644
index 0000000000000..e83e0c2f2ce62
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_taskgraph_deps.cpp
@@ -0,0 +1,54 @@
+// clang-format off
+// REQUIRES: omp_taskgraph_experimental
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run | FileCheck %s
+// clang-format on
+#include <iostream>
+#include <cassert>
+#define NT 100
+#define MULTIPLIER 100
+#define DECREMENT 5
+
+int val;
+// NOTE(review): emulation stub; ident_t is never referenced in this test.
+typedef struct ident {
+  void *dummy;
+} ident_t;
+
+void sub() {
+#pragma omp atomic
+  val -= DECREMENT;
+}
+
+void add() {
+#pragma omp atomic
+  val += DECREMENT;
+}
+
+void mult() {
+  // no atomicity needed, can only be executed by 1 thread
+  // and no concurrency with other tasks possible
+  val *= MULTIPLIER;
+}
+
+int main() {
+  val = 0;
+  int *x, *y; // never dereferenced; used only as depend-clause list items
+#pragma omp parallel
+#pragma omp single
+  for (int iter = 0; iter < NT; ++iter) {
+#pragma omp taskgraph
+    {
+#pragma omp task depend(out : y)
+      add();
+#pragma omp task depend(out : x)
+      sub();
+#pragma omp task depend(in : x, y)
+      mult();
+    }
+  }
+  assert(val == 0); // add, sub, then mult give (val + 5 - 5) * 100 == 0
+
+  std::cout << "Passed" << std::endl;
+  return 0;
+}
+// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_taskgraph_multiTDGs.cpp b/openmp/runtime/test/tasking/omp_taskgraph_multiTDGs.cpp
new file mode 100644
index 0000000000000..eedd4e642c956
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_taskgraph_multiTDGs.cpp
@@ -0,0 +1,68 @@
+// clang-format off
+// REQUIRES: omp_taskgraph_experimental
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run | FileCheck %s
+// clang-format on
+#include <iostream>
+#include <cassert>
+#define NT 20
+#define MULTIPLIER 100
+#define DECREMENT 5
+
+// NOTE(review): emulation stub; ident_t is never referenced in this test.
+typedef struct ident {
+  void *dummy;
+} ident_t;
+
+int val;
+
+void sub() {
+#pragma omp atomic
+  val -= DECREMENT;
+}
+
+void add() {
+#pragma omp atomic
+  val += DECREMENT;
+}
+
+void mult() {
+  // no atomicity needed, can only be executed by 1 thread
+  // and no concurrency with other tasks possible
+  val *= MULTIPLIER;
+}
+
+int main() {
+  int num_tasks = 0; // counts runs of each region's non-task code
+  int *x, *y; // never dereferenced; used only as depend-clause list items
+#pragma omp parallel
+#pragma omp single
+  for (int iter = 0; iter < NT; ++iter) {
+#pragma omp taskgraph
+    {
+      num_tasks++;
+#pragma omp task depend(out : y)
+      add();
+#pragma omp task depend(out : x)
+      sub();
+#pragma omp task depend(in : x, y)
+      mult();
+    }
+#pragma omp taskgraph
+    {
+      num_tasks++;
+#pragma omp task depend(out : y)
+      add();
+#pragma omp task depend(out : x)
+      sub();
+#pragma omp task depend(in : x, y)
+      mult();
+    }
+  }
+
+  assert(num_tasks == 2); // one body execution expected per taskgraph
+  assert(val == 0); // add, sub, then mult give (val + 5 - 5) * 100 == 0
+
+  std::cout << "Passed" << std::endl;
+  return 0;
+}
+// CHECK: Passed
diff --git a/openmp/runtime/test/tasking/omp_taskgraph_taskloop.cpp b/openmp/runtime/test/tasking/omp_taskgraph_taskloop.cpp
new file mode 100644
index 0000000000000..f4fe290a90f0f
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_taskgraph_taskloop.cpp
@@ -0,0 +1,41 @@
+// clang-format off
+// REQUIRES: omp_taskgraph_experimental
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run | FileCheck %s
+// clang-format on
+#include <iostream>
+#include <cassert>
+
+#define NT 20 // number of times the taskgraph region is encountered
+#define N (128 * 128) // array length; parenthesized for safe expansion
+
+typedef struct ident {
+  void *dummy;
+} ident_t; // NOTE(review): never referenced in this test.
+
+int main() {
+  int num_tasks = 0; // counts runs of the region's non-task code
+
+  int array[N];
+  for (int i = 0; i < N; ++i)
+    array[i] = 1;
+
+  long sum = 0;
+#pragma omp parallel
+#pragma omp single
+  for (int iter = 0; iter < NT; ++iter) {
+#pragma omp taskgraph
+    {
+      num_tasks++;
+#pragma omp taskloop reduction(+ : sum) num_tasks(4096)
+      for (int i = 0; i < N; ++i) {
+        sum += array[i];
+      }
+    }
+  }
+  assert(sum == N * NT); // N ones are summed on each of the NT iterations
+  assert(num_tasks == 1); // region body is expected to run only once
+
+  std::cout << "Passed" << std::endl;
+  return 0;
+}
+// CHECK: Passed

>From dac8aa4273d8da09b434efd1dbdcff2e82819e9a Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Fri, 24 Apr 2026 14:56:05 -0500
Subject: [PATCH 11/24] [OpenMP] Add libomp_debug feature for tests

This adds a libomp_debug feature, for use by tests which only work when
the libomp runtime is built with debug output enabled (KMP_?_DEBUG, etc.).

commit-id:5d2ebd7d

Pull Request: https://github.com/llvm/llvm-project/pull/194056
---
 openmp/runtime/test/CMakeLists.txt  | 2 ++
 openmp/runtime/test/lit.cfg         | 3 +++
 openmp/runtime/test/lit.site.cfg.in | 1 +
 3 files changed, 6 insertions(+)

diff --git a/openmp/runtime/test/CMakeLists.txt b/openmp/runtime/test/CMakeLists.txt
index f34d6a8e80ac3..61d5427274865 100644
--- a/openmp/runtime/test/CMakeLists.txt
+++ b/openmp/runtime/test/CMakeLists.txt
@@ -34,6 +34,8 @@ pythonize_bool(LIBOMP_TASKGRAPH_EXPERIMENTAL)
 pythonize_bool(LIBOMP_HAVE_LIBM)
 pythonize_bool(LIBOMP_HAVE_LIBATOMIC)
 pythonize_bool(OPENMP_TEST_COMPILER_HAS_OMIT_FRAME_POINTER_FLAGS)
+pythonize_bool(DEBUG_BUILD)
+pythonize_bool(RELWITHDEBINFO_BUILD)
 
 add_library(ompt-print-callback INTERFACE)
 target_include_directories(ompt-print-callback INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/ompt)
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index 4885f4e2cfbe3..8015b6beafe0d 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -127,6 +127,9 @@ if config.has_ompt:
 if config.has_omp_taskgraph_experimental:
     config.available_features.add("omp_taskgraph_experimental")
 
+if config.libomp_debug:
+    config.available_features.add("libomp_debug")
+
 if config.operating_system == 'AIX':
     config.available_features.add("aix")
     object_mode = os.environ.get('OBJECT_MODE', '32')
diff --git a/openmp/runtime/test/lit.site.cfg.in b/openmp/runtime/test/lit.site.cfg.in
index a4d7c8d8b205e..8b8b8c910a551 100644
--- a/openmp/runtime/test/lit.site.cfg.in
+++ b/openmp/runtime/test/lit.site.cfg.in
@@ -24,6 +24,7 @@ config.target_arch = "@LIBOMP_ARCH@"
 config.compiler_frontend_variant = "@CMAKE_C_COMPILER_FRONTEND_VARIANT@"
 config.compiler_simulate_id = "@CMAKE_C_SIMULATE_ID@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
+config.libomp_debug = @DEBUG_BUILD@ or @RELWITHDEBINFO_BUILD@
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)

>From 7e8925197d9f8da2a6820b669c3d07c43b56f294 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 25 Mar 2026 16:57:25 -0500
Subject: [PATCH 12/24] [OpenMP] OpenMP 6.0 "taskgraph" support, add new tests

These are new tests intended to cover the functionality present in the
new taskgraph implementation.

commit-id:4eb2eb71

Pull Request: https://github.com/llvm/llvm-project/pull/194057
---
 .../test/taskgraph/taskgraph_deps_1.cpp       |  53 +++++++++
 .../test/taskgraph/taskgraph_deps_10.cpp      |  50 +++++++++
 .../test/taskgraph/taskgraph_deps_11.cpp      |  60 ++++++++++
 .../test/taskgraph/taskgraph_deps_12.cpp      |  55 ++++++++++
 .../test/taskgraph/taskgraph_deps_13.cpp      |  45 ++++++++
 .../test/taskgraph/taskgraph_deps_14.cpp      |  48 ++++++++
 .../test/taskgraph/taskgraph_deps_15.cpp      |  75 +++++++++++++
 .../test/taskgraph/taskgraph_deps_16.cpp      |  55 ++++++++++
 .../test/taskgraph/taskgraph_deps_17.cpp      |  68 ++++++++++++
 .../test/taskgraph/taskgraph_deps_18.cpp      |  46 ++++++++
 .../test/taskgraph/taskgraph_deps_19.cpp      |  51 +++++++++
 .../test/taskgraph/taskgraph_deps_2.cpp       |  58 ++++++++++
 .../test/taskgraph/taskgraph_deps_20.cpp      |  51 +++++++++
 .../test/taskgraph/taskgraph_deps_21.cpp      |  52 +++++++++
 .../test/taskgraph/taskgraph_deps_22.cpp      |  70 ++++++++++++
 .../test/taskgraph/taskgraph_deps_23.cpp      | 103 ++++++++++++++++++
 .../test/taskgraph/taskgraph_deps_24.cpp      |  80 ++++++++++++++
 .../test/taskgraph/taskgraph_deps_25.cpp      |  89 +++++++++++++++
 .../test/taskgraph/taskgraph_deps_26.cpp      |  61 +++++++++++
 .../test/taskgraph/taskgraph_deps_27.cpp      |  63 +++++++++++
 .../test/taskgraph/taskgraph_deps_3.cpp       |  80 ++++++++++++++
 .../test/taskgraph/taskgraph_deps_4.cpp       |  76 +++++++++++++
 .../test/taskgraph/taskgraph_deps_5.cpp       |  63 +++++++++++
 .../test/taskgraph/taskgraph_deps_6.cpp       |  59 ++++++++++
 .../test/taskgraph/taskgraph_deps_7.cpp       |  59 ++++++++++
 .../test/taskgraph/taskgraph_deps_8.cpp       |  39 +++++++
 .../test/taskgraph/taskgraph_deps_9.cpp       |  47 ++++++++
 27 files changed, 1656 insertions(+)
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp

diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp
new file mode 100644
index 0000000000000..5d5cabd7456a4
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp
@@ -0,0 +1,53 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[3]; // never read or written; elements only name dependences
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++) // CHECK lines expect one "Processed" dump, then one "Replay"
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[2])
+          { }
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }
+          #pragma omp task depend(inout: deps[0])
+          { }
+          #pragma omp task depend(inout: deps[1])
+          { }
+          #pragma omp task depend(inout: deps[2])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1], deps[2])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: sequential {
+// CHECK-NEXT:   parallel {
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       parallel {
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp
new file mode 100644
index 0000000000000..6cbb22f5ec164
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp
@@ -0,0 +1,50 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[5];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1], deps[4])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1])
+          { }
+
+          #pragma omp task depend(out: deps[2], deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[3], deps[4])
+          { }
+          #pragma omp task depend(in: deps[2], deps[3])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}} [sets: 0x1c]
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}} [sets: 0x7]
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp
new file mode 100644
index 0000000000000..aa24e23590abe
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp
@@ -0,0 +1,60 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[4];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1])
+          { }
+
+          #pragma omp task depend(out: deps[2], deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
+          { }
+          #pragma omp task depend(in: deps[2], deps[3])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     exclusive {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     exclusive {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp
new file mode 100644
index 0000000000000..3ae9dd485ecf4
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp
@@ -0,0 +1,55 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[2];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(mutexinoutset: deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[1])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   exclusive {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   exclusive {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp
new file mode 100644
index 0000000000000..ee666c584ba31
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp
@@ -0,0 +1,45 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[4];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(mutexinoutset: deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xc]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x3]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x8]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x4]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x2]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x1]
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp
new file mode 100644
index 0000000000000..325442310c5e3
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp
@@ -0,0 +1,48 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[4];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(mutexinoutset: deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1], deps[2])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1], deps[2], deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[1], deps[2], deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xf]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xe]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x7]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xc]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x3]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x8]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x1]
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp
new file mode 100644
index 0000000000000..c9e8bfc6f6bc2
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp
@@ -0,0 +1,75 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[4];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[1], deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[1], deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3], deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3], deps[1], deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3], deps[2])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3], deps[2], deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3], deps[2], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[3], deps[2], deps[1], deps[0])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xf]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xe]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xd]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xb]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x7]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xc]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0xa]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x9]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x6]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x5]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x3]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x8]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x4]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x2]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x1]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp
new file mode 100644
index 0000000000000..8daeb7a50c91c
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp
@@ -0,0 +1,55 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[8];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(inout: deps[0])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[4], deps[7])
+          { }
+          #pragma omp task depend(inout: deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[4], deps[7])
+          { }
+          #pragma omp task depend(inout: deps[2])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[5], deps[6])
+          { }
+          #pragma omp task depend(inout: deps[3])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[5], deps[6])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   exclusive {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   exclusive {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp
new file mode 100644
index 0000000000000..fda89fd39a9a0
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp
@@ -0,0 +1,68 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+int main()
+{
+  int deps[4]; // never read or written; elements only name dependences
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++) // two passes; the CHECK lines expect the task sequence twice
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          {
+            fprintf(stderr, "task 0\n");
+          }
+          #pragma omp task depend(out: deps[2], deps[3])
+          {
+            fprintf(stderr, "task 1\n");
+          }
+          #pragma omp task depend(inout: deps[0])
+          {
+            fprintf(stderr, "task 2\n");
+          }
+          #pragma omp task depend(inout: deps[1])
+          {
+            fprintf(stderr, "task 3\n");
+          }
+          #pragma omp task depend(inout: deps[2])
+          {
+            fprintf(stderr, "task 4\n");
+          }
+          #pragma omp task depend(inout: deps[3])
+          {
+            fprintf(stderr, "task 5\n");
+          }
+          #pragma omp task depend(in: deps[0], deps[1], deps[2], deps[3])
+          {
+            fprintf(stderr, "task 6\n"); // in-dependence on all four elements; CHECK expects it last
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK-DAG: task 0
+// CHECK-DAG: task 2
+// CHECK-DAG: task 3
+// CHECK-DAG: task 1
+// CHECK-DAG: task 4
+// CHECK-DAG: task 5
+// CHECK: task 6
+
+// CHECK-DAG: task 0
+// CHECK-DAG: task 2
+// CHECK-DAG: task 3
+// CHECK-DAG: task 1
+// CHECK-DAG: task 4
+// CHECK-DAG: task 5
+// CHECK: task 6
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp
new file mode 100644
index 0000000000000..ed0cca72fa5a6
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp
@@ -0,0 +1,46 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[2];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }
+          #pragma omp taskloop num_tasks(strict: 2)
+          {
+            for (int j = 0; j < 20; j++) { }
+          }
+          #pragma omp task depend(in: deps[0], deps[1])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: sequential {
+// CHECK-NEXT:   parallel {
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       parallel {
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       }
+// CHECK-NEXT:       wait: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp
new file mode 100644
index 0000000000000..b18f8c844e8e1
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp
@@ -0,0 +1,51 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+int main()
+{
+  int deps[3]; // never read or written; elements only name dependences
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 10; i++) // ten passes; the CHECK lines below cover two task sequences
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0])
+          {
+            fprintf(stderr, "task 0\n");
+          }
+          #pragma omp task depend(out: deps[1])
+          {
+            fprintf(stderr, "task 1\n");
+          }
+          #pragma omp task depend(out: deps[2])
+          {
+            fprintf(stderr, "task 2\n");
+          }
+          #pragma omp taskwait depend(in: deps[0], deps[1], deps[2])
+          #pragma omp task depend(in: deps[0], deps[1], deps[2])
+          {
+            fprintf(stderr, "task 3\n"); // in-dependence on all three elements; CHECK expects it after tasks 0-2
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK-DAG: task 0
+// CHECK-DAG: task 1
+// CHECK-DAG: task 2
+// CHECK: task 3
+
+// CHECK-DAG: task 0
+// CHECK-DAG: task 1
+// CHECK-DAG: task 2
+// CHECK: task 3
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp
new file mode 100644
index 0000000000000..a7c678cdb3bdf
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp
@@ -0,0 +1,58 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()  // Creates seven empty tasks that differ only in their depend clauses, inside a taskgraph region.
+{
+  int deps[4];  // named only in depend clauses; never read or written as data
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)  // the taskgraph region below is encountered twice
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }  // writer for the pair deps[0], deps[1]
+          #pragma omp task depend(out: deps[2], deps[3])
+          { }  // writer for the pair deps[2], deps[3]
+          #pragma omp task depend(inout: deps[0])
+          { }  // this and the next three tasks each name exactly one element
+          #pragma omp task depend(inout: deps[1])
+          { }
+          #pragma omp task depend(inout: deps[2])
+          { }
+          #pragma omp task depend(inout: deps[3])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1], deps[2], deps[3])
+          { }  // single reader of all four elements
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: sequential {
+// CHECK-NEXT:   parallel {
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       parallel {
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       parallel {
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp
new file mode 100644
index 0000000000000..549b6d6138a52
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp
@@ -0,0 +1,51 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+int main()  // Creates three writer tasks, an inoutset taskwait on two of their items, and one reader task, ten times over.
+{
+  int deps[3];  // named only in depend clauses; never read or written as data
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 10; i++)  // the taskgraph region below is encountered ten times
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0])
+          {
+            fprintf(stderr, "task 0\n");
+          }
+          #pragma omp task depend(out: deps[1])
+          {
+            fprintf(stderr, "task 1\n");
+          }
+          #pragma omp task depend(out: deps[2])
+          {
+            fprintf(stderr, "task 2\n");
+          }
+          #pragma omp taskwait depend(inoutset: deps[0], deps[1])
+          #pragma omp task depend(in: deps[0], deps[1], deps[2])
+          {  // the taskwait above names only deps[0] and deps[1]; this task also names deps[2]
+            fprintf(stderr, "task 3\n");
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK-DAG: task 0
+// CHECK-DAG: task 1
+// CHECK-DAG: task 2
+// CHECK: task 3
+
+// CHECK-DAG: task 0
+// CHECK-DAG: task 1
+// CHECK-DAG: task 2
+// CHECK: task 3
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp
new file mode 100644
index 0000000000000..0b86e677a13eb
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp
@@ -0,0 +1,52 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+int main()  // Prints a serially computed sum of 0..99, then the same sum computed by a taskloop reduction inside a taskgraph, ten times.
+{
+  int arr[100];
+
+  int res = 0;  // serial reference sum
+  for (int i = 0; i < 100; i++) {
+    arr[i] = i;
+    res += i;
+  }
+  printf("base result: %d\n", res);
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 10; i++)  // the taskgraph region below is encountered ten times
+      {
+        int res = 0;  // shadows the outer res; starts from zero on every iteration
+        #pragma omp taskgraph
+        {
+          #pragma omp taskloop reduction(+: res) num_tasks(10)
+          {  // braces wrap the single loop the taskloop directive applies to
+            for (int j = 0; j < 100; j++) {
+              res += arr[j];
+            }
+          }
+        }
+        printf("reduction result: %d\n", res);  // printed after the taskgraph region, once per iteration
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      base result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
+// CHECK-NEXT: reduction result: 4950
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp
new file mode 100644
index 0000000000000..66b1ec8b8093c
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp
@@ -0,0 +1,70 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+void foo() {  // Creates one task marked replayable(1) whose body prints a fixed message to stderr.
+#pragma omp task replayable(1)
+  {
+    fprintf(stderr, "task outside lexical taskgraph\n");
+  }
+}
+
+int main()  // Runs a taskloop reduction plus a call to foo() inside a taskgraph region ten times, printing each reduction result.
+{
+  int arr[100];
+
+  int res = 0;  // serial reference sum
+  for (int i = 0; i < 100; i++) {
+    arr[i] = i;
+    res += i;
+  }
+  fprintf(stderr, "base result: %d\n", res);
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 10; i++)  // the taskgraph region below is encountered ten times
+      {
+        int res = 0;  // shadows the outer res; starts from zero on every iteration
+        #pragma omp taskgraph
+        {
+          #pragma omp taskloop reduction(+: res) num_tasks(10)
+          {  // braces wrap the single loop the taskloop directive applies to
+            for (int j = 0; j < 100; j++) {
+              res += arr[j];
+            }
+          }
+          foo();  // foo's task construct is not lexically inside this taskgraph region
+        }
+        fprintf(stderr, "reduction result: %d\n", res);  // printed after the taskgraph region
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK: base result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: task outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp
new file mode 100644
index 0000000000000..d5ed9cba60cda
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp
@@ -0,0 +1,103 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+void foo() {  // Prints a marker line, then runs a four-iteration taskloop marked replayable with num_tasks(4).
+  fprintf(stderr, "called function foo\n");  // emitted by the caller itself, before the taskloop
+#pragma omp taskloop replayable num_tasks(4)
+  {  // braces wrap the single loop the taskloop directive applies to
+    for (int i = 0; i < 4; i++)
+      fprintf(stderr, "taskloop iter %d outside lexical taskgraph\n", i);
+  }
+}
+
+int main()  // Runs a taskloop reduction plus a call to foo() inside a taskgraph region ten times, printing each reduction result.
+{
+  int arr[100];
+
+  int res = 0;  // serial reference sum
+  for (int i = 0; i < 100; i++) {
+    arr[i] = i;
+    res += i;
+  }
+  fprintf(stderr, "base result: %d\n", res);
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 10; i++)  // the taskgraph region below is encountered ten times
+      {
+        int res = 0;  // shadows the outer res; starts from zero on every iteration
+        #pragma omp taskgraph
+        {
+          #pragma omp taskloop reduction(+: res) num_tasks(10)
+          {  // braces wrap the single loop the taskloop directive applies to
+            for (int j = 0; j < 100; j++) {
+              res += arr[j];
+            }
+          }
+          foo();  // foo's taskloop construct is not lexically inside this taskgraph region
+        }
+        fprintf(stderr, "reduction result: %d\n", res);  // printed after the taskgraph region
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK: base result: 4950
+// CHECK-NEXT: called function foo
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
+// CHECK-DAG: taskloop iter 0 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 1 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 2 outside lexical taskgraph
+// CHECK-DAG: taskloop iter 3 outside lexical taskgraph
+// CHECK-DAG: reduction result: 4950
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp
new file mode 100644
index 0000000000000..91053ba4302bd
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp
@@ -0,0 +1,80 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+#include <cassert>
+
+int global_dep;
+
+void foo() {  // Issues a taskwait marked replayable(1) with depend(in) on global_dep; does nothing else.
+#pragma omp taskwait replayable(1) depend(in: global_dep)
+}
+
+int main()  // Runs a taskloop reduction, an empty writer task on global_dep and a call to foo() inside a taskgraph region, ten times.
+{
+  int arr[100];
+
+  int res = 0;  // serial reference sum
+  for (int i = 0; i < 100; i++) {
+    arr[i] = i;
+    res += i;
+  }
+
+  assert(res == 4950);  // results are asserted rather than printed in this test
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 10; i++)  // the taskgraph region below is encountered ten times
+      {
+        int res = 0;  // shadows the outer res; starts from zero on every iteration
+        #pragma omp taskgraph
+        {
+          #pragma omp taskloop reduction(+: res) num_tasks(10)
+          {  // braces wrap the single loop the taskloop directive applies to
+            for (int j = 0; j < 100; j++) {
+              res += arr[j];
+            }
+          }
+          #pragma omp task depend(out: global_dep)
+          { }  // empty writer of global_dep, the item foo's taskwait names as depend(in)
+          foo();  // foo's taskwait construct is not lexically inside this taskgraph region
+        }
+        assert(res == 4950);  // same value as the serial sum asserted above
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: sequential {
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     parallel {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     wait: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   wait: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp
new file mode 100644
index 0000000000000..893f51bc1e0fb
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp
@@ -0,0 +1,89 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+int global_dep;
+
+void foo() {  // Prints a marker line, then creates a replayable(1) task with depend(in) on global_dep that prints a second line.
+  fprintf(stderr, "called function foo\n");  // emitted by the caller itself, before the task is created
+#pragma omp task replayable(1) depend(in: global_dep)
+  {
+    fprintf(stderr, "out-of-line task created from within taskloop\n");
+  }
+}
+
+int main()  // Runs a four-iteration taskloop reduction whose loop body also calls foo(), inside a taskgraph region, four times.
+{
+  int arr[100];  // only elements 0..3 are initialised and read below
+
+  int res = 0;  // serial reference sum
+  for (int i = 0; i < 4; i++) {
+    arr[i] = i;
+    res += i;
+  }
+  fprintf(stderr, "base result: %d\n", res);
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 4; i++)  // the taskgraph region below is encountered four times
+      {
+        int res = 0;  // shadows the outer res; starts from zero on every iteration
+        #pragma omp taskgraph
+        {
+          #pragma omp taskloop reduction(+: res) num_tasks(4)
+          {  // braces wrap the single loop the taskloop directive applies to
+            for (int j = 0; j < 4; j++) {
+              res += arr[j];
+              foo();  // called from inside the taskloop body, once per loop iteration
+            }
+          }
+        }
+        fprintf(stderr, "reduction result: %d\n", res);  // printed after the taskgraph region
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK: base result: 6
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: reduction result: 6
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: reduction result: 6
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: reduction result: 6
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: called function foo
+// CHECK-DAG: out-of-line task created from within taskloop
+// CHECK-DAG: reduction result: 6
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp
new file mode 100644
index 0000000000000..d98b4b60c485e
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp
@@ -0,0 +1,61 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+int main()  // Runs two independent taskloop reductions over two arrays inside one taskgraph region, ten times, printing both sums.
+{
+  int arr[100];  // only elements 0..9 are initialised and read below
+  int arr2[100];  // only elements 0..9 are initialised and read below
+
+  int res = 0, res2 = 0;  // serial reference sums
+  for (int i = 0; i < 10; i++) {
+    arr[i] = i;
+    arr2[i] = 3 + i * 2;
+    res += i;
+    res2 += 3 + i * 2;
+  }
+  fprintf(stderr, "base results: %d, %d\n", res, res2);
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 10; i++)  // the taskgraph region below is encountered ten times
+      {
+        int res = 0, res2 = 0;  // shadow the outer sums; start from zero on every iteration
+        #pragma omp taskgraph
+        {
+          #pragma omp taskloop reduction(+: res) num_tasks(10)
+          {  // first reduction: sums arr into res
+            for (int j = 0; j < 10; j++) {
+              res += arr[j];
+            }
+          }
+          #pragma omp taskloop reduction(+: res2) num_tasks(10)
+          {  // second reduction: sums arr2 into res2; does not read res
+            for (int j = 0; j < 10; j++) {
+              res2 += arr2[j];
+            }
+          }
+        }
+        fprintf(stderr, "reduction results: %d, %d\n", res, res2);  // printed after the taskgraph region
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK: base results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
+// CHECK-NEXT: reduction results: 45, 120
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp
new file mode 100644
index 0000000000000..fd2323c21fe5f
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp
@@ -0,0 +1,63 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <cstdio>
+
+int main()  // Runs two taskloop reductions inside one taskgraph region, the second reading the first one's result, ten times.
+{
+  int arr[100];  // only elements 0..9 are initialised and read below
+  int arr2[100];  // only elements 0..9 are initialised and read below
+
+  int res = 0, res2 = 0;  // serial reference values
+  for (int i = 0; i < 10; i++) {
+    arr[i] = i;
+    res += i;
+  }
+  for (int i = 0; i < 10; i++) {  // separate loop: res already holds its final sum here
+    arr2[i] = 3 + i * 2;
+    res2 += res * (3 + i * 2);
+  }
+  fprintf(stderr, "base results: %d, %d\n", res, res2);
+
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 10; i++)  // the taskgraph region below is encountered ten times
+      {
+        int res = 0, res2 = 0;  // shadow the outer values; start from zero on every iteration
+        #pragma omp taskgraph
+        {
+          #pragma omp taskloop reduction(+: res) num_tasks(10)
+          {  // first reduction: sums arr into res
+            for (int j = 0; j < 10; j++) {
+              res += arr[j];
+            }
+          }
+          #pragma omp taskloop reduction(+: res2) num_tasks(10)
+          {  // second reduction: reads res, the reduction target of the taskloop above
+            for (int j = 0; j < 10; j++) {
+              res2 += res * arr2[j];
+            }
+          }
+        }
+        fprintf(stderr, "reduction results: %d, %d\n", res, res2);  // printed after the taskgraph region
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK: base results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
+// CHECK-NEXT: reduction results: 45, 5400
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp
new file mode 100644
index 0000000000000..218c28d402631
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp
@@ -0,0 +1,80 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()  // Creates nine empty tasks that differ only in their depend clauses, inside a taskgraph region.
+{
+  int deps[6];  // named only in depend clauses; deps[4] is not referenced at all
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)  // the taskgraph region below is encountered twice
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }  // writer for the pair deps[0], deps[1]
+          #pragma omp task depend(out: deps[2], deps[3])
+          { }  // writer for the pair deps[2], deps[3]
+          #pragma omp task depend(inout: deps[0])
+          { }  // this and the next three tasks each name exactly one element
+          #pragma omp task depend(inout: deps[1])
+          { }
+          #pragma omp task depend(inout: deps[2])
+          { }
+          #pragma omp task depend(inout: deps[3])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1], deps[2], deps[3])
+          { }  // reader of all four elements
+          #pragma omp task depend(in: deps[1], deps[2]) depend(out: deps[5])
+          { }  // reads one element from each pair and is the only writer of deps[5]
+          #pragma omp task depend(in: deps[5])
+          { }  // only reader of deps[5]
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     parallel {
+// CHECK-NEXT:       sequential {
+// CHECK-NEXT:         node: 0x[[#%x,NODE1:]] (* 2)
+// CHECK-NEXT:         node: 0x[[#%x,NODE2:]] (* 2)
+// CHECK-NEXT:       }
+// CHECK-NEXT:       sequential {
+// CHECK-NEXT:         node: 0x[[#%x,NODE3:]] (* 2)
+// CHECK-NEXT:         node: 0x[[#%x,NODE4:]] (* 2)
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     parallel {
+// CHECK-NEXT:       sequential {
+// CHECK-NEXT:         node: 0x[[#NODE3]] (* 2)
+// CHECK-NEXT:         parallel {
+// CHECK-NEXT:           node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:           node: 0x[[#NODE4]] (* 2)
+// CHECK-NEXT:         }
+// CHECK-NEXT:       }
+// CHECK-NEXT:       sequential {
+// CHECK-NEXT:         node: 0x[[#NODE1]] (* 2)
+// CHECK-NEXT:         parallel {
+// CHECK-NEXT:           node: 0x[[#NODE2]] (* 2)
+// CHECK-NEXT:           node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         }
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp
new file mode 100644
index 0000000000000..60ecdcdc9bd4f
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp
@@ -0,0 +1,76 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()  // Creates nine empty tasks that differ only in their depend clauses, ending in three readers with overlapping item lists.
+{
+  int deps[4];  // named only in depend clauses; never read or written as data
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)  // the taskgraph region below is encountered twice
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }  // writer for the pair deps[0], deps[1]
+          #pragma omp task depend(out: deps[2], deps[3])
+          { }  // writer for the pair deps[2], deps[3]
+          #pragma omp task depend(inout: deps[0])
+          { }  // this and the next three tasks each name exactly one element
+          #pragma omp task depend(inout: deps[1])
+          { }
+          #pragma omp task depend(inout: deps[2])
+          { }
+          #pragma omp task depend(inout: deps[3])
+          { }
+          #pragma omp task depend(in: deps[0], deps[2], deps[3])
+          { }  // reader of three elements, drawn from both pairs
+          #pragma omp task depend(in: deps[0], deps[1])
+          { }  // reader of the first pair only
+          #pragma omp task depend(in: deps[3])
+          { }  // reader of a single element
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x[[#%x,NODE1:]] (* 2)
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       parallel {
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         node: 0x[[#%x,NODE2:]] (* 2)
+// CHECK-NEXT:       }
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     parallel {
+// CHECK-NEXT:       sequential {
+// CHECK-NEXT:         node: 0x[[#%x,NODE3:]] (* 2)
+// CHECK-NEXT:         parallel {
+// CHECK-NEXT:           node: 0x[[#%x,NODE4:]] (* 2)
+// CHECK-NEXT:           node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         }
+// CHECK-NEXT:       }
+// CHECK-NEXT:       sequential {
+// CHECK-NEXT:         node: 0x[[#NODE1]] (* 2)
+// CHECK-NEXT:         node: 0x[[#NODE2]] (* 2)
+// CHECK-NEXT:       }
+// CHECK-NEXT:     }
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x[[#NODE3]] (* 2)
+// CHECK-NEXT:     node: 0x[[#NODE4]] (* 2)
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp
new file mode 100644
index 0000000000000..aee9cc126cac2
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp
@@ -0,0 +1,63 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()  // Creates eight empty tasks: two writers, four inoutset tasks and two readers, inside a taskgraph region.
+{
+  int deps[4];  // named only in depend clauses; never read or written as data
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)  // the taskgraph region below is encountered twice
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[2])
+          { }  // writer for the even-indexed elements
+          #pragma omp task depend(out: deps[1], deps[3])
+          { }  // writer for the odd-indexed elements
+          #pragma omp task depend(inoutset: deps[0], deps[1])
+          { }  // this and the next task share the same inoutset list, deps[0] and deps[1]
+          #pragma omp task depend(inoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(inoutset: deps[2], deps[3])
+          { }  // this and the next task share the same inoutset list, deps[2] and deps[3]
+          #pragma omp task depend(inoutset: deps[2], deps[3])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1])
+          { }  // reader of the items named by the first inoutset pair
+          #pragma omp task depend(in: deps[2], deps[3])
+          { }  // reader of the items named by the second inoutset pair
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: sequential {
+// CHECK-NEXT:   parallel {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   parallel {
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       parallel {
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       }
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     sequential {
+// CHECK-NEXT:       parallel {
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:         node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       }
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp
new file mode 100644
index 0000000000000..e46d7346db660
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp
@@ -0,0 +1,59 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()  // Creates eight empty tasks: two writers, four inoutset tasks and two identical readers, inside a taskgraph region.
+{
+  int deps[4];  // named only in depend clauses; never read or written as data
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)  // the taskgraph region below is encountered twice
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[1], deps[3])
+          { }  // writer for the odd-indexed elements
+          #pragma omp task depend(out: deps[0], deps[2])
+          { }  // writer for the even-indexed elements
+          #pragma omp task depend(inoutset: deps[0], deps[1])
+          { }  // this and the next task share the same inoutset list, deps[0] and deps[1]
+          #pragma omp task depend(inoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(inoutset: deps[2], deps[3])
+          { }  // this and the next task share the same inoutset list, deps[2] and deps[3]
+          #pragma omp task depend(inoutset: deps[2], deps[3])
+          { }
+          #pragma omp task depend(in: deps[0], deps[2])
+          { }  // this and the next task both read deps[0] and deps[2], one element from each inoutset list
+          #pragma omp task depend(in: deps[0], deps[2])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: sequential {
+// CHECK-NEXT:   parallel {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     parallel {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     parallel {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp
new file mode 100644
index 0000000000000..509b465b2cfe3
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp
@@ -0,0 +1,59 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()  // Creates eight empty tasks: two writers, four mutexinoutset tasks and two readers, inside a taskgraph region.
+{
+  int deps[4];  // named only in depend clauses; never read or written as data
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)  // the taskgraph region below is encountered twice
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }  // writer for the pair deps[0], deps[1]
+          #pragma omp task depend(out: deps[2], deps[3])
+          { }  // writer for the pair deps[2], deps[3]
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }  // this and the next task share the same mutexinoutset list, deps[0] and deps[1]
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
+          { }  // this and the next task share the same mutexinoutset list, deps[2] and deps[3]
+          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1])
+          { }  // reader of the first pair
+          #pragma omp task depend(in: deps[2], deps[3])
+          { }  // reader of the second pair
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: parallel {
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     exclusive {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     exclusive {
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:       node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     }
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp
new file mode 100644
index 0000000000000..859ae569e12e3
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp
@@ -0,0 +1,39 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[2];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// FIXME: This isn't perfect -- we don't really need to keep the mutexes in
+// this case.
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: sequential {
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x3]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp
new file mode 100644
index 0000000000000..ade81d28ed9e4
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp
@@ -0,0 +1,47 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+
+int main()
+{
+  int deps[3];
+  #pragma omp parallel
+  {
+    #pragma omp single
+    {
+      for (int i = 0; i < 2; i++)
+      {
+        #pragma omp taskgraph
+        {
+          #pragma omp task depend(out: deps[0], deps[1])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(in: deps[1]) depend(out: deps[2])
+          { }
+          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
+          { }
+          #pragma omp task depend(in: deps[0], deps[1], deps[2])
+          { }
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+// FIXME: This isn't perfect -- we don't really need to keep the mutexes in
+// this case.
+
+// CHECK:      Processed taskgraph 0x[[#%x,GRAPHPTR:]] (graph_id 0):
+// CHECK-NEXT: sequential {
+// CHECK-NEXT:   sequential {
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}} [sets: 0x3]
+// CHECK-NEXT:     node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT:   }
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}} [sets: 0x3]
+// CHECK-NEXT:   node: 0x{{[[:xdigit:]]+}}
+// CHECK-NEXT: }
+// CHECK-NEXT: Replay taskgraph 0x[[#GRAPHPTR]] from task 0x{{[[:xdigit:]]+}}

>From 3cca2b4ba24cf88250ccb4267dfcb2f6916e5403 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 29 Apr 2026 07:45:09 -0500
Subject: [PATCH 13/24] [OpenMP] Runtime support for graph_id and graph_reset
 clauses (for taskgraph)

This patch contains runtime support for the graph_id and graph_reset
clauses.  In particular, it adds support for freeing recorded taskgraphs
on reset, plus initial (planned) support for keeping previously recorded
taskgraphs live until their execution count reaches zero, which will be
needed once concurrent playback is implemented.

A new structure is introduced here, kmp_taskgraph_header_t: this is
now what the per-construct taskgraph handle (one per lexical taskgraph
directive) passed in from the user program points to during recording.
These structures are never freed once allocated.

Full disclosure: this patch is mostly handwritten, but AI helped a bit
with debugging, hence the Assisted-By tag below.

Assisted-By: Codex/gpt-5.3

Pull Request: https://github.com/llvm/llvm-project/pull/195074
---
 openmp/runtime/src/kmp.h           |   6 +
 openmp/runtime/src/kmp_tasking.cpp | 204 +++++++++++++++++++++++------
 2 files changed, 167 insertions(+), 43 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 8a735b87619f4..ceb485cd7fffc 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2785,6 +2785,12 @@ typedef struct kmp_taskgraph_record {
   struct kmp_taskgraph_record *next = nullptr;
 } kmp_taskgraph_record_t;
 
+typedef struct kmp_taskgraph_header {
+  kmp_taskgraph_record_t *first;
+  kmp_taskgraph_record_t *expiring;
+  kmp_lock_t header_lock;
+} kmp_taskgraph_header_t;
+
 typedef struct kmp_taskgraph_exec_descr {
   std::atomic<kmp_int32> npredecessors;
   std::atomic<kmp_int32> nblocks;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 0a5010da236b1..82fde157fe603 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -5740,28 +5740,106 @@ bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
 
 #if OMP_TASKGRAPH_EXPERIMENTAL
 
+static void __kmp_taskgraph_reset(kmp_taskgraph_record_t *rec, kmp_int32 gtid,
+                                  kmp_int32 graph_id) {
+  rec->status = KMP_TDG_RECORDING;
+  rec->gtid = gtid;
+  rec->graph_id = graph_id;
+  rec->record_map = nullptr;
+  rec->alloc_root = nullptr;
+  rec->recycled_deps = nullptr;
+  rec->num_tasks = 0;
+  rec->nodes_allocated = 0;
+  rec->num_mutexes = 0;
+  rec->exec_descrs = nullptr;
+  rec->exec_descr_size = 0;
+  rec->next = nullptr;
+}
+
 static kmp_taskgraph_record_t *__kmp_taskgraph_alloc(kmp_int32 gtid,
                                                      kmp_int32 graph_id) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgraph_record_t *new_rec =
       (kmp_taskgraph_record_t *)__kmp_fast_allocate(
           thread, sizeof(kmp_taskgraph_record_t));
-  new_rec->status = KMP_TDG_RECORDING;
-  new_rec->gtid = gtid;
-  new_rec->graph_id = graph_id;
   __kmp_init_lock(&new_rec->map_lock);
-  new_rec->record_map = nullptr;
-  new_rec->alloc_root = nullptr;
-  new_rec->recycled_deps = nullptr;
-  new_rec->num_tasks = 0;
-  new_rec->nodes_allocated = 0;
-  new_rec->num_mutexes = 0;
-  new_rec->exec_descrs = nullptr;
-  new_rec->exec_descr_size = 0;
-  new_rec->next = nullptr;
+  __kmp_taskgraph_reset(new_rec, gtid, graph_id);
   return new_rec;
 }
 
+static void
+__kmp_taskgraph_free_region_metadata(kmp_info_t *thread,
+                                     kmp_taskgraph_region_t *region) {
+  if (region->reduce_input) {
+    __kmp_fast_free(thread, region->reduce_input->reduce_data);
+    __kmp_fast_free(thread, region->reduce_input);
+  }
+  if (region->mutexset) {
+    __kmp_fast_free(thread, region->mutexset);
+  }
+  switch (region->type) {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+  case TASKGRAPH_REGION_WAIT:
+  case TASKGRAPH_REGION_NODE:
+    break;
+  case TASKGRAPH_REGION_PARALLEL:
+  case TASKGRAPH_REGION_SEQUENTIAL:
+  case TASKGRAPH_REGION_EXCLUSIVE: {
+    for (int k = 0; k < region->inner.num_children; k++) {
+      __kmp_taskgraph_free_region_metadata(thread, region->inner.children[k]);
+    }
+    break;
+  }
+  default:
+    assert(false && "unreachable");
+  }
+}
+
+static void __kmp_taskgraph_free(kmp_int32 gtid, kmp_taskgraph_record_t *rec,
+                                 bool keep_rec = false) {
+  kmp_info_t *thread = __kmp_threads[gtid];
+
+  __kmp_taskgraph_free_region_metadata(thread, rec->root);
+
+  for (size_t task = 0; task < rec->num_tasks; task++) {
+    kmp_taskdata *taskdata = KMP_TASK_TO_TASKDATA(rec->record_map[task].task);
+    // Setting this here keeps an assertion in __kmp_free_task happy: the
+    // clone may never have been replayed, in which case 'complete' will be
+    // zero here, as initialized.
+    taskdata->td_flags.complete = 1;
+    // We never decrement td_allocated_child_tasks for taskgraph tasks.  This
+    // keeps another assertion happy in __kmp_free_task.
+    KMP_ATOMIC_ST_RLX(&taskdata->td_allocated_child_tasks, 0);
+    __kmp_free_task(gtid, taskdata, thread);
+  }
+  __kmp_thread_free(thread, rec->record_map);
+
+  kmp_taskgraph_region_t *region = rec->alloc_root;
+  while (region) {
+    kmp_taskgraph_region_t *next_region = region->alloc_chain;
+    __kmp_fast_free(thread, region);
+    region = next_region;
+  }
+
+  if (rec->exec_descrs)
+    __kmp_thread_free(thread, rec->exec_descrs);
+
+  if (!keep_rec) {
+    __kmp_destroy_lock(&rec->map_lock);
+    __kmp_fast_free(thread, rec);
+  }
+}
+
+static kmp_taskgraph_header_t *__kmp_taskgraph_header_alloc(kmp_int32 gtid) {
+  kmp_taskgraph_header_t *new_hdr =
+      (kmp_taskgraph_header_t *)__kmp_allocate(sizeof(kmp_taskgraph_header_t));
+  new_hdr->first = nullptr;
+  new_hdr->expiring = nullptr;
+  __kmp_init_lock(&new_hdr->header_lock);
+  return new_hdr;
+}
+
 // Clone a (new) task that has had its private variables and shared variables
 // initialised already.
 static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
@@ -5784,9 +5862,36 @@ static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
   KMP_MEMCPY(copy_td, taskdata, shareds_offset + sizeof_shareds);
   // Tasks cloned for a taskgraph always have this field set.
   copy_td->owning_taskgraph = taskgraph;
+  KMP_ATOMIC_ST_RLX(&copy_td->td_incomplete_child_tasks, 0);
   return KMP_TASKDATA_TO_TASK(copy_td);
 }
 
+// Go through list of taskgraph records and free any that are no longer being
+// executed.  For now, this just frees everything in the list immediately: when
+// multiple concurrent playbacks are implemented, it should only free records
+// with zero usage counts.  On return, *expiring_p is empty (nullptr).
+// The first record's contents are freed but its storage is kept and returned
+// for reuse by a subsequent recording; nullptr is returned for an empty list.
+
+static kmp_taskgraph_record_t *
+__kmp_expire_taskgraph_records(kmp_int32 gtid,
+                               kmp_taskgraph_record_t **expiring_p) {
+  kmp_taskgraph_record_t *record = nullptr;
+
+  while (*expiring_p) {
+    kmp_taskgraph_record_t *expiring = *expiring_p;
+    // Unlink first: 'expiring' itself is deallocated in the else branch.
+    *expiring_p = expiring->next;
+    if (!record) {
+      record = expiring;
+      __kmp_taskgraph_free(gtid, record, /*keep_rec=*/true);
+    } else
+      __kmp_taskgraph_free(gtid, expiring, /*keep_rec=*/false);
+  }
+
+  return record;
+}
+
 // __kmpc_taskgraph: record or replay taskgraph
 // loc_ref:     Location of TDG, not used yet
 // gtid:        Global Thread ID of the encountering thread
@@ -5802,8 +5907,9 @@ void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
                       std::atomic<void *> *tdg_handle, kmp_uint32 graph_id,
                       kmp_int32 graph_reset, kmp_int32 nogroup,
                       void (*entry)(void *), void *args) {
-  kmp_taskgraph_record_t *record =
-      (kmp_taskgraph_record_t *)KMP_ATOMIC_LD_ACQ(tdg_handle);
+  kmp_taskgraph_header_t *header =
+      (kmp_taskgraph_header_t *)KMP_ATOMIC_LD_ACQ(tdg_handle);
+  kmp_taskgraph_record_t *record = nullptr, **record_p = nullptr;
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgroup_t *taskgroup;
 
@@ -5811,39 +5917,51 @@ void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
 
   taskgroup = thread->th.th_current_task->td_taskgroup;
 
-  // FIXME: Implement graph_id and graph_reset functionality.  For graph_id, we
-  // will form a singly-linked list of task records chained through their
-  // "next" pointers (per taskgraph construct handle).  Thread safety and
-  // locking need careful consideration.  We could use a "list header" node
-  // consisting of a lock and a pointer to
-  // the list proper, perhaps.  Ideally we'd want to avoid locking/unlocking in
-  // the common case (replay).
-
-  if (!record) {
-    record = __kmp_taskgraph_alloc(gtid, graph_id);
-    // Another thread may have allocated the taskgraph already.  Check that
-    // here.
-    kmp_taskgraph_record_t *other =
-        (kmp_taskgraph_record_t *)KMP_COMPARE_AND_STORE_RET64(tdg_handle,
-                                                              nullptr, record);
+  if (!header) {
+    header = __kmp_taskgraph_header_alloc(gtid);
+    // Another thread may have allocated the header at the same time.  Grab
+    // their copy if so and forget ours.
+    kmp_taskgraph_header_t *other =
+        (kmp_taskgraph_header_t *)KMP_COMPARE_AND_STORE_RET64(tdg_handle,
+                                                              nullptr, header);
     if (other != nullptr) {
-      __kmp_fast_free(thread, record);
-      record = other;
-      // Should we stall here until the other thread has finished recording the
-      // taskgraph?  That might be safer.  Otherwise multiple threads will add
-      // tasks to the taskgraph simultaneously, which is unlikely to be what
-      // the user wants.  Unclear what to do here.  FIXME.
-    } else {
-      // We record 'nogroup' here.  We always create a group for recording the
-      // taskgraph, but we could avoid doing so for replay.  That's not done
-      // yet though.
-      record->nogroup_taskgroup = nogroup;
-      // Store our taskgraph record into the taskgraph directive's implicit
-      // taskgroup.
-      KMP_ATOMIC_ST_REL(&taskgroup->taskgraph.recording, record);
+      __kmp_free(header);
+      header = other;
     }
   }
 
+  __kmp_acquire_lock(&header->header_lock, gtid);
+  for (record_p = &header->first; *record_p; record_p = &((*record_p)->next))
+    if ((*record_p)->graph_id == graph_id)
+      break;
+  record = *record_p;
+  kmp_taskgraph_record_t *reuse_record = nullptr;
+  if (record && graph_reset) {
+    // Move the existing record to the header's expiring list
+    *record_p = record->next;
+    record->next = header->expiring;
+    header->expiring = record;
+    reuse_record = __kmp_expire_taskgraph_records(gtid, &header->expiring);
+    record = nullptr;
+  }
+  if (!record) {
+    if (reuse_record) {
+      record = reuse_record;
+      __kmp_taskgraph_reset(record, gtid, graph_id);
+    } else
+      record = __kmp_taskgraph_alloc(gtid, graph_id);
+    // We record 'nogroup' here.  We always create a group for recording the
+    // taskgraph, but we could avoid doing so for replay.  That's not done
+    // yet though.
+    record->nogroup_taskgroup = nogroup;
+    record->next = header->first;
+    header->first = record;
+    // Store our taskgraph record into the taskgraph directive's implicit
+    // taskgroup.
+    KMP_ATOMIC_ST_REL(&taskgroup->taskgraph.recording, record);
+  }
+  __kmp_release_lock(&header->header_lock, gtid);
+
   kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&record->status);
   if (status == KMP_TDG_RECORDING)
     entry(args);

>From 1ad63c5686be689c6621b2ccfa62134b7571f04d Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Thu, 30 Apr 2026 06:34:33 -0500
Subject: [PATCH 14/24] [OpenMP] New runtime tests for graph_id/graph_reset
 clauses (taskgraph)

This patch contains several new tests for graph_id/graph_reset functionality.

Assisted-By: Codex/gpt-5.3 and Claude Opus 4.7

Pull Request: https://github.com/llvm/llvm-project/pull/195075
---
 ...taskgraph_deps_irreducible_graph_reset.cpp | 50 ++++++++++++++++
 .../test/taskgraph/taskgraph_graph_id.cpp     | 56 ++++++++++++++++++
 .../taskgraph_graph_id_and_reset.cpp          | 57 +++++++++++++++++++
 .../test/taskgraph/taskgraph_graph_reset.cpp  | 57 +++++++++++++++++++
 4 files changed, 220 insertions(+)
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_deps_irreducible_graph_reset.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_graph_id.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_graph_id_and_reset.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_graph_reset.cpp

diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_irreducible_graph_reset.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_irreducible_graph_reset.cpp
new file mode 100644
index 0000000000000..0e94a1644cd99
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_irreducible_graph_reset.cpp
@@ -0,0 +1,50 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+
+// REQUIRES: omp_taskgraph_experimental
+
+#include <atomic>
+#include <cstdio>
+
+// This builds a non-series-parallel dependency shape:
+// A->B, A->C, B->D, C->D, C->E.
+
+int main() {
+  int deps[4] = {0, 0, 0, 0};
+
+  for (int iter = 0; iter < 1000; ++iter) {
+    std::atomic<int> sum{0};
+
+#pragma omp parallel num_threads(4)
+    {
+#pragma omp single
+      {
+#pragma omp taskgraph graph_id(123) graph_reset(1)
+        {
+#pragma omp task depend(out : deps[0], deps[1])
+          { sum.fetch_add(1, std::memory_order_relaxed); } // A
+#pragma omp task depend(inout : deps[0])
+          { sum.fetch_add(4, std::memory_order_relaxed); } // B
+#pragma omp task depend(inout : deps[1])
+          { sum.fetch_add(8, std::memory_order_relaxed); } // C
+#pragma omp task depend(in : deps[0], deps[1], deps[2], deps[3])
+          { sum.fetch_add(64, std::memory_order_relaxed); } // D
+#pragma omp task depend(in : deps[1], deps[2])
+          { sum.fetch_add(128, std::memory_order_relaxed); } // E
+        }
+      }
+    }
+
+    const int actual = sum.load(std::memory_order_relaxed);
+    if (actual != 205) {
+      std::fprintf(stderr, "FAIL iter=%d expected=205 actual=%d\n", iter,
+                   actual);
+      return 1;
+    }
+  }
+
+  std::fprintf(stderr, "PASS\n");
+  return 0;
+}
+
+// CHECK: PASS
diff --git a/openmp/runtime/test/taskgraph/taskgraph_graph_id.cpp b/openmp/runtime/test/taskgraph/taskgraph_graph_id.cpp
new file mode 100644
index 0000000000000..f536c2a03c9b1
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_graph_id.cpp
@@ -0,0 +1,56 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <atomic>
+#include <cstdio>
+
+int main() {
+  constexpr int NumIters = 12;
+  constexpr int WorkA = 16;
+  constexpr int WorkB = 24;
+
+  std::atomic<int> total{0};
+
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp single
+    {
+      for (int iter = 0; iter < NumIters; ++iter) {
+        const int gid = iter & 1;
+
+#pragma omp taskgraph graph_id(gid)
+        {
+          if (gid == 0) {
+            for (int i = 0; i < WorkA; ++i) {
+#pragma omp task
+              total.fetch_add(1, std::memory_order_relaxed);
+            }
+          } else {
+            for (int i = 0; i < WorkB; ++i) {
+#pragma omp task
+              total.fetch_add(1, std::memory_order_relaxed);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const int NumEven = (NumIters + 1) / 2;
+  const int NumOdd = NumIters / 2;
+  const int Expected = NumEven * WorkA + NumOdd * WorkB;
+  const int Actual = total.load(std::memory_order_relaxed);
+
+  if (Actual != Expected) {
+    std::fprintf(stderr, "FAIL graph_id total=%d expected=%d\n", Actual,
+                 Expected);
+    return 1;
+  }
+
+  std::fprintf(stderr, "PASS graph_id total=%d\n", Actual);
+  return 0;
+}
+
+// CHECK: PASS graph_id total=
diff --git a/openmp/runtime/test/taskgraph/taskgraph_graph_id_and_reset.cpp b/openmp/runtime/test/taskgraph/taskgraph_graph_id_and_reset.cpp
new file mode 100644
index 0000000000000..cadc527102ee7
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_graph_id_and_reset.cpp
@@ -0,0 +1,57 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <atomic>
+#include <cstdio>
+
+// Exercise graph_id and graph_reset together on a single taskgraph construct.
+//
+// graph_id is per-taskgraph: the runtime keeps one record per (construct,
+// graph_id) pair.  Here we feed the same construct two different graph_ids
+// alternately (0, 1, 0, 1, ...) so two records coexist for the same
+// directive.  graph_reset is then driven by a separate condition: every
+// fourth visit, we ask the runtime to re-record the graph_id we are about
+// to use, forcing it through the expiry-and-re-record path while the other
+// graph_id's record stays intact and continues to be replayed.
+
+int main() {
+  constexpr int NumIters = 40;
+  constexpr int TasksPerVisit = 6;
+
+  std::atomic<int> total{0};
+
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp single
+    {
+      for (int iter = 0; iter < NumIters; ++iter) {
+        const int gid = iter & 1; // alternate two records
+        const bool reset = (iter % 4) == 3; // periodically force re-record
+
+#pragma omp taskgraph graph_id(gid) graph_reset(reset)
+        {
+          for (int i = 0; i < TasksPerVisit; ++i) {
+#pragma omp task
+            total.fetch_add(1, std::memory_order_relaxed);
+          }
+        }
+      }
+    }
+  }
+
+  const int Expected = NumIters * TasksPerVisit;
+  const int Actual = total.load(std::memory_order_relaxed);
+
+  if (Actual != Expected) {
+    std::fprintf(stderr, "FAIL graph_id+reset total=%d expected=%d\n", Actual,
+                 Expected);
+    return 1;
+  }
+
+  std::fprintf(stderr, "PASS graph_id+reset total=%d\n", Actual);
+  return 0;
+}
+
+// CHECK: PASS graph_id+reset total=
diff --git a/openmp/runtime/test/taskgraph/taskgraph_graph_reset.cpp b/openmp/runtime/test/taskgraph/taskgraph_graph_reset.cpp
new file mode 100644
index 0000000000000..d41592f0ac243
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_graph_reset.cpp
@@ -0,0 +1,57 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <atomic>
+#include <cstdio>
+
+int main() {
+  constexpr int NumIters = 10;
+  constexpr int GraphId = 7;
+
+  std::atomic<int> total{0};
+
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp single
+    {
+      for (int iter = 0; iter < NumIters; ++iter) {
+        const bool odd = (iter & 1) != 0;
+
+        // Alternate the taskgraph shape every iteration. graph_reset(1)
+        // requires the runtime to re-record rather than replay stale shape.
+#pragma omp taskgraph graph_id(GraphId) graph_reset(1)
+        {
+          if (odd) {
+            for (int i = 0; i < 40; ++i) {
+#pragma omp task
+              total.fetch_add(2, std::memory_order_relaxed);
+            }
+          } else {
+            for (int i = 0; i < 10; ++i) {
+#pragma omp task
+              total.fetch_add(1, std::memory_order_relaxed);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const int NumOdd = NumIters / 2;
+  const int NumEven = NumIters - NumOdd;
+  const int Expected = NumOdd * 40 * 2 + NumEven * 10;
+  const int Actual = total.load(std::memory_order_relaxed);
+
+  if (Actual != Expected) {
+    std::fprintf(stderr, "FAIL graph_reset total=%d expected=%d\n", Actual,
+                 Expected);
+    return 1;
+  }
+
+  std::fprintf(stderr, "PASS graph_reset total=%d\n", Actual);
+  return 0;
+}
+
+// CHECK: PASS graph_reset total=

>From aef4299d08b0bdafe3cbda01f4631daaea916a8b Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Thu, 30 Apr 2026 07:02:52 -0500
Subject: [PATCH 15/24] [OpenMP] Add sanity check for concurrent taskgraph
 reset

This patch adds a sanity check for concurrent taskgraph re-recording:
the case where two threads might simultaneously hit a taskgraph region
with the same graph_id and a non-zero graph_reset clause.

This will only trigger when debugging/assertions are enabled: it might
be more helpful to do this check unconditionally, but I'm not sure what
sort of error path in libomp should be used in that case.

Pull Request: https://github.com/llvm/llvm-project/pull/195076
---
 openmp/runtime/src/kmp_tasking.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 82fde157fe603..2369f7086589e 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -5937,6 +5937,16 @@ void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
   record = *record_p;
   kmp_taskgraph_record_t *reuse_record = nullptr;
   if (record && graph_reset) {
+    kmp_taskgraph_status_t old_status = KMP_ATOMIC_LD_ACQ(&record->status);
+    // Sanity check: if the graph is not ready, it means another thread is
+    // already performing a record operation for this taskgraph/graph_id.
+    // That's likely a bug, so tell the user and assert.
+    if (old_status != KMP_TDG_READY) {
+      KG_TRACE(1, ("*** Multiple threads attempting to re-record taskgraph "
+                   "concurrently: T#%d loc=%p taskgraph=%p graph_id=%d\n",
+                   gtid, loc_ref, header, graph_id));
+      KMP_DEBUG_ASSERT(old_status == KMP_TDG_READY);
+    }
     // Move the existing record to the header's expiring list
     *record_p = record->next;
     record->next = header->expiring;

>From 61d2b9398e0d34880a1106652475d900ad306a5c Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Thu, 30 Apr 2026 07:29:09 -0500
Subject: [PATCH 16/24] [OpenMP] Add test for concurrent taskgraph re-record
 sanity check

This adds a test for the concurrent taskgraph re-record sanity check.

Assisted-By: GPT-5.4 Xhigh

Pull Request: https://github.com/llvm/llvm-project/pull/195077
---
 ...taskgraph_graph_id_concurrent_rerecord.cpp | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_graph_id_concurrent_rerecord.cpp

diff --git a/openmp/runtime/test/taskgraph/taskgraph_graph_id_concurrent_rerecord.cpp b/openmp/runtime/test/taskgraph/taskgraph_graph_id_concurrent_rerecord.cpp
new file mode 100644
index 0000000000000..f7f2c28151091
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_graph_id_concurrent_rerecord.cpp
@@ -0,0 +1,53 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_DYNAMIC=FALSE KMP_G_DEBUG=1 %not --crash %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental, libomp_debug
+// clang-format on
+
+#include <atomic>
+#include <cstdio>
+#include <omp.h>
+
+static std::atomic<int> first_thread_inside{0};
+static std::atomic<int> release_first_thread{0};
+
+// Deterministically force two threads to hit the same lexical taskgraph
+// construct with the same graph_id while the first thread is still recording
+// it.  That guarantees the second thread sees an existing record in
+// KMP_TDG_RECORDING state and trips the runtime sanity check.
+static void enter_same_taskgraph(int tid) {
+#pragma omp taskgraph graph_id(17) graph_reset(1)
+  {
+    if (tid == 0) {
+      first_thread_inside.store(1, std::memory_order_release);
+      while (release_first_thread.load(std::memory_order_acquire) == 0) {
+      }
+    }
+  }
+}
+
+int main() {
+#pragma omp parallel num_threads(2)
+  {
+    const int tid = omp_get_thread_num();
+
+    if (tid == 1) {
+      while (first_thread_inside.load(std::memory_order_acquire) == 0) {
+      }
+    }
+
+    enter_same_taskgraph(tid);
+
+    if (tid == 1)
+      release_first_thread.store(1, std::memory_order_release);
+  }
+
+  std::fprintf(stderr, "UNEXPECTED SUCCESS\n");
+  return 0;
+}
+
+// CHECK: *** Multiple threads attempting to re-record taskgraph concurrently:
+// CHECK-SAME: graph_id=17
+// CHECK: Assertion failure at kmp_tasking.cpp
+// CHECK-SAME: old_status == KMP_TDG_READY.
+// CHECK: OMP: Error #13: Assertion failure at kmp_tasking.cpp
+// CHECK-NOT: UNEXPECTED SUCCESS

>From 5cce343483898c9a25b0842fcc10859fb0806e01 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Fri, 15 May 2026 04:48:00 -0500
Subject: [PATCH 17/24] [OpenMP] Make __kmp_replay_taskgraph static (NFC)

The function has no callers outside its translation unit, so it can be
made static.

Pull Request: https://github.com/llvm/llvm-project/pull/200402
---
 openmp/runtime/src/kmp.h           | 5 -----
 openmp/runtime/src/kmp_tasking.cpp | 8 +++++---
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index ceb485cd7fffc..55267161d4b8a 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -4430,11 +4430,6 @@ extern kmp_int32 __kmp_omp_task(kmp_int32 gtid, kmp_task_t *new_task,
 extern kmp_int32 __kmp_build_taskgraph(kmp_int32 gtid,
                                        kmp_taskdata_t *current_taskdata,
                                        kmp_taskgraph_record_t *taskgraph);
-
-extern void __kmp_replay_taskgraph(kmp_int32 gtid,
-                                   kmp_taskdata_t *current_taskdata,
-                                   kmp_taskgraph_record_t *taskgraph,
-                                   kmp_uint32 graph_id);
 #endif
 
 KMP_EXPORT kmp_int32 __kmpc_cancel(ident_t *loc_ref, kmp_int32 gtid,
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 2369f7086589e..a6ed83fc4748e 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3178,9 +3178,11 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
 }
 
 #if OMP_TASKGRAPH_EXPERIMENTAL
-void __kmp_replay_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
-                            kmp_taskgraph_record_t *taskgraph,
-                            kmp_uint32 graph_id, kmp_taskgroup_t *taskgroup) {
+static void __kmp_replay_taskgraph(kmp_int32 gtid,
+                                   kmp_taskdata_t *current_taskdata,
+                                   kmp_taskgraph_record_t *taskgraph,
+                                   kmp_uint32 graph_id,
+                                   kmp_taskgroup_t *taskgroup) {
   kmp_info_t *thread = __kmp_threads[gtid];
 
   kmp_taskgraph_exec_descr_t *exec_descrs = taskgraph->exec_descrs;

>From 800c751faf24e9013da2f2da152773a43c9f172b Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Tue, 26 May 2026 14:59:11 -0500
Subject: [PATCH 18/24] [OpenMP] Widen taskgraph graph_id to uintptr_t

The 'graph_id' clause introduced in OpenMP 6.0 lets the user attach an
identifying value to a taskgraph instance, and a potentially-useful
idiom is to pass a pointer expression so that each instance can uniquely
depend on the pointed-to data.  The corresponding runtime entry point
__kmpc_taskgraph and the kmp_taskgraph_record_t::graph_id field were
declared as 32-bit types, which silently truncates such pointer values
on LP64 targets.

Widen the field and the runtime ABI to uintptr_t, lower 'graph_id'
to CGM.IntPtrTy in emitTaskgraphCall, and mark the OMPRTL
descriptor as SizeTy so the call is typed consistently on 32- and
64-bit hosts.  Refresh the auto-generated CHECK line in
taskgraph_codegen.cpp accordingly.

Assisted-By: Claude Opus 4.7

Pull Request: https://github.com/llvm/llvm-project/pull/200403
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp          | 4 ++--
 clang/test/OpenMP/taskgraph_codegen.cpp        | 2 +-
 llvm/include/llvm/Frontend/OpenMP/OMPKinds.def | 2 +-
 openmp/runtime/src/kmp.h                       | 4 ++--
 openmp/runtime/src/kmp_tasking.cpp             | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index bc6827784737f..31cc1ffcd53b9 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2269,13 +2269,13 @@ void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
     GraphReset = CGF.Builder.getInt32(0);
   }
 
-  llvm::Value *GraphId = CGF.Builder.getInt32(0);
+  llvm::Value *GraphId = llvm::ConstantInt::get(CGM.IntPtrTy, 0);
   const OMPGraphIdClause *GraphIdClause = D.getSingleClause<OMPGraphIdClause>();
   if (GraphIdClause) {
     const auto *E = GraphIdClause->getId();
     auto *GraphIdVal = CGF.EmitScalarExpr(E);
     GraphId =
-        CGF.Builder.CreateIntCast(GraphIdVal, CGM.Int32Ty, /*isSigned=*/false);
+        CGF.Builder.CreateIntCast(GraphIdVal, CGM.IntPtrTy, /*isSigned=*/false);
   }
 
   CodeGenFunction OutlinedCGF(CGM, /*suppressNewContext=*/true);
diff --git a/clang/test/OpenMP/taskgraph_codegen.cpp b/clang/test/OpenMP/taskgraph_codegen.cpp
index 3f661e6bfe3d5..e6e718c2cd6d5 100644
--- a/clang/test/OpenMP/taskgraph_codegen.cpp
+++ b/clang/test/OpenMP/taskgraph_codegen.cpp
@@ -22,7 +22,7 @@
 // CHECK-NEXT:    store ptr [[X]], ptr [[TMP1]], align 8
 // CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 1
 // CHECK-NEXT:    store ptr [[Y]], ptr [[TMP2]], align 8
-// CHECK-NEXT:    call void @__kmpc_taskgraph(ptr @[[GLOB1]], i32 [[TMP0]], ptr @.omp.taskgraph.handle, i32 0, i32 0, i32 0, ptr @taskgraph.omp_outlined., ptr [[AGG_CAPTURED]])
+// CHECK-NEXT:    call void @__kmpc_taskgraph(ptr @[[GLOB1]], i32 [[TMP0]], ptr @.omp.taskgraph.handle, i64 0, i32 0, i32 0, ptr @taskgraph.omp_outlined., ptr [[AGG_CAPTURED]])
 // CHECK-NEXT:    ret i32 0
 //
 int main() {
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 49aeecd426a32..fc24280eaa077 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -357,7 +357,7 @@ __OMP_RTL(__kmpc_omp_task, false, Int32, IdentPtr, Int32,
           /* kmp_task_t */ VoidPtr)
 __OMP_RTL(__kmpc_end_taskgroup, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_taskgroup, false, Void, IdentPtr, Int32)
-__OMP_RTL(__kmpc_taskgraph, false, Void, IdentPtr, Int32, VoidPtrPtr, Int32,
+__OMP_RTL(__kmpc_taskgraph, false, Void, IdentPtr, Int32, VoidPtrPtr, SizeTy,
           Int32, Int32, VoidPtr, VoidPtr)
 __OMP_RTL(__kmpc_taskgraph_task, false, Int32, IdentPtr, Int32, VoidPtr, Int32,
           SizeTy, VoidPtr, SizeTy, Int32, VoidPtr)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 55267161d4b8a..d660c4e191d13 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2760,7 +2760,7 @@ typedef struct kmp_taskgraph_region {
 typedef struct kmp_taskgraph_record {
   std::atomic<kmp_taskgraph_status_t> status = KMP_TDG_NONE;
   kmp_int32 gtid = 0;
-  kmp_int32 graph_id = 0;
+  uintptr_t graph_id = 0;
   // A lock that protects the record_map and num_tasks fields from being
   // modified by multiple threads.
   // For now, we also use this whilst the taskgraph is being replayed.
@@ -4502,7 +4502,7 @@ KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
 #if OMP_TASKGRAPH_EXPERIMENTAL
 KMP_EXPORT void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
                                  std::atomic<void *> *tdg_handle,
-                                 kmp_uint32 graph_id, kmp_int32 graph_reset,
+                                 uintptr_t graph_id, kmp_int32 graph_reset,
                                  kmp_int32 nogroup, void (*entry)(void *),
                                  void *args);
 KMP_EXPORT kmp_uint32 __kmpc_taskgraph_task(
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index a6ed83fc4748e..2f73a75f11e7c 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -5906,7 +5906,7 @@ __kmp_expire_taskgraph_records(kmp_int32 gtid,
 // entry:       Pointer to the entry function
 // args:        Pointer to the function arguments
 void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
-                      std::atomic<void *> *tdg_handle, kmp_uint32 graph_id,
+                      std::atomic<void *> *tdg_handle, uintptr_t graph_id,
                       kmp_int32 graph_reset, kmp_int32 nogroup,
                       void (*entry)(void *), void *args) {
   kmp_taskgraph_header_t *header =

>From 0267abce30fa66847a07180f8024fdadcbaf533b Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Tue, 26 May 2026 15:11:50 -0500
Subject: [PATCH 19/24] [OpenMP] Relocate task shared-by-ref captures across
 taskgraph replay

OpenMP 6.0 lets a taskgraph region be recorded once and replayed many
times.  Each replay creates a fresh instance of the 'args' pointer
block passed to __kmpc_taskgraph (and may execute at a different stack
location, or even on a different stack), so by-reference captures inside a
recorded task must be re-pointed at the live host objects of the current
invocation; otherwise the recorded tasks would dereference stale memory
from the stack frame of the initial call to __kmpc_taskgraph.

This patch introduces the small infrastructure to do that and wires
it up for the explicit 'task' construct.  A subsequent patch
extends the same scheme to 'taskloop'.

On the compiler side (CGOpenMPRuntime.cpp), a new helper
emitTaskRelocationFunction emits a per-task thunk:

  void __omp_taskgraph_relocate.NN(kmp_task_t *task,
                                   void *outer_captures);

The thunk walks the task's captures and overwrites each entry of
task->shareds with the address of the corresponding field projected from
the freshly reconstructed outer pointer block.  Two classes of capture do
not need updating and are treated as no-ops by the thunk: captures that
correspond to a firstprivate list item (the body reads from the per-task
'.kmp_privates.t' snapshot, populated when the task is allocated and
-- for non-trivial types -- reset on each replay by the clone helper
introduced later), and captures of variables with static storage duration
(their address is link-time fixed).  Reductions of a local-stack variable
are intentionally not in this set: the taskred state is keyed on the
recording-time taskgroup hierarchy and is not yet usable on replay,
so we prefer to preserve today's relocate-returns-null / runtime-aborts
behaviour for that case so the limitation surfaces as a diagnostic.

emitTaskCall now emits such a thunk for each taskgraph-recorded task
and passes it as the new trailing argument of __kmpc_taskgraph_task.
The redundant 'shareds' parameter is dropped, since relocation now
provides the supported mechanism for refreshing that pointer.

On the runtime side (kmp.h, kmp_tasking.cpp, OMPKinds.def),
introduce a new typedef kmp_task_relocate_t and store the callback
on each recorded task in kmp_taskgraph_node_t::relocate, together
with the outer-record pointer captured at __kmpc_taskgraph entry in
kmp_taskgraph_record_t::taskgraph_args.  __kmp_omp_tg_task invokes
the callback on replay, and aborts with a new fatal diagnostic
(OmpTaskgraphBadCapture, i18n/en_US.txt) when a recorded task has a
non-null shareds payload but no relocation callback.  There is also a
fix for a pre-existing bug in __kmp_taskgraph_clone_task -- the cloned
task's shareds pointer was left referring to the original's payload --
which becomes observable as soon as the relocation thunk writes through
that pointer.

New libomp tests cover lexical and non-lexical shared captures,
pointer captures, non-trivial types, recursive recordings,
stack-depth differences across replays, and the saved/expired-
graph cases.

Assisted-By: Claude Opus 4.7

Pull Request: https://github.com/llvm/llvm-project/pull/200404
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 184 +++++++++++++++++-
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |   2 +-
 openmp/runtime/src/i18n/en_US.txt             |   1 +
 openmp/runtime/src/kmp.h                      |   7 +-
 openmp/runtime/src/kmp_tasking.cpp            |  43 ++--
 .../taskgraph_firstprivate_stack_depth.cpp    | 112 +++++++++++
 ...eplayable_lexical_shared_mixed_capture.cpp |  46 +++++
 ...layable_lexical_shared_nontrivial_type.cpp |  60 ++++++
 ...xical_shared_nontrivial_type_recursive.cpp |  88 +++++++++
 ...raph_replayable_lexical_shared_pointer.cpp |  44 +++++
 ...xical_shared_pointer_recursive_frameid.cpp |  76 ++++++++
 ...ph_replayable_lexical_shared_recursive.cpp |  46 +++++
 ...kgraph_replayable_lexical_shared_works.cpp |  42 ++++
 ...h_replayable_nonlexical_shared_fails_1.cpp |  52 +++++
 ...h_replayable_nonlexical_shared_fails_2.cpp |  70 +++++++
 ...taskgraph_replayable_saved_stack_depth.cpp | 119 +++++++++++
 .../taskgraph_shared_stack_depth.cpp          |  94 +++++++++
 17 files changed, 1064 insertions(+), 22 deletions(-)
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_firstprivate_stack_depth.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_mixed_capture.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_recursive.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_recursive_frameid.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_recursive.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_works.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_1.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_2.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_shared_stack_depth.cpp

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 31cc1ffcd53b9..cc3f90248625c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2241,6 +2241,169 @@ void CGOpenMPRuntime::emitTaskyieldCall(CodeGenFunction &CGF,
     Region->emitUntiedSwitch(CGF);
 }
 
+/// Emit a helper with the runtime relocation signature (kmp_task_relocate_t):
+///   void relocate(kmp_task_t *task, void *outer_captures);
+///
+/// On taskgraph replay the runtime invokes this helper to refresh the task's
+/// shared-pointer table. Each capture (a shared-by-ref variable or \c this)
+/// that the task body actually dereferences at execution time is
+/// re-projected from the freshly reconstructed outer record passed as
+/// \p outer_captures and stored back into \c task->shareds.
+///
+/// Captures for which the task body cannot observe an address change
+/// across replays are skipped here:
+///
+///   * captures of a variable that appears as a firstprivate list item
+///     -- the body sources the value from the per-task '.kmp_privates.t'
+///     snapshot rather than from the shareds slot, so the (potentially
+///     stale) original address in the shareds entry is harmless;
+///
+///   * captures of a variable with static (global / namespace-scope /
+///     static-local / static-data-member) storage duration -- the
+///     captured pointer is the variable's link-time-fixed address, which
+///     is identical at recording and on every replay, so no re-projection
+///     is meaningful.
+///
+/// The relocate helper is therefore only ever called upon to refresh
+/// shareds slots that the body genuinely depends on at execution time
+/// (shared-by-ref to a local variable, captured \c this on a heap or
+/// stack object, etc.).  When every capture falls in one of the
+/// skip-eligible categories the helper is emitted as a (still non-null)
+/// no-op: the runtime treats a null callback with non-null shareds as a
+/// fatal error on replay, so a non-null no-op is the right signal that
+/// nothing in the shareds table needs refreshing for the body.
+///
+/// Reduction captures of a local-stack variable still keep the existing
+/// null-relocate-and-abort behaviour: the taskred runtime state is keyed
+/// off the recording-time taskgroup hierarchy and is not currently usable
+/// on replay, so it is preferable to fail loudly (#302) than to silently
+/// misbehave.  Reduction captures of a static-storage variable do not run
+/// into this hazard at the relocate layer -- the captured pointer is
+/// stable -- and are no-op-skipped via the static-storage rule above;
+/// whether the reduction body itself then succeeds on replay is a
+/// separate concern.
+///
+/// Returns null only when at least one capture is genuinely shared (none
+/// of the skip-eligible categories apply) AND cannot be resolved in
+/// \p OuterCSI; in that case the caller passes a null relocation function
+/// to the runtime and the runtime fails fast at replay.
+static llvm::Function *
+emitTaskRelocationFunction(CodeGenModule &CGM, SourceLocation Loc,
+                           const CapturedStmt &CS,
+                           const CodeGenFunction::CGCapturedStmtInfo *OuterCSI,
+                           const OMPTaskDataTy &Data) {
+  ASTContext &C = CGM.getContext();
+
+  // Variables that don't need their shareds slot refreshed across replays
+  // because the body sources them from the per-task '.kmp_privates.t'
+  // snapshot.  Today this is the set of firstprivate list items (snapshot
+  // is taken at task allocation and reused unchanged by every replay).
+  llvm::SmallPtrSet<const VarDecl *, 8> NoRelocateFirstprivateVars;
+  for (const Expr *E : Data.FirstprivateVars) {
+    if (!E)
+      continue;
+    if (const auto *DRE = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts()))
+      if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
+        NoRelocateFirstprivateVars.insert(VD->getCanonicalDecl());
+  }
+
+  // A capture is "no-op-safe" with respect to taskgraph replay when
+  // refreshing its shareds slot is provably unnecessary - either because
+  // the body never reads from that slot (firstprivate) or because the
+  // captured pointer is a link-time-fixed address and is therefore
+  // identical at every replay (static storage duration).
+  auto IsNoOpRelocate = [&](const CapturedStmt::Capture &Cap) {
+    if (Cap.capturesThis() || !Cap.capturesVariable())
+      return false;
+    const VarDecl *VD = Cap.getCapturedVar();
+    if (VD->hasGlobalStorage())
+      return true;
+    return NoRelocateFirstprivateVars.contains(VD->getCanonicalDecl());
+  };
+
+  auto LookupOuterField =
+      [&](const CapturedStmt::Capture &Cap) -> const FieldDecl * {
+    if (!OuterCSI)
+      return nullptr;
+    return Cap.capturesThis() ? OuterCSI->getThisFieldDecl()
+                              : OuterCSI->lookup(Cap.getCapturedVar());
+  };
+
+  // Bail out before emitting any IR if a genuinely-shared capture cannot
+  // be resolved in the containing context.  No-op-safe captures (see the
+  // function-level comment) don't participate in this preflight; they
+  // simply cause the helper to skip their slot below.
+  if (llvm::any_of(CS.captures(), [&](const CapturedStmt::Capture &Cap) {
+        assert((Cap.capturesThis() || Cap.capturesVariable()) &&
+               "OpenMP task capture must be shared-by-ref or 'this'");
+        return !IsNoOpRelocate(Cap) && !LookupOuterField(Cap);
+      }))
+    return nullptr;
+
+  // void relocate(void *task, void *outer_captures)
+  auto *TaskArg =
+      ImplicitParamDecl::Create(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                C.VoidPtrTy, ImplicitParamKind::Other);
+  auto *OuterArg =
+      ImplicitParamDecl::Create(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
+                                C.VoidPtrTy, ImplicitParamKind::Other);
+  FunctionArgList Args{TaskArg, OuterArg};
+  const CGFunctionInfo &FnInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+
+  std::string Name =
+      CGM.getOpenMPRuntime().getName({"omp", "taskgraph", "relocate", ""});
+  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(FnInfo),
+                                    llvm::GlobalValue::InternalLinkage, Name,
+                                    &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, FnInfo);
+  if (!CGM.getCodeGenOpts().SampleProfileFile.empty())
+    Fn->addFnAttr("sample-profile-suffix-elision-policy", "selected");
+  Fn->setDoesNotRecurse();
+
+  CodeGenFunction CGF(CGM);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args, Loc, Loc);
+
+  CGBuilderTy &Bld = CGF.Builder;
+  CharUnits PtrAlign = CGF.getPointerAlign();
+
+  // Base of the reconstructed outer record for this replay.
+  llvm::Value *OuterRaw = Bld.CreateLoad(CGF.GetAddrOfLocalVar(OuterArg));
+
+  // kmp_task_t::shareds is the first field of the runtime task descriptor;
+  // load it to obtain the void* shared table that we will refresh in place.
+  // The table holds one void* per by-ref capture.
+  llvm::Value *TaskRaw = Bld.CreateLoad(CGF.GetAddrOfLocalVar(TaskArg));
+  llvm::Value *SharedRaw =
+      Bld.CreateLoad(Address(TaskRaw, CGF.VoidPtrTy, PtrAlign));
+  Address SharedTable(SharedRaw, CGF.VoidPtrTy, PtrAlign);
+
+  unsigned Index = 0;
+  for (const CapturedStmt::Capture &Cap : CS.captures()) {
+    // Always advance the slot index so that we stay aligned with the
+    // shareds-table layout established at task allocation.
+    unsigned ThisIndex = Index++;
+    if (IsNoOpRelocate(Cap))
+      continue;
+    // Project the capture's referent from the freshly reconstructed outer
+    // record. EmitLValueForField auto-loads the outer reference field, so
+    // the resulting pointer is the live referent address (not the slot).
+    const FieldDecl *OuterField = LookupOuterField(Cap);
+    assert(OuterField && "preflight should have rejected this capture");
+    QualType OuterTy =
+        C.getCanonicalTagType(cast<RecordDecl>(OuterField->getDeclContext()));
+    LValue OuterBase = CGF.MakeAddrLValue(
+        Address(OuterRaw, CGF.ConvertTypeForMem(OuterTy), PtrAlign), OuterTy);
+    llvm::Value *Mapped =
+        CGF.EmitLValueForField(OuterBase, OuterField).getPointer(CGF);
+    Mapped = Bld.CreatePointerBitCastOrAddrSpaceCast(Mapped, CGM.VoidPtrTy);
+    Bld.CreateStore(Mapped, Bld.CreateConstGEP(SharedTable, ThisIndex));
+  }
+
+  CGF.FinishFunction();
+  return Fn;
+}
+
 void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         const OMPExecutableDirective &D,
@@ -4800,22 +4963,27 @@ void CGOpenMPRuntime::emitTaskCall(
     TGTaskArgs[2] = Result.NewTask;
     TGTaskArgs[3] = TaskAllocArgs[0]; // TaskFlags
     TGTaskArgs[4] = TaskAllocArgs[1]; // KmpTaskTWithPrivatesTySize
-    TGTaskArgs[5] = Shareds.emitRawPointer(CGF);
-    TGTaskArgs[6] = TaskAllocArgs[2]; // SharedsSize
+    TGTaskArgs[5] = TaskAllocArgs[2]; // SharedsSize
     if (auto RecType = dyn_cast<RecordType>(SharedsTy)) {
       auto *RD = RecType->getAsRecordDecl();
       if (RD->fields().empty()) {
         // FIXME: The condition might not be precisely correct here.
-        TGTaskArgs[6] = CGF.Builder.getSize(0);
+        TGTaskArgs[5] = CGF.Builder.getSize(0);
       }
     }
     if (Data.Dependences.size() == 0) {
-      TGTaskArgs[7] = CGF.Builder.getInt32(0);
-      TGTaskArgs[8] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+      TGTaskArgs[6] = CGF.Builder.getInt32(0);
+      TGTaskArgs[7] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
     } else {
-      TGTaskArgs[7] = NumOfElements;
-      TGTaskArgs[8] = DependenciesArray.emitRawPointer(CGF);
-    }
+      TGTaskArgs[6] = NumOfElements;
+      TGTaskArgs[7] = DependenciesArray.emitRawPointer(CGF);
+    }
+    const auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
+    llvm::Function *RelocFn =
+        emitTaskRelocationFunction(CGM, Loc, *CS, CGF.CapturedStmtInfo, Data);
+    TGTaskArgs[8] = RelocFn ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+                                  RelocFn, CGM.VoidPtrTy)
+                            : llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                             CGM.getModule(), OMPRTL___kmpc_taskgraph_task),
                         TGTaskArgs);
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index fc24280eaa077..e32308df74cae 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -360,7 +360,7 @@ __OMP_RTL(__kmpc_taskgroup, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_taskgraph, false, Void, IdentPtr, Int32, VoidPtrPtr, SizeTy,
           Int32, Int32, VoidPtr, VoidPtr)
 __OMP_RTL(__kmpc_taskgraph_task, false, Int32, IdentPtr, Int32, VoidPtr, Int32,
-          SizeTy, VoidPtr, SizeTy, Int32, VoidPtr)
+          SizeTy, SizeTy, Int32, VoidPtr, VoidPtr)
 __OMP_RTL(__kmpc_taskgraph_taskloop, false, Int32, IdentPtr, Int32, VoidPtr,
           Int32, SizeTy, VoidPtr, SizeTy, Int32, Int64Ptr, Int64Ptr, Int64,
           Int32, Int32, Int64, Int32, VoidPtr)
diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt
index 08e837d3dea11..3cd852abd66c6 100644
--- a/openmp/runtime/src/i18n/en_US.txt
+++ b/openmp/runtime/src/i18n/en_US.txt
@@ -482,6 +482,7 @@ AffHWSubsetIgnoringAttr      "KMP_HW_SUBSET: ignoring %1$s attribute. This machi
 TargetMemNotAvailable        "Target memory not available, will use default allocator."
 AffIgnoringNonHybrid         "%1$s ignored: This machine is not a hybrid architecutre. Using \"%2$s\" instead."
 AffIgnoringNotAvailable      "%1$s ignored: %2$s is not available. Using \"%3$s\" instead."
+OmpTaskgraphBadCapture       "Cannot locate captured shared variable reference for taskgraph replay"
 
 # --------------------------------------------------------------------------------------------------
 -*- HINTS -*-
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index d660c4e191d13..9a96121d94d36 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2482,6 +2482,7 @@ extern kmp_uint64 __kmp_taskloop_min_tasks;
 /*!
  */
 typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, void *);
+typedef void (*kmp_task_relocate_t)(struct kmp_task *, void *);
 
 typedef union kmp_cmplrdata {
   kmp_int32 priority; /**< priority specified by user for the task */
@@ -2692,6 +2693,7 @@ typedef struct kmp_taskgraph_region_dep {
 typedef struct kmp_taskgraph_node {
   kmp_task_t *task;
   bool taskloop_task;
+  kmp_task_relocate_t relocate;
   kmp_taskgraph_reduce_input_data_t *reduce_input;
   union {
     // Valid when KMP_TDG_RECORDING in parent taskgraph record.
@@ -2777,6 +2779,7 @@ typedef struct kmp_taskgraph_record {
   struct kmp_taskgraph_exec_descr *exec_descrs;
   kmp_size_t exec_descr_size;
   kmp_lock_t replay_lock;
+  void *taskgraph_args = nullptr;
   // We need a taskgroup structure to keep track of recorded tasks.  This is
   // set to TRUE if the user requested "nogroup" on the taskgraph directive
   // (then we can avoid blocking at the end of the taskgraph region on replay,
@@ -4507,8 +4510,8 @@ KMP_EXPORT void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
                                  void *args);
 KMP_EXPORT kmp_uint32 __kmpc_taskgraph_task(
     ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
-    size_t sizeof_kmp_task_t, void *shareds, size_t sizeof_shareds,
-    kmp_int32 ndeps, kmp_depend_info_t *dep_list);
+    size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_int32 ndeps,
+    kmp_depend_info_t *dep_list, kmp_task_relocate_t reloc);
 KMP_EXPORT kmp_uint32 __kmpc_taskgraph_taskloop(
     ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
     size_t sizeof_kmp_task_t, void *shareds, size_t sizeof_shareds,
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 2f73a75f11e7c..1b35a56f7eecd 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -2352,10 +2352,11 @@ static void __kmp_exec_descr_link_instances(kmp_taskgraph_exec_descr_t *descrs,
 
 /// Reset, reparent and regroup the recorded task TASK and re-invoke it.
 
-static void __kmp_omp_tg_task(kmp_int32 gtid, kmp_task_t *task,
+static void __kmp_omp_tg_task(kmp_int32 gtid, kmp_taskgraph_node_t *node,
                               kmp_taskgroup_t *taskgroup,
                               kmp_taskdata_t *parent_taskdata,
                               bool serialize_immediate) {
+  kmp_task_t *task = node->task;
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   taskdata->td_parent = parent_taskdata;
 
@@ -2378,6 +2379,18 @@ static void __kmp_omp_tg_task(kmp_int32 gtid, kmp_task_t *task,
   if (parent_taskdata->td_flags.tasktype == TASK_EXPLICIT)
     KMP_ATOMIC_INC(&parent_taskdata->td_allocated_child_tasks);
 
+  if (node->relocate) {
+    // Call the task's relocation function with the incoming args from the
+    // owning taskgraph.  This rewrites capture-by-reference variables to point
+    // to the correct location on the replayed taskgraph's stack (which may not
+    // be the same as the location from the initial recorded taskgraph).
+    node->relocate(task, taskdata->owning_taskgraph->taskgraph_args);
+  } else if (task->shareds != NULL) {
+    // A missing relocation callback is only fatal when there is a non-empty
+    // shareds payload that may contain by-reference captures needing remap.
+    KMP_FATAL(OmpTaskgraphBadCapture);
+  }
+
   __kmp_omp_task(gtid, task, false);
 }
 
@@ -2404,9 +2417,9 @@ static void __kmp_taskgraph_exec_descr_start(kmp_int32 gtid, kmp_info_t *thread,
     kmp_int32 nblocks = KMP_ATOMIC_DEC(&lowest_descr->nblocks);
     if (nblocks <= 0) {
       if (descr->region->type == TASKGRAPH_REGION_NODE) {
-        kmp_task_t *task = descr->region->task.node->task;
+        kmp_taskgraph_node_t *node = descr->region->task.node;
         kmp_taskdata_t *current_taskdata = thread->th.th_current_task;
-        __kmp_omp_tg_task(gtid, task, taskgroup, current_taskdata, false);
+        __kmp_omp_tg_task(gtid, node, taskgroup, current_taskdata, false);
       } else {
         // There's no task for a 'taskwait', so start successors immediately.
         kmp_taskgraph_exec_descr_t *walk = descr;
@@ -2447,9 +2460,9 @@ static void __kmp_taskgraph_exec_descr_start(kmp_int32 gtid, kmp_info_t *thread,
     kmp_taskgraph_exec_descr_t *item = head;
     do {
       assert(item->region->type == TASKGRAPH_REGION_NODE);
-      kmp_task_t *task = item->region->task.node->task;
+      kmp_taskgraph_node_t *node = item->region->task.node;
       kmp_taskdata_t *current_taskdata = thread->th.th_current_task;
-      __kmp_omp_tg_task(gtid, task, taskgroup, current_taskdata, true);
+      __kmp_omp_tg_task(gtid, node, taskgroup, current_taskdata, true);
       item = item->sibling;
     } while (item != head);
     break;
@@ -5023,6 +5036,7 @@ __kmp_taskgraph_node_alloc(kmp_taskgraph_record_t *rec, kmp_task_t *task,
 
   new_task->task = task;
   new_task->taskloop_task = false;
+  new_task->relocate = nullptr;
   new_task->reduce_input = nullptr;
   new_task->u.unresolved.ndeps = 0;
   new_task->u.unresolved.dep_list = nullptr;
@@ -5755,6 +5769,7 @@ static void __kmp_taskgraph_reset(kmp_taskgraph_record_t *rec, kmp_int32 gtid,
   rec->num_mutexes = 0;
   rec->exec_descrs = nullptr;
   rec->exec_descr_size = 0;
+  rec->taskgraph_args = nullptr;
   rec->next = nullptr;
 }
 
@@ -5852,10 +5867,6 @@ static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
   // FIXME: This should use a "taskdup" function like taskloops in cases where
   // private variables are not trivially copyable.  For now, do it by plain
   // bitwise copy.
-  // FIXME 2: It's intended that this copy be persistent, and can be
-  // re-executed on taskgraph replay.  Make sure that works (for shared
-  // variables) if stack addresses change (i.e. a task-generating function is
-  // called from different call stack depths).
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(orig);
   size_t shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
@@ -5864,6 +5875,11 @@ static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
   KMP_MEMCPY(copy_td, taskdata, shareds_offset + sizeof_shareds);
   // Tasks cloned for a taskgraph always have this field set.
   copy_td->owning_taskgraph = taskgraph;
+  kmp_task_t *copy_task = KMP_TASKDATA_TO_TASK(copy_td);
+  if (orig->shareds) {
+    // The memcpy kept orig's shareds address; point at the copy's own payload.
+    copy_task->shareds = (void *)((char *)copy_td + shareds_offset);
+  }
   KMP_ATOMIC_ST_RLX(&copy_td->td_incomplete_child_tasks, 0);
   return KMP_TASKDATA_TO_TASK(copy_td);
 }
@@ -5972,6 +5988,9 @@ void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
     // taskgroup.
     KMP_ATOMIC_ST_REL(&taskgroup->taskgraph.recording, record);
   }
+  // Keep the current taskgraph invocation's outlined-entry args for
+  // replay-time relocation of by-reference captures.
+  record->taskgraph_args = args;
   __kmp_release_lock(&header->header_lock, gtid);
 
   kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&record->status);
@@ -6000,9 +6019,10 @@ void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
 
 kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
                                  kmp_task_t *new_task, kmp_int32 flags,
-                                 size_t sizeof_kmp_task_t, void *shareds,
+                                 size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds, kmp_int32 ndeps,
-                                 kmp_depend_info_t *dep_list) {
+                                 kmp_depend_info_t *dep_list,
+                                 kmp_task_relocate_t relocate) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgroup_t *taskgroup = thread->th.th_current_task->td_taskgroup;
   kmp_taskgraph_record_t *rec = __kmp_taskgraph_or_parent_recording(taskgroup);
@@ -6038,6 +6058,7 @@ kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
           thread, ndeps * sizeof(kmp_depend_info_t));
       KMP_MEMCPY(node->u.unresolved.dep_list, dep_list,
                  ndeps * sizeof(kmp_depend_info_t));
+      node->relocate = relocate;
     } else if (status == KMP_TDG_READY) {
 #ifdef DEBUG_TASKGRAPH
       fprintf(stderr,
diff --git a/openmp/runtime/test/taskgraph/taskgraph_firstprivate_stack_depth.cpp b/openmp/runtime/test/taskgraph/taskgraph_firstprivate_stack_depth.cpp
new file mode 100644
index 0000000000000..e3f8976bc1ae0
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_firstprivate_stack_depth.cpp
@@ -0,0 +1,112 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+static volatile int StackSink = 0;
+// Keep the observable result in stable storage so this test isolates
+// firstprivate replay across different stack depths. There is a separate bug
+// test for replayed tasks writing through a stack-local shared pointer cached
+// at record time.
+static volatile int ReplayResult = -1;
+
+struct Payload {
+  int values[6];
+  int bias;
+};
+
+__attribute__((noinline)) static int evaluate_payload(const Payload &payload,
+                                                      int seed) {
+  return seed * payload.values[0] - payload.values[1] +
+         payload.values[2] * payload.values[3] - payload.bias +
+         payload.values[4] - payload.values[5];
+}
+
+__attribute__((noinline)) static void clobber_stack(int base) {
+  volatile int scratch[4096];
+
+  for (int i = 0; i < 4096; ++i)
+    scratch[i] = base + i;
+
+  StackSink += scratch[base & 63];
+}
+
+__attribute__((noinline)) static int run_taskgraph(int seed) {
+  Payload payload{
+      {seed + 1, seed + 3, seed + 5, seed + 7, seed + 11, seed + 13},
+      seed * 17 + 19};
+  ReplayResult = -1;
+
+#pragma omp taskgraph graph_id(91)
+  {
+#pragma omp task firstprivate(payload, seed) shared(ReplayResult)
+    {
+      ReplayResult = evaluate_payload(payload, seed);
+    }
+  }
+
+  return ReplayResult;
+}
+
+__attribute__((noinline)) static int call_with_depth(int seed, int depth) {
+  volatile int padding[128];
+
+  for (int i = 0; i < 128; ++i)
+    padding[i] = seed + depth + i;
+
+  StackSink += padding[(seed + depth) & 127];
+
+  if (depth == 0)
+    return run_taskgraph(seed);
+  return call_with_depth(seed, depth - 1);
+}
+
+__attribute__((noinline)) static int expected_result(int seed) {
+  Payload payload{
+      {seed + 1, seed + 3, seed + 5, seed + 7, seed + 11, seed + 13},
+      seed * 17 + 19};
+  return evaluate_payload(payload, seed);
+}
+
+int main() {
+  constexpr int NumCalls = 4;
+  constexpr int Seeds[NumCalls] = {3, 17, 29, 41};
+  constexpr int Depths[NumCalls] = {0, 3, 1, 5};
+
+  int recorded = -1;
+  bool failed = false;
+
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp single
+    {
+      recorded = call_with_depth(Seeds[0], Depths[0]);
+      if (recorded != expected_result(Seeds[0])) {
+        std::fprintf(stderr, "FAIL initial record got=%d expected=%d\n",
+                     recorded, expected_result(Seeds[0]));
+        failed = true;
+      }
+
+      for (int i = 1; i < NumCalls; ++i) {
+        clobber_stack(Seeds[i] * 1000);
+        const int replayed = call_with_depth(Seeds[i], Depths[i]);
+        if (replayed != recorded) {
+          std::fprintf(stderr,
+                       "FAIL replay %d depth=%d seed=%d got=%d expected=%d\n",
+                       i, Depths[i], Seeds[i], replayed, recorded);
+          failed = true;
+        }
+      }
+    }
+  }
+
+  if (failed)
+    return 1;
+
+  std::fprintf(stderr, "PASS firstprivate stack result=%d\n", recorded);
+  return 0;
+}
+
+// CHECK: PASS firstprivate stack result=
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_mixed_capture.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_mixed_capture.cpp
new file mode 100644
index 0000000000000..0ea8e4bdadbb3
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_mixed_capture.cpp
@@ -0,0 +1,46 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int run_taskgraph_mixed_capture(int seed) {
+  int x = seed;
+  int y = seed * 2;
+  int out = -1;
+  int fp = 7;
+
+#pragma omp taskgraph graph_id(401)
+  {
+#pragma omp task replayable(1) shared(x, y, out) firstprivate(fp)              \
+    depend(inout : x, y)
+    {
+      x += fp;
+      y += x;
+      out = y + fp;
+    }
+  }
+
+  return out;
+}
+
+int main() {
+  const int first = run_taskgraph_mixed_capture(1);
+  const int second = run_taskgraph_mixed_capture(100);
+
+  if (first != 17 || second != 314) {
+    std::fprintf(stderr,
+                 "FAIL lexical mixed capture replay first=%d second=%d "
+                 "expected=17/314\n",
+                 first, second);
+    return 1;
+  }
+
+  std::fprintf(stderr, "PASS lexical mixed capture replay first=%d second=%d\n",
+               first, second);
+  return 0;
+}
+
+// CHECK: PASS lexical mixed capture replay first=17 second=314
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type.cpp
new file mode 100644
index 0000000000000..f3a50a27176d4
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type.cpp
@@ -0,0 +1,60 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+struct Tracker {
+  static int Ctors;
+  static int Dtors;
+
+  int Value;
+
+  explicit Tracker(int V) : Value(V) { ++Ctors; }
+  ~Tracker() { ++Dtors; }
+
+  void bump(int Delta) { Value += Delta; }
+};
+
+int Tracker::Ctors = 0;
+int Tracker::Dtors = 0;
+
+__attribute__((noinline)) static int run_taskgraph_nontrivial(int seed) {
+  Tracker Obj(seed);
+  int out = -1;
+
+#pragma omp taskgraph graph_id(403)
+  {
+#pragma omp task replayable(1) shared(Obj, out)
+    {
+      Obj.bump(11);
+      out = Obj.Value;
+    }
+  }
+
+  return out;
+}
+
+int main() {
+  const int first = run_taskgraph_nontrivial(1);
+  const int second = run_taskgraph_nontrivial(100);
+
+  if (first != 12 || second != 111 || Tracker::Ctors < 2 ||
+      Tracker::Dtors < 2 || Tracker::Ctors != Tracker::Dtors) {
+    std::fprintf(
+        stderr,
+        "FAIL lexical nontrivial replay first=%d second=%d ctors=%d dtors=%d\n",
+        first, second, Tracker::Ctors, Tracker::Dtors);
+    return 1;
+  }
+
+  std::fprintf(
+      stderr,
+      "PASS lexical nontrivial replay first=%d second=%d ctors=%d dtors=%d\n",
+      first, second, Tracker::Ctors, Tracker::Dtors);
+  return 0;
+}
+
+// CHECK: PASS lexical nontrivial replay first=12 second=111
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_recursive.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_recursive.cpp
new file mode 100644
index 0000000000000..6012b6194e56b
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_recursive.cpp
@@ -0,0 +1,88 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+struct Tracker {
+  static int Ctors;
+  static int Dtors;
+
+  int Value;
+
+  explicit Tracker(int V) : Value(V) { ++Ctors; }
+  ~Tracker() { ++Dtors; }
+
+  void bump(int Delta) { Value += Delta; }
+};
+
+int Tracker::Ctors = 0;
+int Tracker::Dtors = 0;
+
+__attribute__((noinline)) static int expected_recursive(int depth, int seed,
+                                                        int run_tag) {
+  int local = seed + (depth + 1) * 5 + run_tag;
+  if (depth == 0)
+    return local;
+  return local + expected_recursive(depth - 1, seed + 9, run_tag);
+}
+
+__attribute__((noinline)) static int
+run_recursive_nontrivial(int depth, int seed, int run_tag) {
+  Tracker Obj(seed);
+  int out = -1;
+
+  int gid = 500 + depth;
+#pragma omp taskgraph graph_id(gid)
+  {
+#pragma omp task replayable(1) shared(Obj, out, depth, run_tag)
+    {
+      Obj.bump((depth + 1) * 5 + run_tag);
+      out = Obj.Value;
+    }
+  }
+
+  if (depth == 0)
+    return out;
+  return out + run_recursive_nontrivial(depth - 1, seed + 9, run_tag);
+}
+
+int main() {
+  const int depth = 3;
+  int total_actual = 0;
+  int total_expected = 0;
+
+  for (int run = 0; run < 3; ++run) {
+    const int seed = 100 * run + 1;
+    const int actual = run_recursive_nontrivial(depth, seed, run);
+    const int expected = expected_recursive(depth, seed, run);
+
+    if (actual != expected) {
+      std::fprintf(stderr,
+                   "FAIL recursive nontrivial run=%d actual=%d expected=%d\n",
+                   run, actual, expected);
+      return 1;
+    }
+
+    total_actual += actual;
+    total_expected += expected;
+  }
+
+  if (Tracker::Ctors != Tracker::Dtors || Tracker::Ctors < 12) {
+    std::fprintf(stderr,
+                 "FAIL recursive nontrivial lifetime ctors=%d dtors=%d "
+                 "total=%d expected=%d\n",
+                 Tracker::Ctors, Tracker::Dtors, total_actual, total_expected);
+    return 1;
+  }
+
+  std::fprintf(
+      stderr,
+      "PASS recursive nontrivial total=%d expected=%d ctors=%d dtors=%d\n",
+      total_actual, total_expected, Tracker::Ctors, Tracker::Dtors);
+  return 0;
+}
+
+// CHECK: PASS recursive nontrivial total=
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer.cpp
new file mode 100644
index 0000000000000..7690aff37853c
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer.cpp
@@ -0,0 +1,44 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int run_taskgraph_pointer_shared(int seed) {
+  int value = seed;
+  int *ptr = &value;
+  int out = -1;
+
+#pragma omp taskgraph graph_id(402)
+  {
+#pragma omp task replayable(1) shared(ptr, out) depend(inout : value)
+    {
+      *ptr += 3;
+      out = *ptr;
+    }
+  }
+
+  return out;
+}
+
+int main() {
+  const int first = run_taskgraph_pointer_shared(1);
+  const int second = run_taskgraph_pointer_shared(100);
+
+  if (first != 4 || second != 103) {
+    std::fprintf(stderr,
+                 "FAIL lexical pointer shared replay first=%d second=%d "
+                 "expected=4/103\n",
+                 first, second);
+    return 1;
+  }
+
+  std::fprintf(stderr,
+               "PASS lexical pointer shared replay first=%d second=%d\n", first,
+               second);
+  return 0;
+}
+
+// CHECK: PASS lexical pointer shared replay first=4 second=103
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_recursive_frameid.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_recursive_frameid.cpp
new file mode 100644
index 0000000000000..142a5941f97ae
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_recursive_frameid.cpp
@@ -0,0 +1,76 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdint>
+#include <cstdio>
+
+__attribute__((noinline)) static int expected_recursive(int depth, int seed,
+                                                        int run_tag) {
+  int value = seed;
+  value += (depth + 1) * 3 + run_tag;
+  if (depth == 0)
+    return value;
+  return value + expected_recursive(depth - 1, seed + 7, run_tag);
+}
+
+__attribute__((noinline)) static int run_recursive_frameid(int depth, int seed,
+                                                           int run_tag) {
+  int value = seed;
+  int *ptr = &value;
+  int *&ptr_ref = ptr;
+  int out = -1;
+
+  // Typically, if captured pointers refer to locations on the stack, that
+  // would not be safe for taskgraph record/replay because, in general, we
+  // cannot rewrite such pointers to point to the current (live) stack frame.
+  //
+  // This is one possible way around that though: we keep a taskgraph record
+  // per stack-depth, each of which may refer to the local stack frame.
+  //
+  // I probably wouldn't recommend use of this technique in production code.
+  uintptr_t frame_gid = reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
+
+#pragma omp taskgraph graph_id(frame_gid)
+  {
+#pragma omp task shared(ptr_ref, out, depth, run_tag) depend(inout : value)
+    {
+      *ptr_ref += (depth + 1) * 3 + run_tag;
+      out = *ptr_ref;
+    }
+  }
+
+  if (depth == 0)
+    return out;
+  return out + run_recursive_frameid(depth - 1, seed + 7, run_tag);
+}
+
+int main() {
+  const int depth = 3;
+  int actual_sum = 0;
+  int expected_sum = 0;
+
+  for (int run = 0; run < 3; ++run) {
+    int seed = 100 * run + 1;
+    int actual = run_recursive_frameid(depth, seed, run);
+    int expected = expected_recursive(depth, seed, run);
+    if (actual != expected) {
+      std::fprintf(
+          stderr,
+          "FAIL recursive pointer frameid run=%d actual=%d expected=%d\n", run,
+          actual, expected);
+      return 1;
+    }
+    actual_sum += actual;
+    expected_sum += expected;
+  }
+
+  std::fprintf(stderr,
+               "PASS recursive pointer frameid runs=3 total=%d expected=%d\n",
+               actual_sum, expected_sum);
+  return 0;
+}
+
+// CHECK: PASS recursive pointer frameid runs=3 total=
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_recursive.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_recursive.cpp
new file mode 100644
index 0000000000000..4d0d7b5613108
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_recursive.cpp
@@ -0,0 +1,46 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int run_taskgraph_recursive(int depth,
+                                                             int seed) {
+  int x = seed;
+  int out = -1;
+
+#pragma omp taskgraph graph_id(450)
+  {
+#pragma omp task replayable(1) shared(x, out, depth) depend(inout : x)
+    {
+      x += depth + 1;
+      out = x;
+    }
+  }
+
+  if (depth == 0)
+    return out;
+
+  return out + run_taskgraph_recursive(depth - 1, seed + 10);
+}
+
+int main() {
+  const int first = run_taskgraph_recursive(3, 1);
+  const int second = run_taskgraph_recursive(3, 100);
+
+  if (first != 74 || second != 470) {
+    std::fprintf(
+        stderr,
+        "FAIL lexical recursive replay first=%d second=%d expected=74/470\n",
+        first, second);
+    return 1;
+  }
+
+  std::fprintf(stderr, "PASS lexical recursive replay first=%d second=%d\n",
+               first, second);
+  return 0;
+}
+
+// CHECK: PASS lexical recursive replay first=74 second=470
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_works.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_works.cpp
new file mode 100644
index 0000000000000..82be9c82d3e7c
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_works.cpp
@@ -0,0 +1,42 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int run_taskgraph_lexical(int seed) {
+  int x = seed;
+  int out = -1;
+
+#pragma omp taskgraph graph_id(311)
+  {
+#pragma omp task replayable(1) shared(x, out) depend(inout : x)
+    {
+      x += 5;
+      out = x;
+    }
+  }
+
+  return out;
+}
+
+int main() {
+  const int first = run_taskgraph_lexical(1);
+  const int second = run_taskgraph_lexical(100);
+
+  if (first != 6 || second != 105) {
+    std::fprintf(
+        stderr,
+        "FAIL lexical shared replay first=%d second=%d expected=6/105\n", first,
+        second);
+    return 1;
+  }
+
+  std::fprintf(stderr, "PASS lexical shared replay first=%d second=%d\n", first,
+               second);
+  return 0;
+}
+
+// CHECK: PASS lexical shared replay first=6 second=105
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_1.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_1.cpp
new file mode 100644
index 0000000000000..671e3bf76951f
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_1.cpp
@@ -0,0 +1,52 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %not --crash %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+// This seems like it could work in principle, but in general we don't know
+// where the targets of the referenced variables are when the task is replayed.
+__attribute__((noinline)) static void emit_nonlexical_task(int &x, int &out) {
+#pragma omp task replayable(1) shared(x, out) depend(inout : x)
+  {
+    x += 5;
+    out = x;
+  }
+}
+
+__attribute__((noinline)) static int run_taskgraph_nonlexical(int seed) {
+  int x = seed;
+  int out = -1;
+
+#pragma omp taskgraph graph_id(312)
+  {
+    emit_nonlexical_task(x, out);
+  }
+
+  return out;
+}
+
+int main() {
+  const int recorded = run_taskgraph_nonlexical(1);
+  const int replayed = run_taskgraph_nonlexical(100);
+
+  // The "non-lexical" replayable task is emitted in a helper function outside
+  // the taskgraph lexical scope.  We expect this to raise a runtime error.
+  if (recorded == replayed) {
+    std::fprintf(
+        stderr,
+        "UNEXPECTED SUCCESS nonlexical replay recorded=%d replayed=%d\n",
+        recorded, replayed);
+    return 0;
+  }
+
+  std::fprintf(stderr,
+               "EXPECTED FAILURE nonlexical replay recorded=%d replayed=%d\n",
+               recorded, replayed);
+  return 1;
+}
+
+// CHECK: OMP: Error #302: Cannot locate captured shared variable reference for
+// taskgraph replay
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_2.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_2.cpp
new file mode 100644
index 0000000000000..705114d19876e
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_2.cpp
@@ -0,0 +1,70 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %not --crash %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int emit_nonlexical_task(int seed) {
+  int x = seed;
+  int out = -1;
+
+// This is syntactically valid, but a taskgraph replay that includes this
+// task cannot possibly succeed, because the stack frame containing 'x' and
+// 'out' doesn't exist at replay time.  We can raise a runtime error in that
+// case.
+// This isn't a compile error because the code is still valid if no taskgraph
+// record/replay is in progress.
+#pragma omp task replayable(1) shared(x, out) depend(inout : x)
+  {
+    x += 5;
+    out = x;
+  }
+
+  return out;
+}
+
+__attribute__((noinline)) static int run_taskgraph_nonlexical(int seed) {
+  int out;
+
+#pragma omp taskgraph graph_id(312)
+  {
+#pragma omp task shared(out)
+    {
+      out = emit_nonlexical_task(seed);
+    }
+  }
+
+  return out;
+}
+
+int main() {
+  int out = emit_nonlexical_task(50);
+  if (out != 55) {
+    std::fprintf(stderr,
+                 "UNEXPECTED FAILURE: task outside taskgraph returned %d\n",
+                 out);
+  }
+
+  const int recorded = run_taskgraph_nonlexical(1);
+  const int replayed = run_taskgraph_nonlexical(100);
+
+  // The non-lexical replayable task is emitted in a helper function outside
+  // the taskgraph lexical scope.
+  if (recorded == replayed) {
+    std::fprintf(
+        stderr,
+        "UNEXPECTED SUCCESS nonlexical replay recorded=%d replayed=%d\n",
+        recorded, replayed);
+    return 0;
+  }
+
+  std::fprintf(stderr,
+               "EXPECTED FAILURE nonlexical replay recorded=%d replayed=%d\n",
+               recorded, replayed);
+  return 1;
+}
+
+// CHECK: OMP: Error #302: Cannot locate captured shared variable reference for
+// taskgraph replay
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth.cpp
new file mode 100644
index 0000000000000..922cb85a53eec
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth.cpp
@@ -0,0 +1,119 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// XFAIL: *
+// clang-format on
+
+#include <cstdio>
+
+static volatile int StackSink = 0;
+static volatile int ReplayResult = -1;
+
+struct Payload {
+  int values[6];
+  int bias;
+};
+
+__attribute__((noinline)) static int evaluate_payload(const Payload &payload,
+                                                      int seed) {
+  return seed * payload.values[0] - payload.values[1] +
+         payload.values[2] * payload.values[3] - payload.bias +
+         payload.values[4] - payload.values[5];
+}
+
+__attribute__((noinline)) static void clobber_stack(int base) {
+  volatile int scratch[4096];
+
+  for (int i = 0; i < 4096; ++i)
+    scratch[i] = base + i;
+
+  StackSink += scratch[base & 63];
+}
+
+// Intended future usage of firstprivate(saved: ...): this replayable task is
+// not lexically nested within the taskgraph directive. It is created from a
+// helper function called inside the taskgraph region, and needs closure-like
+// capture of that helper's stack locals.
+__attribute__((noinline)) static void emit_replayable_task(int seed) {
+  Payload payload{
+      {seed + 1, seed + 3, seed + 5, seed + 7, seed + 11, seed + 13},
+      seed * 17 + 19};
+
+#pragma omp task replayable(1) firstprivate(saved : payload, seed)             \
+    shared(ReplayResult)
+  {
+    ReplayResult = evaluate_payload(payload, seed);
+  }
+}
+
+__attribute__((noinline)) static int run_taskgraph(int seed) {
+  ReplayResult = -1;
+
+#pragma omp taskgraph graph_id(93)
+  {
+    emit_replayable_task(seed);
+  }
+
+  return ReplayResult;
+}
+
+__attribute__((noinline)) static int call_with_depth(int seed, int depth) {
+  volatile int padding[128];
+
+  for (int i = 0; i < 128; ++i)
+    padding[i] = seed + depth + i;
+
+  StackSink += padding[(seed + depth) & 127];
+
+  if (depth == 0)
+    return run_taskgraph(seed);
+  return call_with_depth(seed, depth - 1);
+}
+
+__attribute__((noinline)) static int expected_result(int seed) {
+  Payload payload{
+      {seed + 1, seed + 3, seed + 5, seed + 7, seed + 11, seed + 13},
+      seed * 17 + 19};
+  return evaluate_payload(payload, seed);
+}
+
+int main() {
+  constexpr int NumCalls = 4;
+  constexpr int Seeds[NumCalls] = {3, 17, 29, 41};
+  constexpr int Depths[NumCalls] = {0, 3, 1, 5};
+
+  int recorded = -1;
+  bool failed = false;
+
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp single
+    {
+      recorded = call_with_depth(Seeds[0], Depths[0]);
+      if (recorded != expected_result(Seeds[0])) {
+        std::fprintf(stderr, "FAIL initial record got=%d expected=%d\n",
+                     recorded, expected_result(Seeds[0]));
+        failed = true;
+      }
+
+      for (int i = 1; i < NumCalls; ++i) {
+        clobber_stack(Seeds[i] * 1000);
+        const int replayed = call_with_depth(Seeds[i], Depths[i]);
+        if (replayed != recorded) {
+          std::fprintf(stderr,
+                       "FAIL replay %d depth=%d seed=%d got=%d expected=%d\n",
+                       i, Depths[i], Seeds[i], replayed, recorded);
+          failed = true;
+        }
+      }
+    }
+  }
+
+  if (failed)
+    return 1;
+
+  std::fprintf(stderr, "PASS replayable saved stack result=%d\n", recorded);
+  return 0;
+}
+
+// CHECK: PASS replayable saved stack result=
diff --git a/openmp/runtime/test/taskgraph/taskgraph_shared_stack_depth.cpp b/openmp/runtime/test/taskgraph/taskgraph_shared_stack_depth.cpp
new file mode 100644
index 0000000000000..ad579e8ed1b3d
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_shared_stack_depth.cpp
@@ -0,0 +1,94 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// XFAIL: *
+// clang-format on
+
+#include <cstdio>
+
+static volatile int StackSink = 0;
+
+struct Payload {
+  int values[6];
+  int bias;
+};
+
+__attribute__((noinline)) static int evaluate_payload(const Payload &payload,
+                                                      int seed) {
+  return seed * payload.values[0] - payload.values[1] +
+         payload.values[2] * payload.values[3] - payload.bias +
+         payload.values[4] - payload.values[5];
+}
+
+__attribute__((noinline)) static void clobber_stack(int base) {
+  volatile int scratch[4096];
+
+  for (int i = 0; i < 4096; ++i)
+    scratch[i] = base + i;
+
+  StackSink += scratch[base & 63];
+}
+
+__attribute__((noinline)) static int run_taskgraph(int seed) {
+  Payload payload{
+      {seed + 1, seed + 3, seed + 5, seed + 7, seed + 11, seed + 13},
+      seed * 17 + 19};
+  int result = -1;
+
+#pragma omp taskgraph graph_id(92)
+  {
+#pragma omp task firstprivate(payload, seed) shared(result)
+    {
+      result = evaluate_payload(payload, seed);
+    }
+  }
+
+  return result;
+}
+
+__attribute__((noinline)) static int call_with_depth(int seed, int depth) {
+  volatile int padding[128];
+
+  for (int i = 0; i < 128; ++i)
+    padding[i] = seed + depth + i;
+
+  StackSink += padding[(seed + depth) & 127];
+
+  if (depth == 0)
+    return run_taskgraph(seed);
+  return call_with_depth(seed, depth - 1);
+}
+
+__attribute__((noinline)) static int expected_result(int seed) {
+  Payload payload{
+      {seed + 1, seed + 3, seed + 5, seed + 7, seed + 11, seed + 13},
+      seed * 17 + 19};
+  return evaluate_payload(payload, seed);
+}
+
+int main() {
+  constexpr int RecordSeed = 3;
+  constexpr int ReplaySeed = 17;
+
+  const int recorded = call_with_depth(RecordSeed, 0);
+  if (recorded != expected_result(RecordSeed)) {
+    std::fprintf(stderr, "FAIL initial record got=%d expected=%d\n", recorded,
+                 expected_result(RecordSeed));
+    return 1;
+  }
+
+  clobber_stack(ReplaySeed * 1000);
+  const int replayed = call_with_depth(ReplaySeed, 3);
+  if (replayed != recorded) {
+    std::fprintf(
+        stderr, "BUG shared stack replay depth=%d seed=%d got=%d expected=%d\n",
+        3, ReplaySeed, replayed, recorded);
+    return 1;
+  }
+
+  std::fprintf(stderr, "PASS shared stack replay=%d\n", replayed);
+  return 0;
+}
+
+// CHECK: PASS shared stack replay=14

>From f6fc32ca30f94652c7e36c4a8f3e65cc358171c6 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Tue, 26 May 2026 15:20:12 -0500
Subject: [PATCH 20/24] [OpenMP] Extend taskgraph shared-data relocation to
 taskloop

Build on the per-task relocation infrastructure introduced for
'omp task' in the previous patch and apply the same scheme to
taskloop tasks recorded inside a taskgraph.  Without this,
by-reference captures inside a recorded taskloop iteration still
point at the original recording's stack on each replay.

On the compiler side (CGOpenMPRuntime.cpp), emit a relocation
thunk for the taskloop's captured statement via the shared
emitTaskRelocationFunction helper and pass it as the new trailing
argument of __kmpc_taskgraph_taskloop.  The now-unused
'sizeof_kmp_task_t', 'shareds' and 'sizeof_shareds' parameters are
dropped from the call and from TGTaskLoopArgs.

On the runtime side, update __kmpc_taskgraph_taskloop to match
the new ABI (drop the dead 'sizeof_kmp_task_t' / 'shareds' /
'sizeof_shareds' parameters, add the trailing kmp_task_relocate_t
parameter) and plumb the callback through the taskgraph variant of
__kmp_taskloop and __kmp_taskloop_linear so every recorded
subtask gets node->relocate = reloc, mirroring the explicit-task
path.  Non-taskgraph callers pass a default nullptr and are
unaffected.

Add taskloop counterparts of the omp-task runtime tests added in
the previous patch.

Assisted-By: Claude Opus 4.7

Pull Request: https://github.com/llvm/llvm-project/pull/200405
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         |  30 ++--
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |   4 +-
 openmp/runtime/src/kmp.h                      |   3 +-
 openmp/runtime/src/kmp_tasking.cpp            |  24 ++--
 ..._lexical_shared_mixed_capture_taskloop.cpp |  45 ++++++
 ...red_nontrivial_type_recursive_taskloop.cpp |  89 ++++++++++++
 ...exical_shared_nontrivial_type_taskloop.cpp |  59 ++++++++
 ...red_pointer_recursive_frameid_taskloop.cpp |  86 ++++++++++++
 ...ayable_lexical_shared_pointer_taskloop.cpp |  44 ++++++
 ...able_lexical_shared_recursive_taskloop.cpp |  74 ++++++++++
 ...playable_lexical_shared_works_taskloop.cpp |  42 ++++++
 ...ble_nonlexical_shared_fails_1_taskloop.cpp |  51 +++++++
 ...ble_nonlexical_shared_fails_2_taskloop.cpp |  62 +++++++++
 ..._replayable_saved_stack_depth_taskloop.cpp | 130 ++++++++++++++++++
 14 files changed, 714 insertions(+), 29 deletions(-)
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_mixed_capture_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_recursive_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_recursive_frameid_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_recursive_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_works_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_1_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_2_taskloop.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth_taskloop.cpp

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index cc3f90248625c..95ac4b8aca572 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -5151,7 +5151,7 @@ void CGOpenMPRuntime::emitTaskLoopCall(
                                                      PrePostActionTy &) {
     llvm::Value *ThreadId = getThreadID(CGF, Loc);
     llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
-    std::array<llvm::Value *, 16> TGTaskLoopArgs;
+    std::array<llvm::Value *, 14> TGTaskLoopArgs;
 
     // This is all copy/pasted from below. Refactor!
     LValue LBLVal = CGF.EmitLValueForField(
@@ -5196,30 +5196,34 @@ void CGOpenMPRuntime::emitTaskLoopCall(
     TGTaskLoopArgs[1] = ThreadId;
     TGTaskLoopArgs[2] = TaskInitResult.NewTask;
     TGTaskLoopArgs[3] = TaskAllocArgs[0]; // TaskFlags
-    TGTaskLoopArgs[4] = TaskAllocArgs[1]; // KmpTaskTWithPrivatesTySize
-    TGTaskLoopArgs[5] = Shareds.emitRawPointer(CGF);
-    TGTaskLoopArgs[6] = TaskAllocArgs[2]; // SharedsSize
-    TGTaskLoopArgs[7] = IfVal;
-    TGTaskLoopArgs[8] = LBLVal.getPointer(CGF);
-    TGTaskLoopArgs[9] = UBLVal.getPointer(CGF);
-    TGTaskLoopArgs[10] = CGF.EmitLoadOfScalar(StLVal, Loc);
-    TGTaskLoopArgs[11] =
+    TGTaskLoopArgs[4] = IfVal;
+    TGTaskLoopArgs[5] = LBLVal.getPointer(CGF);
+    TGTaskLoopArgs[6] = UBLVal.getPointer(CGF);
+    TGTaskLoopArgs[7] = CGF.EmitLoadOfScalar(StLVal, Loc);
+    TGTaskLoopArgs[8] =
         llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0);
-    TGTaskLoopArgs[12] = llvm::ConstantInt::getSigned(
+    TGTaskLoopArgs[9] = llvm::ConstantInt::getSigned(
         CGF.IntTy, Data.Schedule.getPointer()
                        ? Data.Schedule.getInt() ? NumTasks : Grainsize
                        : NoSchedule);
-    TGTaskLoopArgs[13] =
+    TGTaskLoopArgs[10] =
         Data.Schedule.getPointer()
             ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
                                         /*isSigned=*/false)
             : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0);
-    TGTaskLoopArgs[14] =
+    TGTaskLoopArgs[11] =
         llvm::ConstantInt::getSigned(CGF.IntTy, Data.HasModifier ? 1 : 0);
-    TGTaskLoopArgs[15] = TaskInitResult.TaskDupFn
+    TGTaskLoopArgs[12] = TaskInitResult.TaskDupFn
                              ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
                                    TaskInitResult.TaskDupFn, CGF.VoidPtrTy)
                              : llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+    const auto *CS = cast<CapturedStmt>(D.getAssociatedStmt());
+    llvm::Function *RelocFn =
+        emitTaskRelocationFunction(CGM, Loc, *CS, CGF.CapturedStmtInfo, Data);
+    TGTaskLoopArgs[13] =
+        RelocFn ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(RelocFn,
+                                                                  CGM.VoidPtrTy)
+                : llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                             CGM.getModule(), OMPRTL___kmpc_taskgraph_taskloop),
                         TGTaskLoopArgs);
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index e32308df74cae..02e3e1f98e969 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -362,8 +362,8 @@ __OMP_RTL(__kmpc_taskgraph, false, Void, IdentPtr, Int32, VoidPtrPtr, SizeTy,
 __OMP_RTL(__kmpc_taskgraph_task, false, Int32, IdentPtr, Int32, VoidPtr, Int32,
           SizeTy, SizeTy, Int32, VoidPtr, VoidPtr)
 __OMP_RTL(__kmpc_taskgraph_taskloop, false, Int32, IdentPtr, Int32, VoidPtr,
-          Int32, SizeTy, VoidPtr, SizeTy, Int32, Int64Ptr, Int64Ptr, Int64,
-          Int32, Int32, Int64, Int32, VoidPtr)
+          Int32, Int32, Int64Ptr, Int64Ptr, Int64,
+          Int32, Int32, Int64, Int32, VoidPtr, VoidPtr)
 __OMP_RTL(__kmpc_taskgraph_taskwait, false, Void, IdentPtr, Int32, Int32,
           VoidPtr, Int32)
 __OMP_RTL(__kmpc_taskgraph_taskred_init, false, /* kmp_taskgroup */ VoidPtr,
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 9a96121d94d36..27b2399ddbc01 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -4514,10 +4514,9 @@ KMP_EXPORT kmp_uint32 __kmpc_taskgraph_task(
     kmp_depend_info_t *dep_list, kmp_task_relocate_t reloc);
 KMP_EXPORT kmp_uint32 __kmpc_taskgraph_taskloop(
     ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
-    size_t sizeof_kmp_task_t, void *shareds, size_t sizeof_shareds,
     kmp_int32 if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
     kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize,
-    kmp_int32 modifier, void *task_dup);
+    kmp_int32 modifier, void *task_dup, kmp_task_relocate_t reloc);
 KMP_EXPORT void __kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_int32 ndeps,
                                           kmp_depend_info_t *dep_list,
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 1b35a56f7eecd..08b0eda3dc08f 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -5074,7 +5074,8 @@ __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, kmp_uint64 *lb,
                       void *task_dup
 #if OMP_TASKGRAPH_EXPERIMENTAL
                       ,
-                      kmp_taskgraph_record_t *taskgraph_rec = nullptr
+                      kmp_taskgraph_record_t *taskgraph_rec = nullptr,
+                      kmp_task_relocate_t relocate = nullptr
 #endif
 ) {
   KMP_COUNT_BLOCK(OMP_TASKLOOP);
@@ -5170,6 +5171,7 @@ __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task, kmp_uint64 *lb,
         taskgroup->taskgraph.reduce_input = nullptr;
       }
       node->taskloop_task = true;
+      node->relocate = relocate;
       next_taskdata->owning_taskgraph = taskgraph_rec;
       // FIXME: These dependency fields might be back-filled by the as-yet
       // unimplemented task_iteration subsidiary directive.  We'll need a way
@@ -5462,7 +5464,8 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                            int modifier, void *task_dup
 #if OMP_TASKGRAPH_EXPERIMENTAL
                            ,
-                           kmp_taskgraph_record_t *taskgraph_rec = nullptr
+                           kmp_taskgraph_record_t *taskgraph_rec = nullptr,
+                           kmp_task_relocate_t relocate = nullptr
 #endif
 ) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
@@ -5596,7 +5599,7 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
 #if OMPT_SUPPORT
                           OMPT_GET_RETURN_ADDRESS(0),
 #endif
-                          task_dup, taskgraph_rec);
+                          task_dup, taskgraph_rec, relocate);
     // check if clause value next
     // Also require GOMP_taskloop to reduce to linear
     // (taskdata->td_flags.native)
@@ -6119,14 +6122,11 @@ void __kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid,
                               has_no_wait);
 }
 
-kmp_uint32 __kmpc_taskgraph_taskloop(ident_t *loc_ref, kmp_int32 gtid,
-                                     kmp_task_t *new_task, kmp_int32 flags,
-                                     size_t sizeof_kmp_task_t, void *shareds,
-                                     size_t sizeof_shareds, kmp_int32 if_val,
-                                     kmp_uint64 *lb, kmp_uint64 *ub,
-                                     kmp_int64 st, kmp_int32 nogroup,
-                                     kmp_int32 sched, kmp_uint64 grainsize,
-                                     kmp_int32 modifier, void *task_dup) {
+kmp_uint32 __kmpc_taskgraph_taskloop(
+    ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
+    kmp_int32 if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+    kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize,
+    kmp_int32 modifier, void *task_dup, kmp_task_relocate_t relocate) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgroup_t *taskgroup = thread->th.th_current_task->td_taskgroup;
   kmp_taskgraph_record_t *rec = __kmp_taskgraph_or_parent_recording(taskgroup);
@@ -6135,7 +6135,7 @@ kmp_uint32 __kmpc_taskgraph_taskloop(ident_t *loc_ref, kmp_int32 gtid,
     kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&rec->status);
     if (status == KMP_TDG_RECORDING)
       __kmp_taskloop(loc_ref, gtid, new_task, if_val, lb, ub, st, nogroup,
-                     sched, grainsize, modifier, task_dup, rec);
+                     sched, grainsize, modifier, task_dup, rec, relocate);
     else if (status == KMP_TDG_READY) {
 #ifdef DEBUG_TASKGRAPH
       fprintf(stderr, "non-taskgraph taskloop entry point for taskloop in "
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_mixed_capture_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_mixed_capture_taskloop.cpp
new file mode 100644
index 0000000000000..147e768dcc85c
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_mixed_capture_taskloop.cpp
@@ -0,0 +1,45 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int run_taskgraph_mixed_capture(int seed) {
+  int x = seed;
+  int y = seed * 2;
+  int fp = 7;
+  int res = 0;
+
+#pragma omp taskgraph graph_id(612)
+  {
+#pragma omp taskloop replayable num_tasks(8) shared(x, y) firstprivate(fp)     \
+    reduction(+ : res)
+    for (int i = 0; i < 16; ++i) {
+      res += x + y + fp + i;
+    }
+  }
+
+  return res;
+}
+
+int main() {
+  const int first = run_taskgraph_mixed_capture(1);
+  const int second = run_taskgraph_mixed_capture(100);
+
+  if (first != 280 || second != 5032) {
+    std::fprintf(stderr,
+                 "FAIL lexical mixed capture taskloop replay first=%d "
+                 "second=%d expected=280/5032\n",
+                 first, second);
+    return 1;
+  }
+
+  std::fprintf(
+      stderr, "PASS lexical mixed capture taskloop replay first=%d second=%d\n",
+      first, second);
+  return 0;
+}
+
+// CHECK: PASS lexical mixed capture taskloop replay first=280 second=5032
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_recursive_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_recursive_taskloop.cpp
new file mode 100644
index 0000000000000..ab23adf6ce477
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_recursive_taskloop.cpp
@@ -0,0 +1,89 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+struct Tracker {
+  static int Ctors;
+  static int Dtors;
+
+  int Value;
+
+  explicit Tracker(int V) : Value(V) { ++Ctors; }
+  ~Tracker() { ++Dtors; }
+
+  void bump(int Delta) { Value += Delta; }
+};
+
+int Tracker::Ctors = 0;
+int Tracker::Dtors = 0;
+
+__attribute__((noinline)) static int expected_recursive(int depth, int seed,
+                                                        int run_tag) {
+  int local = 16 * (seed + (depth + 1) * 5 + run_tag) + 120;
+  if (depth == 0)
+    return local;
+  return local + expected_recursive(depth - 1, seed + 9, run_tag);
+}
+
+__attribute__((noinline)) static int
+run_recursive_nontrivial(int depth, int seed, int run_tag) {
+  Tracker Obj(seed);
+  int res = 0;
+
+  int gid = 620 + depth;
+#pragma omp taskgraph graph_id(gid)
+  {
+#pragma omp taskloop replayable num_tasks(8) shared(Obj, depth, run_tag)       \
+    reduction(+ : res)
+    for (int i = 0; i < 16; ++i) {
+      res += Obj.Value + (depth + 1) * 5 + run_tag + i;
+    }
+  }
+
+  if (depth == 0)
+    return res;
+  return res + run_recursive_nontrivial(depth - 1, seed + 9, run_tag);
+}
+
+int main() {
+  const int depth = 3;
+  int total_actual = 0;
+  int total_expected = 0;
+
+  for (int run = 0; run < 3; ++run) {
+    const int seed = 100 * run + 1;
+    const int actual = run_recursive_nontrivial(depth, seed, run);
+    const int expected = expected_recursive(depth, seed, run);
+
+    if (actual != expected) {
+      std::fprintf(
+          stderr,
+          "FAIL recursive nontrivial taskloop run=%d actual=%d expected=%d\n",
+          run, actual, expected);
+      return 1;
+    }
+
+    total_actual += actual;
+    total_expected += expected;
+  }
+
+  if (Tracker::Ctors != Tracker::Dtors || Tracker::Ctors < 12) {
+    std::fprintf(stderr,
+                 "FAIL recursive nontrivial taskloop lifetime ctors=%d "
+                 "dtors=%d total=%d expected=%d\n",
+                 Tracker::Ctors, Tracker::Dtors, total_actual, total_expected);
+    return 1;
+  }
+
+  std::fprintf(stderr,
+               "PASS recursive nontrivial taskloop total=%d expected=%d "
+               "ctors=%d dtors=%d\n",
+               total_actual, total_expected, Tracker::Ctors, Tracker::Dtors);
+  return 0;
+}
+
+// CHECK: PASS recursive nontrivial taskloop total=
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_taskloop.cpp
new file mode 100644
index 0000000000000..9d4d3dd15b7d2
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_nontrivial_type_taskloop.cpp
@@ -0,0 +1,59 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+struct Tracker {
+  static int Ctors;
+  static int Dtors;
+
+  int Value;
+
+  explicit Tracker(int V) : Value(V) { ++Ctors; }
+  ~Tracker() { ++Dtors; }
+
+  void bump(int Delta) { Value += Delta; }
+};
+
+int Tracker::Ctors = 0;
+int Tracker::Dtors = 0;
+
+__attribute__((noinline)) static int run_taskgraph_nontrivial(int seed) {
+  Tracker Obj(seed);
+  int res = 0;
+
+#pragma omp taskgraph graph_id(614)
+  {
+#pragma omp taskloop replayable num_tasks(8) shared(Obj) reduction(+ : res)
+    for (int i = 0; i < 16; ++i) {
+      res += Obj.Value + i;
+    }
+  }
+
+  return res;
+}
+
+int main() {
+  const int first = run_taskgraph_nontrivial(1);
+  const int second = run_taskgraph_nontrivial(100);
+
+  if (first != 136 || second != 1720 || Tracker::Ctors < 2 ||
+      Tracker::Dtors < 2 || Tracker::Ctors != Tracker::Dtors) {
+    std::fprintf(stderr,
+                 "FAIL lexical nontrivial taskloop replay first=%d second=%d "
+                 "ctors=%d dtors=%d\n",
+                 first, second, Tracker::Ctors, Tracker::Dtors);
+    return 1;
+  }
+
+  std::fprintf(stderr,
+               "PASS lexical nontrivial taskloop replay first=%d second=%d "
+               "ctors=%d dtors=%d\n",
+               first, second, Tracker::Ctors, Tracker::Dtors);
+  return 0;
+}
+
+// CHECK: PASS lexical nontrivial taskloop replay first=136 second=1720
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_recursive_frameid_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_recursive_frameid_taskloop.cpp
new file mode 100644
index 0000000000000..fe093ac383d8d
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_recursive_frameid_taskloop.cpp
@@ -0,0 +1,86 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// XFAIL: *
+// clang-format on
+
+#include <cstdio>
+#include <cstdint>
+
+__attribute__((noinline)) static int expected_recursive(int depth, int seed,
+                                                        int run_tag) {
+  int value = 16 * (seed + (depth + 1) * 3 + run_tag) + 120;
+  if (depth == 0)
+    return value;
+  return value + expected_recursive(depth - 1, seed + 7, run_tag);
+}
+
+__attribute__((noinline)) static int run_recursive_frameid(int depth, int seed,
+                                                           int run_tag) {
+  int value = seed;
+  int *ptr = &value;
+  int *&ptr_ref = ptr;
+  int sum_delta = 0;
+  uintptr_t frame_gid = reinterpret_cast<uintptr_t>(__builtin_frame_address(0));
+
+  // Typically, if captured pointers refer to locations on the stack, that
+  // would not be safe for taskgraph record/replay because in general we
+  // cannot rewrite such pointers to point to the current (live) stack frame.
+  //
+  // This is one possible way around that though: we keep a taskgraph record
+  // per stack-depth, each of which may refer to the local stack frame.
+  //
+  // I probably wouldn't recommend use of this technique in production code.
+#pragma omp taskgraph graph_id(frame_gid)
+  {
+#pragma omp taskloop replayable num_tasks(8) shared(ptr_ref, depth, run_tag)   \
+    reduction(+ : sum_delta)
+    for (int i = 0; i < 16; ++i) {
+      int delta = (depth + 1) * 3 + run_tag + i;
+      __atomic_fetch_add(ptr_ref, delta, __ATOMIC_RELAXED);
+      sum_delta += delta;
+    }
+  }
+
+  int local = value * 17 + sum_delta;
+
+  if (depth == 0)
+    return local;
+  return local + run_recursive_frameid(depth - 1, seed + 7, run_tag);
+}
+
+int main() {
+  const int depth = 3;
+  int recorded_sum = 0;
+  int replayed_sum = 0;
+
+  for (int run = 0; run < 3; ++run) {
+    int seed = 100 * run + 1;
+    int val = run_recursive_frameid(depth, seed, run);
+    if (run == 0)
+      recorded_sum = val;
+    else
+      replayed_sum += val;
+  }
+
+  // With missing relocation for taskloop replay, recursive invocations that
+  // mutate through shared-block pointers are expected to diverge from the
+  // correct replay behavior.
+  const int expected_replayed = 2 * recorded_sum;
+  if (replayed_sum == expected_replayed) {
+    std::fprintf(stderr,
+                 "UNEXPECTED SUCCESS recursive pointer taskloop replay "
+                 "recorded=%d replayed_total=%d expected_total=%d\n",
+                 recorded_sum, replayed_sum, expected_replayed);
+    return 0;
+  }
+
+  std::fprintf(stderr,
+               "EXPECTED FAILURE recursive pointer taskloop replay recorded=%d "
+               "replayed_total=%d expected_total=%d\n",
+               recorded_sum, replayed_sum, expected_replayed);
+  return 1;
+}
+
+// CHECK: EXPECTED FAILURE recursive pointer taskloop replay
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_taskloop.cpp
new file mode 100644
index 0000000000000..ac5471505e539
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_pointer_taskloop.cpp
@@ -0,0 +1,44 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int run_taskgraph_pointer_shared(int seed) {
+  int value = seed;
+  int *ptr = &value;
+  int res = 0;
+
+#pragma omp taskgraph graph_id(613)
+  {
+#pragma omp taskloop replayable num_tasks(8) shared(ptr) reduction(+ : res)
+    for (int i = 0; i < 16; ++i) {
+      res += *ptr + i;
+    }
+  }
+
+  return res;
+}
+
+int main() {
+  const int first = run_taskgraph_pointer_shared(1);
+  const int second = run_taskgraph_pointer_shared(100);
+
+  if (first != 136 || second != 1720) {
+    std::fprintf(stderr,
+                 "FAIL lexical pointer shared taskloop replay first=%d "
+                 "second=%d expected=136/1720\n",
+                 first, second);
+    return 1;
+  }
+
+  std::fprintf(
+      stderr,
+      "PASS lexical pointer shared taskloop replay first=%d second=%d\n", first,
+      second);
+  return 0;
+}
+
+// CHECK: PASS lexical pointer shared taskloop replay first=136 second=1720
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_recursive_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_recursive_taskloop.cpp
new file mode 100644
index 0000000000000..32e3a6f3d9ba2
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_recursive_taskloop.cpp
@@ -0,0 +1,74 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// XFAIL: *
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int expected_recursive(int depth, int seed) {
+  int x = seed;
+  int sum_delta = 0;
+
+  for (int i = 0; i < 16; ++i) {
+    sum_delta += depth + i + 1;
+  }
+
+  x += sum_delta;
+  int local = x * 17 + sum_delta;
+
+  if (depth == 0)
+    return local;
+
+  return local + expected_recursive(depth - 1, seed + 10);
+}
+
+__attribute__((noinline)) static int run_taskgraph_recursive(int depth,
+                                                             int seed) {
+  int x = seed;
+  int *ptr = &x;
+  int sum_delta = 0;
+  int gid = 615;
+
+#pragma omp taskgraph graph_id(gid)
+  {
+#pragma omp taskloop replayable num_tasks(8) shared(ptr, depth)                \
+    reduction(+ : sum_delta)
+    for (int i = 0; i < 16; ++i) {
+      int delta = depth + i + 1;
+      __atomic_fetch_add(ptr, delta, __ATOMIC_RELAXED);
+      sum_delta += delta;
+    }
+  }
+
+  int local = x * 17 + sum_delta;
+
+  if (depth == 0)
+    return local;
+
+  return local + run_taskgraph_recursive(depth - 1, seed + 10);
+}
+
+int main() {
+  const int first = run_taskgraph_recursive(3, 1);
+  const int second = run_taskgraph_recursive(3, 100);
+  const int expected_first = expected_recursive(3, 1);
+  const int expected_second = expected_recursive(3, 100);
+
+  if (first == expected_first && second == expected_second) {
+    std::fprintf(stderr,
+                 "UNEXPECTED SUCCESS lexical recursive taskloop replay "
+                 "first=%d second=%d expected=%d/%d\n",
+                 first, second, expected_first, expected_second);
+    return 0;
+  }
+
+  std::fprintf(stderr,
+               "EXPECTED FAILURE lexical recursive taskloop replay first=%d "
+               "second=%d expected=%d/%d\n",
+               first, second, expected_first, expected_second);
+  return 1;
+}
+
+// CHECK: EXPECTED FAILURE lexical recursive taskloop replay
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_works_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_works_taskloop.cpp
new file mode 100644
index 0000000000000..beaaedfea77f3
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_lexical_shared_works_taskloop.cpp
@@ -0,0 +1,42 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int run_taskgraph_lexical(int seed) {
+  int x = seed;
+  int res = 0;
+
+#pragma omp taskgraph graph_id(611)
+  {
+#pragma omp taskloop replayable num_tasks(8) shared(x) reduction(+ : res)
+    for (int i = 0; i < 16; ++i) {
+      res += x + i;
+    }
+  }
+
+  return res;
+}
+
+int main() {
+  const int first = run_taskgraph_lexical(1);
+  const int second = run_taskgraph_lexical(100);
+
+  if (first != 136 || second != 1720) {
+    std::fprintf(stderr,
+                 "FAIL lexical shared taskloop replay first=%d second=%d "
+                 "expected=136/1720\n",
+                 first, second);
+    return 1;
+  }
+
+  std::fprintf(stderr,
+               "PASS lexical shared taskloop replay first=%d second=%d\n",
+               first, second);
+  return 0;
+}
+
+// CHECK: PASS lexical shared taskloop replay first=136 second=1720
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_1_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_1_taskloop.cpp
new file mode 100644
index 0000000000000..ee81bea27bda2
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_1_taskloop.cpp
@@ -0,0 +1,51 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %not --crash %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int emit_nonlexical_taskloop(int &x) {
+  int sum = 0;
+#pragma omp taskloop replayable num_tasks(8) shared(x) reduction(+ : sum)
+  for (int i = 0; i < 16; ++i) {
+    sum += x + i;
+  }
+  x += 5;
+  return sum;
+}
+
+__attribute__((noinline)) static int run_taskgraph_nonlexical(int seed) {
+  int x = seed;
+  int out = -1;
+
+#pragma omp taskgraph graph_id(631)
+  {
+    out = emit_nonlexical_taskloop(x);
+  }
+
+  return out;
+}
+
+int main() {
+  const int recorded = run_taskgraph_nonlexical(1);
+  const int replayed = run_taskgraph_nonlexical(100);
+
+  if (recorded == replayed) {
+    std::fprintf(stderr,
+                 "UNEXPECTED SUCCESS nonlexical taskloop replay recorded=%d "
+                 "replayed=%d\n",
+                 recorded, replayed);
+    return 0;
+  }
+
+  std::fprintf(
+      stderr,
+      "EXPECTED FAILURE nonlexical taskloop replay recorded=%d replayed=%d\n",
+      recorded, replayed);
+  return 1;
+}
+
+// CHECK: OMP: Error #302: Cannot locate captured shared variable reference for
+// taskgraph replay
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_2_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_2_taskloop.cpp
new file mode 100644
index 0000000000000..1c92ac85db2c1
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_nonlexical_shared_fails_2_taskloop.cpp
@@ -0,0 +1,62 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %not --crash %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+__attribute__((noinline)) static int emit_nonlexical_taskloop(int seed) {
+  int x = seed;
+  int sum = 0;
+
+#pragma omp taskloop replayable num_tasks(8) shared(x) reduction(+ : sum)
+  for (int i = 0; i < 16; ++i) {
+    sum += x + i;
+  }
+
+  return sum;
+}
+
+__attribute__((noinline)) static int run_taskgraph_nonlexical(int seed) {
+  int out;
+
+#pragma omp taskgraph graph_id(632)
+  {
+#pragma omp task shared(out)
+    {
+      out = emit_nonlexical_taskloop(seed);
+    }
+  }
+
+  return out;
+}
+
+int main() {
+  int out = emit_nonlexical_taskloop(50);
+  if (out != 920) {
+    std::fprintf(stderr,
+                 "UNEXPECTED FAILURE: taskloop outside taskgraph returned %d\n",
+                 out);
+  }
+
+  const int recorded = run_taskgraph_nonlexical(1);
+  const int replayed = run_taskgraph_nonlexical(100);
+
+  if (recorded == replayed) {
+    std::fprintf(stderr,
+                 "UNEXPECTED SUCCESS nonlexical taskloop replay recorded=%d "
+                 "replayed=%d\n",
+                 recorded, replayed);
+    return 0;
+  }
+
+  std::fprintf(
+      stderr,
+      "EXPECTED FAILURE nonlexical taskloop replay recorded=%d replayed=%d\n",
+      recorded, replayed);
+  return 1;
+}
+
+// CHECK: OMP: Error #302: Cannot locate captured shared variable reference for
+// taskgraph replay
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth_taskloop.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth_taskloop.cpp
new file mode 100644
index 0000000000000..5a5b8352f4925
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth_taskloop.cpp
@@ -0,0 +1,130 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+#include <cstdio>
+
+static volatile int StackSink = 0;
+// Aggregated across the taskloop body via #pragma omp atomic. Lives in
+// static storage so the address never moves across replays (this test is
+// about firstprivate(saved:), not reductions: reductions on replayable
+// taskgraphs are tracked separately).
+static volatile int Aggregate = 0;
+
+struct Payload {
+  int values[6];
+  int bias;
+};
+
+__attribute__((noinline)) static int evaluate_payload(const Payload &payload,
+                                                      int seed) {
+  return seed * payload.values[0] - payload.values[1] +
+         payload.values[2] * payload.values[3] - payload.bias +
+         payload.values[4] - payload.values[5];
+}
+
+__attribute__((noinline)) static void clobber_stack(int base) {
+  volatile int scratch[4096];
+
+  for (int i = 0; i < 4096; ++i)
+    scratch[i] = base + i;
+
+  StackSink += scratch[base & 63];
+}
+
+// The taskloop directive lives in a helper function called from inside the
+// taskgraph region, so the loop's firstprivate(saved:) captures originate
+// in a non-lexical surrounding scope. The taskloop relocation helper has
+// nothing to refresh at replay since the saved snapshots are sourced from
+// '.kmp_privates.t', not the shareds slots.
+__attribute__((noinline)) static void emit_replayable_taskloop(int seed) {
+  Payload payload{
+      {seed + 1, seed + 3, seed + 5, seed + 7, seed + 11, seed + 13},
+      seed * 17 + 19};
+
+#pragma omp taskloop replayable num_tasks(8) firstprivate(saved : payload, seed)
+  for (int i = 0; i < 16; ++i) {
+    int contribution = evaluate_payload(payload, seed + i);
+#pragma omp atomic
+    Aggregate += contribution;
+  }
+}
+
+__attribute__((noinline)) static int run_taskgraph(int seed) {
+  Aggregate = 0;
+
+#pragma omp taskgraph graph_id(633)
+  {
+    emit_replayable_taskloop(seed);
+  }
+
+  return Aggregate;
+}
+
+__attribute__((noinline)) static int call_with_depth(int seed, int depth) {
+  volatile int padding[128];
+
+  for (int i = 0; i < 128; ++i)
+    padding[i] = seed + depth + i;
+
+  StackSink += padding[(seed + depth) & 127];
+
+  if (depth == 0)
+    return run_taskgraph(seed);
+  return call_with_depth(seed, depth - 1);
+}
+
+__attribute__((noinline)) static int expected_result(int seed) {
+  Payload payload{
+      {seed + 1, seed + 3, seed + 5, seed + 7, seed + 11, seed + 13},
+      seed * 17 + 19};
+  int sum = 0;
+  for (int i = 0; i < 16; ++i)
+    sum += evaluate_payload(payload, seed + i);
+  return sum;
+}
+
+int main() {
+  constexpr int NumCalls = 4;
+  constexpr int Seeds[NumCalls] = {3, 17, 29, 41};
+  constexpr int Depths[NumCalls] = {0, 3, 1, 5};
+
+  int recorded = -1;
+  bool failed = false;
+
+#pragma omp parallel num_threads(4)
+  {
+#pragma omp single
+    {
+      recorded = call_with_depth(Seeds[0], Depths[0]);
+      if (recorded != expected_result(Seeds[0])) {
+        std::fprintf(stderr,
+                     "FAIL initial taskloop record got=%d expected=%d\n",
+                     recorded, expected_result(Seeds[0]));
+        failed = true;
+      }
+
+      for (int i = 1; i < NumCalls; ++i) {
+        clobber_stack(Seeds[i] * 1000);
+        const int replayed = call_with_depth(Seeds[i], Depths[i]);
+        if (replayed != recorded) {
+          std::fprintf(
+              stderr,
+              "FAIL taskloop replay %d depth=%d seed=%d got=%d expected=%d\n",
+              i, Depths[i], Seeds[i], replayed, recorded);
+          failed = true;
+        }
+      }
+    }
+  }
+
+  if (failed)
+    return 1;
+
+  std::fprintf(stderr, "PASS replayable taskloop saved stack result=%d\n",
+               recorded);
+  return 0;
+}
+
+// CHECK: PASS replayable taskloop saved stack result=

>From ba87a92e0186b147dbaaedfccf6f48d3b12ad9be Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 27 May 2026 07:51:15 -0500
Subject: [PATCH 21/24] [Clang][OpenMP] Parsing, serialisation and
 deserialisation for the 'saved' firstprivate modifier

OpenMP 6.0 adds a 'saved' modifier to the data-environment
attribute clauses.  On a 'firstprivate' clause it specifies that
the list item's value is captured when the surrounding construct
is recorded into a taskgraph, and that captured value is reused
(rather than re-evaluating the initialiser from the live original)
on every replay.  See OpenMP 6.0 [7.5.4], [6.4.7] and [14.3].

This patch implements the front-end plumbing for the modifier:
parsing, AST representation and AST printing, profile, template
instantiation and PCH serialisation, modelled directly on the
existing 'lastprivate(conditional: ...)' machinery.  No new
semantic restriction is added here (e.g. requiring a
taskgraph-nested or replayable task context), and codegen and the
runtime continue to treat 'saved' as a no-op for now -- those are
follow-up patches.

Specifically:

  - OPENMP_FIRSTPRIVATE_KIND(saved) is added in OpenMPKinds.def,
    with a matching OpenMPFirstprivateModifier enum and the
    appropriate parse/print helpers in OpenMPKinds.cpp.
  - OMPFirstprivateClause gains FPKind / FPKindLoc / ColonLoc
    fields and an updated Create() signature; OMPClausePrinter
    emits 'firstprivate(saved: ...)' when the modifier is set.
  - ParseOpenMP recognises '[modifier:]' inside a firstprivate
    clause when OpenMP >= 6.0.
  - SemaOpenMP::ActOnOpenMPFirstprivateClause now takes the
    modifier triple and diagnoses an unrecognised modifier via
    the existing err_omp_unexpected_clause_value path; the
    dispatcher in ActOnOpenMPVarListClause and the
    implicit-firstprivate emitter in ActOnOpenMPRegionEnd are
    updated accordingly.
  - TreeTransform forwards the modifier through template
    instantiation; ASTReader / ASTWriter round-trip the new
    fields.

A new lit test taskgraph_firstprivate_saved_ast_print.cpp covers
ast-print and a PCH emit -> include -> ast-print round-trip on
'omp task' and 'omp taskloop' uses of firstprivate(saved: ...).

Assisted-By: Claude Opus 4.7

Pull Request: https://github.com/llvm/llvm-project/pull/200406
---
 clang/include/clang/AST/OpenMPClause.h        | 38 +++++++++++++++--
 clang/include/clang/Basic/OpenMPKinds.def     |  7 ++++
 clang/include/clang/Basic/OpenMPKinds.h       |  7 ++++
 clang/include/clang/Sema/SemaOpenMP.h         |  8 ++--
 clang/lib/AST/OpenMPClause.cpp                | 22 ++++++----
 clang/lib/Basic/OpenMPKinds.cpp               | 17 +++++++-
 clang/lib/Parse/ParseOpenMP.cpp               | 13 ++++++
 clang/lib/Sema/SemaOpenMP.cpp                 | 30 ++++++++++----
 clang/lib/Sema/TreeTransform.h                | 10 +++--
 clang/lib/Serialization/ASTReader.cpp         |  3 ++
 clang/lib/Serialization/ASTWriter.cpp         |  3 ++
 ...taskgraph_firstprivate_saved_ast_print.cpp | 41 +++++++++++++++++++
 12 files changed, 172 insertions(+), 27 deletions(-)
 create mode 100644 clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp

diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h
index 34c0c47bfe710..30c9688f2fecf 100644
--- a/clang/include/clang/AST/OpenMPClause.h
+++ b/clang/include/clang/AST/OpenMPClause.h
@@ -3690,17 +3690,31 @@ class OMPFirstprivateClause final
   friend OMPVarListClause;
   friend TrailingObjects;
 
+  /// Optional firstprivate modifier (e.g. 'saved'), if specified by the user.
+  OpenMPFirstprivateModifier FPKind = OMPC_FIRSTPRIVATE_unknown;
+  /// Optional location of the firstprivate modifier, if specified.
+  SourceLocation FPKindLoc;
+  /// Optional location of the ':' separating the modifier from the list.
+  SourceLocation ColonLoc;
+
   /// Build clause with number of variables \a N.
   ///
   /// \param StartLoc Starting location of the clause.
   /// \param LParenLoc Location of '('.
   /// \param EndLoc Ending location of the clause.
+  /// \param FPKind Firstprivate modifier (e.g. 'saved').
+  /// \param FPKindLoc Location of the firstprivate modifier, if any.
+  /// \param ColonLoc Location of the ':' symbol, if a modifier is used.
   /// \param N Number of the variables in the clause.
   OMPFirstprivateClause(SourceLocation StartLoc, SourceLocation LParenLoc,
-                        SourceLocation EndLoc, unsigned N)
+                        SourceLocation EndLoc,
+                        OpenMPFirstprivateModifier FPKind,
+                        SourceLocation FPKindLoc, SourceLocation ColonLoc,
+                        unsigned N)
       : OMPVarListClause<OMPFirstprivateClause>(llvm::omp::OMPC_firstprivate,
                                                 StartLoc, LParenLoc, EndLoc, N),
-        OMPClauseWithPreInit(this) {}
+        OMPClauseWithPreInit(this), FPKind(FPKind), FPKindLoc(FPKindLoc),
+        ColonLoc(ColonLoc) {}
 
   /// Build an empty clause.
   ///
@@ -3711,6 +3725,13 @@ class OMPFirstprivateClause final
             SourceLocation(), N),
         OMPClauseWithPreInit(this) {}
 
+  /// Sets the firstprivate modifier kind.
+  void setKind(OpenMPFirstprivateModifier Kind) { FPKind = Kind; }
+  /// Sets the location of the firstprivate modifier.
+  void setKindLoc(SourceLocation Loc) { FPKindLoc = Loc; }
+  /// Sets the location of the ':' separator.
+  void setColonLoc(SourceLocation Loc) { ColonLoc = Loc; }
+
   /// Sets the list of references to private copies with initializers for
   /// new private variables.
   /// \param VL List of references.
@@ -3751,12 +3772,16 @@ class OMPFirstprivateClause final
   /// \param InitVL List of references to auto generated variables used for
   /// initialization of a single array element. Used if firstprivate variable is
   /// of array type.
+  /// \param FPKind Firstprivate modifier, e.g. 'saved'.
+  /// \param FPKindLoc Location of the firstprivate modifier, if any.
+  /// \param ColonLoc Location of the ':' symbol, if a modifier is used.
   /// \param PreInit Statement that must be executed before entering the OpenMP
   /// region with this clause.
   static OMPFirstprivateClause *
   Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
          SourceLocation EndLoc, ArrayRef<Expr *> VL, ArrayRef<Expr *> PrivateVL,
-         ArrayRef<Expr *> InitVL, Stmt *PreInit);
+         ArrayRef<Expr *> InitVL, OpenMPFirstprivateModifier FPKind,
+         SourceLocation FPKindLoc, SourceLocation ColonLoc, Stmt *PreInit);
 
   /// Creates an empty clause with the place for \a N variables.
   ///
@@ -3764,6 +3789,13 @@ class OMPFirstprivateClause final
   /// \param N The number of variables.
   static OMPFirstprivateClause *CreateEmpty(const ASTContext &C, unsigned N);
 
+  /// Firstprivate modifier (e.g. 'saved' if specified, otherwise 'unknown').
+  OpenMPFirstprivateModifier getKind() const { return FPKind; }
+  /// Returns the location of the firstprivate modifier, if any.
+  SourceLocation getKindLoc() const { return FPKindLoc; }
+  /// Returns the location of the ':' symbol, if any.
+  SourceLocation getColonLoc() const { return ColonLoc; }
+
   using private_copies_iterator = MutableArrayRef<Expr *>::iterator;
   using private_copies_const_iterator = ArrayRef<const Expr *>::iterator;
   using private_copies_range = llvm::iterator_range<private_copies_iterator>;
diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def
index 3b79b940a1253..99ed68f8bce24 100644
--- a/clang/include/clang/Basic/OpenMPKinds.def
+++ b/clang/include/clang/Basic/OpenMPKinds.def
@@ -59,6 +59,9 @@
 #ifndef OPENMP_DEVICE_TYPE_KIND
 #define OPENMP_DEVICE_TYPE_KIND(Name)
 #endif
+#ifndef OPENMP_FIRSTPRIVATE_KIND
+#define OPENMP_FIRSTPRIVATE_KIND(Name)
+#endif
 #ifndef OPENMP_LASTPRIVATE_KIND
 #define OPENMP_LASTPRIVATE_KIND(Name)
 #endif
@@ -224,6 +227,9 @@ OPENMP_DEVICE_TYPE_KIND(host)
 OPENMP_DEVICE_TYPE_KIND(nohost)
 OPENMP_DEVICE_TYPE_KIND(any)
 
+// Type of the 'firstprivate' clause.
+OPENMP_FIRSTPRIVATE_KIND(saved)
+
 // Type of the 'lastprivate' clause.
 OPENMP_LASTPRIVATE_KIND(conditional)
 
@@ -304,6 +310,7 @@ OPENMP_USE_DEVICE_PTR_FALLBACK_MODIFIER(fb_preserve)
 #undef OPENMP_ORIGINAL_SHARING_MODIFIER
 #undef OPENMP_ORDER_KIND
 #undef OPENMP_ORDER_MODIFIER
+#undef OPENMP_FIRSTPRIVATE_KIND
 #undef OPENMP_LASTPRIVATE_KIND
 #undef OPENMP_DEVICE_TYPE_KIND
 #undef OPENMP_LINEAR_KIND
diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h
index 4e83bfcd0128b..47b4ef545b2c2 100644
--- a/clang/include/clang/Basic/OpenMPKinds.h
+++ b/clang/include/clang/Basic/OpenMPKinds.h
@@ -161,6 +161,13 @@ enum OpenMPDeviceType {
   OMPC_DEVICE_TYPE_unknown
 };
 
+/// OpenMP 'firstprivate' clause modifier.
+enum OpenMPFirstprivateModifier {
+#define OPENMP_FIRSTPRIVATE_KIND(Name) OMPC_FIRSTPRIVATE_##Name,
+#include "clang/Basic/OpenMPKinds.def"
+  OMPC_FIRSTPRIVATE_unknown,
+};
+
 /// OpenMP 'lastprivate' clause modifier.
 enum OpenMPLastprivateModifier {
 #define OPENMP_LASTPRIVATE_KIND(Name) OMPC_LASTPRIVATE_##Name,
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
index 98405b871641a..54f6a5dcaca24 100644
--- a/clang/include/clang/Sema/SemaOpenMP.h
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -1269,10 +1269,10 @@ class SemaOpenMP : public SemaBase {
                                       SourceLocation LParenLoc,
                                       SourceLocation EndLoc);
   /// Called on well-formed 'firstprivate' clause.
-  OMPClause *ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
-                                           SourceLocation StartLoc,
-                                           SourceLocation LParenLoc,
-                                           SourceLocation EndLoc);
+  OMPClause *ActOnOpenMPFirstprivateClause(
+      ArrayRef<Expr *> VarList, OpenMPFirstprivateModifier FPKind,
+      SourceLocation FPKindLoc, SourceLocation ColonLoc,
+      SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc);
   /// Called on well-formed 'lastprivate' clause.
   OMPClause *ActOnOpenMPLastprivateClause(
       ArrayRef<Expr *> VarList, OpenMPLastprivateModifier LPKind,
diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp
index 7d2b4aa64a1df..de82e23878a25 100644
--- a/clang/lib/AST/OpenMPClause.cpp
+++ b/clang/lib/AST/OpenMPClause.cpp
@@ -493,14 +493,14 @@ void OMPFirstprivateClause::setInits(ArrayRef<Expr *> VL) {
   llvm::copy(VL, getPrivateCopies().end());
 }
 
-OMPFirstprivateClause *
-OMPFirstprivateClause::Create(const ASTContext &C, SourceLocation StartLoc,
-                              SourceLocation LParenLoc, SourceLocation EndLoc,
-                              ArrayRef<Expr *> VL, ArrayRef<Expr *> PrivateVL,
-                              ArrayRef<Expr *> InitVL, Stmt *PreInit) {
+OMPFirstprivateClause *OMPFirstprivateClause::Create(
+    const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc,
+    SourceLocation EndLoc, ArrayRef<Expr *> VL, ArrayRef<Expr *> PrivateVL,
+    ArrayRef<Expr *> InitVL, OpenMPFirstprivateModifier FPKind,
+    SourceLocation FPKindLoc, SourceLocation ColonLoc, Stmt *PreInit) {
   void *Mem = C.Allocate(totalSizeToAlloc<Expr *>(3 * VL.size()));
-  OMPFirstprivateClause *Clause =
-      new (Mem) OMPFirstprivateClause(StartLoc, LParenLoc, EndLoc, VL.size());
+  OMPFirstprivateClause *Clause = new (Mem) OMPFirstprivateClause(
+      StartLoc, LParenLoc, EndLoc, FPKind, FPKindLoc, ColonLoc, VL.size());
   Clause->setVarRefs(VL);
   Clause->setPrivateCopies(PrivateVL);
   Clause->setInits(InitVL);
@@ -2563,7 +2563,13 @@ void OMPClausePrinter::VisitOMPPrivateClause(OMPPrivateClause *Node) {
 void OMPClausePrinter::VisitOMPFirstprivateClause(OMPFirstprivateClause *Node) {
   if (!Node->varlist_empty()) {
     OS << "firstprivate";
-    VisitOMPClauseList(Node, '(');
+    OpenMPFirstprivateModifier FPKind = Node->getKind();
+    if (FPKind != OMPC_FIRSTPRIVATE_unknown) {
+      OS << "("
+         << getOpenMPSimpleClauseTypeName(OMPC_firstprivate, Node->getKind())
+         << ":";
+    }
+    VisitOMPClauseList(Node, FPKind == OMPC_FIRSTPRIVATE_unknown ? '(' : ' ');
     OS << ")";
   }
 }
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp
index 7cca74ffe711d..e961dc550d152 100644
--- a/clang/lib/Basic/OpenMPKinds.cpp
+++ b/clang/lib/Basic/OpenMPKinds.cpp
@@ -149,6 +149,11 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
 #define OPENMP_SEVERITY_KIND(Name) .Case(#Name, OMPC_SEVERITY_##Name)
 #include "clang/Basic/OpenMPKinds.def"
         .Default(OMPC_SEVERITY_unknown);
+  case OMPC_firstprivate:
+    return llvm::StringSwitch<OpenMPFirstprivateModifier>(Str)
+#define OPENMP_FIRSTPRIVATE_KIND(Name) .Case(#Name, OMPC_FIRSTPRIVATE_##Name)
+#include "clang/Basic/OpenMPKinds.def"
+        .Default(OMPC_FIRSTPRIVATE_unknown);
   case OMPC_lastprivate:
     return llvm::StringSwitch<OpenMPLastprivateModifier>(Str)
 #define OPENMP_LASTPRIVATE_KIND(Name) .Case(#Name, OMPC_LASTPRIVATE_##Name)
@@ -261,7 +266,6 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str,
   case OMPC_allocator:
   case OMPC_collapse:
   case OMPC_private:
-  case OMPC_firstprivate:
   case OMPC_shared:
   case OMPC_task_reduction:
   case OMPC_in_reduction:
@@ -477,6 +481,16 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
 #include "clang/Basic/OpenMPKinds.def"
     }
     llvm_unreachable("Invalid OpenMP 'severity' clause type");
+  case OMPC_firstprivate:
+    switch (Type) {
+    case OMPC_FIRSTPRIVATE_unknown:
+      return "unknown";
+#define OPENMP_FIRSTPRIVATE_KIND(Name)                                         \
+  case OMPC_FIRSTPRIVATE_##Name:                                               \
+    return #Name;
+#include "clang/Basic/OpenMPKinds.def"
+    }
+    llvm_unreachable("Invalid OpenMP 'firstprivate' clause type");
   case OMPC_lastprivate:
     switch (Type) {
     case OMPC_LASTPRIVATE_unknown:
@@ -643,7 +657,6 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind,
   case OMPC_allocator:
   case OMPC_collapse:
   case OMPC_private:
-  case OMPC_firstprivate:
   case OMPC_shared:
   case OMPC_task_reduction:
   case OMPC_in_reduction:
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 1338001b73b30..91b0f2e739f77 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -4939,6 +4939,19 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
             << "linear-modifier(list)" << getOpenMPClauseName(Kind)
             << "linear(list: [linear-modifier,] step(step-size))";
     }
+  } else if (Kind == OMPC_firstprivate) {
+    // Try to parse modifier if any.
+    Data.ExtraModifier = OMPC_FIRSTPRIVATE_unknown;
+    // The 'saved' modifier is OpenMP 6.0+ only.
+    if (getLangOpts().OpenMP >= 60 && Tok.is(tok::identifier) &&
+        PP.LookAhead(0).is(tok::colon)) {
+      Data.ExtraModifier =
+          getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok), getLangOpts());
+      Data.ExtraModifierLoc = Tok.getLocation();
+      ConsumeToken();
+      assert(Tok.is(tok::colon) && "Expected colon.");
+      Data.ColonLoc = ConsumeToken();
+    }
   } else if (Kind == OMPC_lastprivate) {
     // Try to parse modifier if any.
     Data.ExtraModifier = OMPC_LASTPRIVATE_unknown;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index c1bf6671cfdec..51a5c3522a5db 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -6378,8 +6378,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective(
     }
     if (!ImpInfo.Firstprivates.empty()) {
       if (OMPClause *Implicit = ActOnOpenMPFirstprivateClause(
-              ImpInfo.Firstprivates.getArrayRef(), SourceLocation(),
-              SourceLocation(), SourceLocation())) {
+              ImpInfo.Firstprivates.getArrayRef(), OMPC_FIRSTPRIVATE_unknown,
+              /*FPKindLoc=*/SourceLocation(), /*ColonLoc=*/SourceLocation(),
+              /*StartLoc=*/SourceLocation(), /*LParenLoc=*/SourceLocation(),
+              /*EndLoc=*/SourceLocation())) {
         ClausesWithImplicit.push_back(Implicit);
         ErrorFound = cast<OMPFirstprivateClause>(Implicit)->varlist_size() !=
                      ImpInfo.Firstprivates.size();
@@ -19248,7 +19250,11 @@ OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind,
     Res = ActOnOpenMPPrivateClause(VarList, StartLoc, LParenLoc, EndLoc);
     break;
   case OMPC_firstprivate:
-    Res = ActOnOpenMPFirstprivateClause(VarList, StartLoc, LParenLoc, EndLoc);
+    assert(0 <= ExtraModifier && ExtraModifier <= OMPC_FIRSTPRIVATE_unknown &&
+           "Unexpected firstprivate modifier.");
+    Res = ActOnOpenMPFirstprivateClause(
+        VarList, static_cast<OpenMPFirstprivateModifier>(ExtraModifier),
+        ExtraModifierLoc, ColonLoc, StartLoc, LParenLoc, EndLoc);
     break;
   case OMPC_lastprivate:
     assert(0 <= ExtraModifier && ExtraModifier <= OMPC_LASTPRIVATE_unknown &&
@@ -19629,10 +19635,19 @@ OMPClause *SemaOpenMP::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList,
                                   Vars, PrivateCopies);
 }
 
-OMPClause *SemaOpenMP::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
-                                                     SourceLocation StartLoc,
-                                                     SourceLocation LParenLoc,
-                                                     SourceLocation EndLoc) {
+OMPClause *SemaOpenMP::ActOnOpenMPFirstprivateClause(
+    ArrayRef<Expr *> VarList, OpenMPFirstprivateModifier FPKind,
+    SourceLocation FPKindLoc, SourceLocation ColonLoc, SourceLocation StartLoc,
+    SourceLocation LParenLoc, SourceLocation EndLoc) {
+  if (FPKind == OMPC_FIRSTPRIVATE_unknown && FPKindLoc.isValid()) {
+    assert(ColonLoc.isValid() && "Colon location must be valid.");
+    Diag(FPKindLoc, diag::err_omp_unexpected_clause_value)
+        << getListOfPossibleValues(OMPC_firstprivate, /*First=*/0,
+                                   /*Last=*/OMPC_FIRSTPRIVATE_unknown)
+        << getOpenMPClauseNameForDiag(OMPC_firstprivate);
+    return nullptr;
+  }
+
   SmallVector<Expr *, 8> Vars;
   SmallVector<Expr *, 8> PrivateCopies;
   SmallVector<Expr *, 8> Inits;
@@ -19918,6 +19933,7 @@ OMPClause *SemaOpenMP::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList,
 
   return OMPFirstprivateClause::Create(
       getASTContext(), StartLoc, LParenLoc, EndLoc, Vars, PrivateCopies, Inits,
+      FPKind, FPKindLoc, ColonLoc,
       buildPreInits(getASTContext(), ExprCaptures));
 }
 
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 07812bd8074f9..e0ea135fcf557 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -1932,11 +1932,14 @@ class TreeTransform {
   /// By default, performs semantic analysis to build the new OpenMP clause.
   /// Subclasses may override this routine to provide different behavior.
   OMPClause *RebuildOMPFirstprivateClause(ArrayRef<Expr *> VarList,
+                                          OpenMPFirstprivateModifier FPKind,
+                                          SourceLocation FPKindLoc,
+                                          SourceLocation ColonLoc,
                                           SourceLocation StartLoc,
                                           SourceLocation LParenLoc,
                                           SourceLocation EndLoc) {
-    return getSema().OpenMP().ActOnOpenMPFirstprivateClause(VarList, StartLoc,
-                                                            LParenLoc, EndLoc);
+    return getSema().OpenMP().ActOnOpenMPFirstprivateClause(
+        VarList, FPKind, FPKindLoc, ColonLoc, StartLoc, LParenLoc, EndLoc);
   }
 
   /// Build a new OpenMP 'lastprivate' clause.
@@ -11220,7 +11223,8 @@ OMPClause *TreeTransform<Derived>::TransformOMPFirstprivateClause(
     Vars.push_back(EVar.get());
   }
   return getDerived().RebuildOMPFirstprivateClause(
-      Vars, C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
+      Vars, C->getKind(), C->getKindLoc(), C->getColonLoc(), C->getBeginLoc(),
+      C->getLParenLoc(), C->getEndLoc());
 }
 
 template <typename Derived>
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index a1f53a2c742c2..f681f6d47c7e7 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -12196,6 +12196,9 @@ void OMPClauseReader::VisitOMPPrivateClause(OMPPrivateClause *C) {
 void OMPClauseReader::VisitOMPFirstprivateClause(OMPFirstprivateClause *C) {
   VisitOMPClauseWithPreInit(C);
   C->setLParenLoc(Record.readSourceLocation());
+  C->setKind(Record.readEnum<OpenMPFirstprivateModifier>());
+  C->setKindLoc(Record.readSourceLocation());
+  C->setColonLoc(Record.readSourceLocation());
   unsigned NumVars = C->varlist_size();
   SmallVector<Expr *, 16> Vars;
   Vars.reserve(NumVars);
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 20e981b9da3b5..faf59d25a87c4 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -8308,6 +8308,9 @@ void OMPClauseWriter::VisitOMPFirstprivateClause(OMPFirstprivateClause *C) {
   Record.push_back(C->varlist_size());
   VisitOMPClauseWithPreInit(C);
   Record.AddSourceLocation(C->getLParenLoc());
+  Record.writeEnum(C->getKind());
+  Record.AddSourceLocation(C->getKindLoc());
+  Record.AddSourceLocation(C->getColonLoc());
   for (auto *VE : C->varlist()) {
     Record.AddStmt(VE);
   }
diff --git a/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp b/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp
new file mode 100644
index 0000000000000..faf6b7a2936ae
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp
@@ -0,0 +1,41 @@
+// Check no warnings/errors
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Check unparsing
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -ast-print %s | FileCheck %s
+
+// Check same results after serialization round-trip
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -emit-pch -o %t %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -include-pch %t -ast-print %s | FileCheck %s
+
+#ifndef HEADER
+#define HEADER
+
+void firstprivate_saved() {
+  int a = 1;
+  int b = 2;
+  int c = 3;
+
+  // CHECK: #pragma omp task firstprivate(a)
+  #pragma omp task firstprivate(a)
+  { (void)a; }
+
+  // CHECK: #pragma omp task firstprivate(saved: a)
+  #pragma omp task firstprivate(saved: a)
+  { (void)a; }
+
+  // CHECK: #pragma omp task firstprivate(saved: a,b,c)
+  #pragma omp task firstprivate(saved: a, b, c)
+  { (void)a; (void)b; (void)c; }
+
+  // CHECK: #pragma omp task firstprivate(saved: a) shared(b)
+  #pragma omp task firstprivate(saved: a) shared(b)
+  { (void)a; (void)b; }
+
+  // CHECK: #pragma omp taskloop firstprivate(saved: a)
+  #pragma omp taskloop firstprivate(saved: a)
+  for (int i = 0; i < 4; ++i) (void)a;
+}
+
+#endif

>From 9e5e93eff10405c7587274b5a289e3f3e8ed4131 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 27 May 2026 08:03:40 -0500
Subject: [PATCH 22/24] [Clang][OpenMP] Restrict 'saved' firstprivate modifier
 to admissible directives

Per OpenMP 6.0 [7.2], the 'saved' modifier on a data-environment
attribute clause has effect only when that clause appears on a
replayable construct.  Per [14.3] and [14.6], the only directives
that admit the 'replayable' clause are: 'target',
'target enter data', 'target exit data', 'target update', 'task',
'taskloop' and 'taskwait'.  Of those seven, only 'target', 'task'
and 'taskloop' also admit a 'firstprivate' clause; the remaining
four cannot syntactically carry 'firstprivate' and need no extra
sema work.  A directive outside the admitted set can never make
'firstprivate(saved: ...)' meaningful.

We do not try to flag every dead-modifier case statically:
'saved' on a directive in the admitted set but without
'replayable' (and without lexical nesting in a taskgraph) is
well-formed per [7.2] and silently has no effect.  Detecting that
would require inspecting the full clause list, and any resulting
diagnostic would become stale as soon as the user adds a
'replayable' clause or moves the construct inside a taskgraph.

What we can and do diagnose is the case where the host directive
itself can never be a replayable construct: 'saved' on the
firstprivate clause of a 'parallel', 'for', 'sections', 'single',
'distribute', 'teams', etc. directive has no path to replay
semantics at all.
Reject that in ActOnOpenMPFirstprivateClause when CurDir is
neither an OpenMP tasking directive
(isOpenMPTaskingDirective, which covers OMPD_task and every
OMPD_*taskloop* variant) nor an OpenMP target execution directive
(isOpenMPTargetExecutionDirective, which covers OMPD_target and
every combined directive that has OMPD_target as a leaf
construct), with a new dedicated diagnostic
err_omp_firstprivate_saved_wrong_directive that points at the
modifier location and lists the three admissible directives.

The existing ast-print test is extended to cover these cases, and other
new tests have been added.

Assisted-By: Claude Opus 4.7

Pull Request: https://github.com/llvm/llvm-project/pull/200407
---
 .../clang/Basic/DiagnosticSemaKinds.td        |   3 +
 clang/lib/Sema/SemaOpenMP.cpp                 |  14 +++
 ...taskgraph_firstprivate_saved_ast_print.cpp |  52 ++++++--
 .../taskgraph_firstprivate_saved_messages.cpp | 115 ++++++++++++++++++
 4 files changed, 177 insertions(+), 7 deletions(-)
 create mode 100644 clang/test/OpenMP/taskgraph_firstprivate_saved_messages.cpp

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index fd442a7b93492..ce6e7f2182d13 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -12646,6 +12646,9 @@ def err_omp_one_defaultmap_each_category: Error<
 def err_omp_lastprivate_conditional_non_scalar : Error<
   "expected list item of scalar type in 'lastprivate' clause with 'conditional' modifier"
   >;
+def err_omp_firstprivate_saved_wrong_directive : Error<
+  "'saved' modifier on 'firstprivate' clause is only allowed on a 'task', "
+  "'taskloop', or 'target' construct">;
 def err_omp_flush_order_clause_and_list : Error<
   "'flush' directive with memory order clause '%0' cannot have the list">;
 def note_omp_flush_order_clause_here : Note<
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 51a5c3522a5db..bf888892f8aab 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -19648,6 +19648,20 @@ OMPClause *SemaOpenMP::ActOnOpenMPFirstprivateClause(
     return nullptr;
   }
 
+  // OpenMP 6.0 [7.2]: the 'saved' modifier has effect only on a clause
+  // that appears on a replayable construct.  Per [14.6] the directives
+  // that admit a 'replayable' clause and that also admit 'firstprivate'
+  // are 'task', 'taskloop', and 'target' (the other replayable-eligible
+  // directives -- 'target_enter_data' etc. -- do not admit
+  // 'firstprivate' at all).  Reject 'saved' on any other directive, as
+  // it can never become a replayable construct.
+  OpenMPDirectiveKind CurDir = DSAStack->getCurrentDirective();
+  if (FPKind == OMPC_FIRSTPRIVATE_saved && !isOpenMPTaskingDirective(CurDir) &&
+      !isOpenMPTargetExecutionDirective(CurDir)) {
+    Diag(FPKindLoc, diag::err_omp_firstprivate_saved_wrong_directive);
+    return nullptr;
+  }
+
   SmallVector<Expr *, 8> Vars;
   SmallVector<Expr *, 8> PrivateCopies;
   SmallVector<Expr *, 8> Inits;
diff --git a/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp b/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp
index faf6b7a2936ae..df358df14c82f 100644
--- a/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp
+++ b/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp
@@ -21,21 +21,59 @@ void firstprivate_saved() {
   #pragma omp task firstprivate(a)
   { (void)a; }
 
+  // 'saved' on an omp task lexically inside a taskgraph.
+  // CHECK: #pragma omp taskgraph
   // CHECK: #pragma omp task firstprivate(saved: a)
-  #pragma omp task firstprivate(saved: a)
-  { (void)a; }
+  #pragma omp taskgraph
+  {
+    #pragma omp task firstprivate(saved: a)
+    { (void)a; }
+  }
 
+  // Multiple variables.
+  // CHECK: #pragma omp taskgraph
   // CHECK: #pragma omp task firstprivate(saved: a,b,c)
-  #pragma omp task firstprivate(saved: a, b, c)
-  { (void)a; (void)b; (void)c; }
+  #pragma omp taskgraph
+  {
+    #pragma omp task firstprivate(saved: a, b, c)
+    { (void)a; (void)b; (void)c; }
+  }
 
+  // Mixed with another clause.
+  // CHECK: #pragma omp taskgraph
   // CHECK: #pragma omp task firstprivate(saved: a) shared(b)
-  #pragma omp task firstprivate(saved: a) shared(b)
-  { (void)a; (void)b; }
+  #pragma omp taskgraph
+  {
+    #pragma omp task firstprivate(saved: a) shared(b)
+    { (void)a; (void)b; }
+  }
 
+  // 'saved' on an omp taskloop lexically inside a taskgraph.
+  // CHECK: #pragma omp taskgraph
   // CHECK: #pragma omp taskloop firstprivate(saved: a)
-  #pragma omp taskloop firstprivate(saved: a)
+  #pragma omp taskgraph
+  {
+    #pragma omp taskloop firstprivate(saved: a)
+    for (int i = 0; i < 4; ++i) (void)a;
+  }
+
+  // 'saved' on a replayable omp task outside any taskgraph - also legal.
+  // CHECK: #pragma omp task replayable firstprivate(saved: a)
+  #pragma omp task replayable firstprivate(saved: a)
+  { (void)a; }
+
+  // 'saved' on a replayable omp taskloop outside any taskgraph - also legal.
+  // CHECK: #pragma omp taskloop replayable firstprivate(saved: a)
+  #pragma omp taskloop replayable firstprivate(saved: a)
   for (int i = 0; i < 4; ++i) (void)a;
+
+  // 'saved' on a task that is not lexically nested in a taskgraph (the
+  // runtime use case is dynamic nesting, via a call from a taskgraph region)
+  // - we accept any task/taskloop construct since a static check cannot rule
+  // out dynamic nesting.
+  // CHECK: #pragma omp task firstprivate(saved: a)
+  #pragma omp task firstprivate(saved: a)
+  { (void)a; }
 }
 
 #endif
diff --git a/clang/test/OpenMP/taskgraph_firstprivate_saved_messages.cpp b/clang/test/OpenMP/taskgraph_firstprivate_saved_messages.cpp
new file mode 100644
index 0000000000000..5197d16cd0421
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_firstprivate_saved_messages.cpp
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=60 -fsyntax-only -verify %s
+
+// Tests the OpenMP 6.0 'saved' modifier on the 'firstprivate' clause.  The
+// modifier is meaningful only on 'task', 'taskloop' and 'target' constructs
+// (the ones that can participate in taskgraph replay).  Every other
+// directive that admits a 'firstprivate' clause must reject it.
+
+void unknown_modifier() {
+  int a = 0;
+  // The diagnostic comes from the generic "expected <list> in OpenMP clause"
+  // path and enumerates the legal modifier names ('saved' in OpenMP 6.0).
+  #pragma omp task firstprivate(bogus: a) // expected-error {{expected 'saved' in OpenMP clause 'firstprivate'}}
+  { (void)a; }
+}
+
+void rejected_on_non_tasking_constructs() {
+  int a = 0;
+  int b[8];
+
+  // parallel
+  #pragma omp parallel firstprivate(saved: a) // expected-error {{'saved' modifier on 'firstprivate' clause is only allowed on a 'task', 'taskloop', or 'target' construct}}
+  { (void)a; }
+
+  // for (worksharing)
+  #pragma omp parallel
+  {
+    #pragma omp for firstprivate(saved: a) // expected-error {{'saved' modifier on 'firstprivate' clause is only allowed on a 'task', 'taskloop', or 'target' construct}}
+    for (int i = 0; i < 4; ++i) (void)a;
+  }
+
+  // sections
+  #pragma omp parallel
+  {
+    #pragma omp sections firstprivate(saved: a) // expected-error {{'saved' modifier on 'firstprivate' clause is only allowed on a 'task', 'taskloop', or 'target' construct}}
+    {
+      (void)a;
+    }
+  }
+
+  // single
+  #pragma omp parallel
+  {
+    #pragma omp single firstprivate(saved: a) // expected-error {{'saved' modifier on 'firstprivate' clause is only allowed on a 'task', 'taskloop', or 'target' construct}}
+    { (void)a; }
+  }
+
+  // teams (inside target -- the inner directive is a standalone 'teams',
+  // not a 'target teams' combined directive, so it is not a target
+  // execution directive in its own right).
+  #pragma omp target
+  #pragma omp teams firstprivate(saved: a) // expected-error {{'saved' modifier on 'firstprivate' clause is only allowed on a 'task', 'taskloop', or 'target' construct}}
+  { (void)a; }
+
+  // distribute (inside teams) -- standalone 'distribute' is not a target
+  // execution directive either.
+  #pragma omp target teams
+  #pragma omp distribute firstprivate(saved: a) // expected-error {{'saved' modifier on 'firstprivate' clause is only allowed on a 'task', 'taskloop', or 'target' construct}}
+  for (int i = 0; i < 4; ++i) (void)a;
+}
+
+void accepted_on_task_taskloop_and_target() {
+  int a = 0;
+
+  // Bare task (no enclosing taskgraph, no replayable clause): accepted.
+  // Per OpenMP 6.0 [7.2] the 'saved' modifier silently has no effect on a
+  // non-replayable construct; we only enforce the directive-kind check
+  // statically.  In this implementation a bare task / taskloop is never
+  // recorded into a taskgraph -- recording requires either lexical
+  // nesting inside a '#pragma omp taskgraph' or an explicit 'replayable'
+  // clause -- so the modifier is a well-defined no-op here.
+  #pragma omp task firstprivate(saved: a)
+  { (void)a; }
+
+  // Bare taskloop: accepted, same rationale.
+  #pragma omp taskloop firstprivate(saved: a)
+  for (int i = 0; i < 4; ++i) (void)a;
+
+  // Replayable task: explicitly opted-in for replay.
+  #pragma omp task replayable firstprivate(saved: a)
+  { (void)a; }
+
+  // Replayable taskloop.
+  #pragma omp taskloop replayable firstprivate(saved: a)
+  for (int i = 0; i < 4; ++i) (void)a;
+
+  // Task lexically nested inside a taskgraph.
+  #pragma omp taskgraph
+  {
+    #pragma omp task firstprivate(saved: a)
+    { (void)a; }
+  }
+
+  // Bare target: accepted on the same well-formed-but-no-effect grounds
+  // as a bare task.  Per OpenMP 6.0 [14.6] the 'target' construct admits
+  // both 'firstprivate' and 'replayable', so 'saved' is meaningful as
+  // soon as a 'replayable' clause is added or the construct is nested
+  // inside a 'taskgraph' region.
+  #pragma omp target firstprivate(saved: a)
+  { (void)a; }
+
+  // Replayable target: explicitly opted-in for replay.
+  #pragma omp target replayable firstprivate(saved: a)
+  { (void)a; }
+
+  // Combined target construct (target + parallel): accepted because the
+  // combined directive is a target execution directive in its own
+  // right, so the captured snapshot at the target boundary belongs to a
+  // construct that may participate in taskgraph replay.
+  #pragma omp target parallel firstprivate(saved: a)
+  { (void)a; }
+
+  // Combined target teams distribute parallel for: same rationale.
+  #pragma omp target teams distribute parallel for firstprivate(saved: a)
+  for (int i = 0; i < 4; ++i) (void)a;
+}

>From 9ae220f7c95566104bd0f76a96787bd8d88a65b0 Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Wed, 27 May 2026 10:23:14 -0500
Subject: [PATCH 23/24] [OpenMP] Constructor and destructor fixes for tasks
 cloned for recorded taskgraphs

The OpenMP 6.0 'saved' firstprivate modifier (see [7.2] together
with [14.3]) requires each list item to be snapshotted once at
recording time and observable on every replay of the recorded
task.  libomp reuses the same task descriptor across every replay
of a taskgraph-owned task, so the '.kmp_privates.t' tail struct
that holds the firstprivate values is also the natural home for
the saved data environment.  Getting that right needs two
changes, which this patch lands together: the destructor of each
list item must fire exactly once at end-of-taskgraph (not after
every replay), and non-trivially-copyable list items must be
copy-constructed into the recorded clone so that copy constructors and
inner self-references are respected.

On the runtime side, move the per-task destructor-thunk
invocation from __kmp_task_finish (which previously fired it at
the end of every replay, leaving the saved snapshot in a
destructed state for the next replay) to __kmp_taskgraph_free,
so it fires exactly once per task at end-of-taskgraph.  Skip
taskwait nodes (record_map entries with task == nullptr) in that
loop while we are there, to avoid a latent nullptr dereference
that the existing tests do not exercise.

On the compiler side, the runtime previously cloned each recorded
task descriptor with a bitwise memcpy in __kmp_taskgraph_clone_task,
and a FIXME noted that this silently corrupts firstprivate list items
whose type is not trivially copyable (self-referential structs, types
with user-defined copy constructors / destructors, types holding inner
pointers into themselves).  Emit a per-task clone helper

  void __omp_task_clone.NN(kmp_task_t *dst, kmp_task_t *src,
                           int lastpriv);

modelled on emitTaskDupFunction and reusing emitPrivatesInit (now
extended with a tri-state PrivatesInitMode of Normal / ForDup /
ForClone), which re-runs the copy constructor of each
firstprivate list item into the freshly allocated descriptor's
'.kmp_privates.t'.  Tasks whose firstprivates are all trivially
copyable still rely on the runtime's memcpy fast-path and emit no
clone helper.  emitTaskCall passes the helper to the runtime as
the new trailing argument of __kmpc_taskgraph_task (null when no
helper is needed).

Two previously-XFAIL'd taskgraph runtime tests
(taskgraph_replayable_saved_stack_depth.cpp and
taskgraph_shared_stack_depth.cpp) now pass and are un-XFAIL'd, and other
tests have been added to cover new functionality.

Assisted-By: Claude Opus 4.7

Pull Request: https://github.com/llvm/llvm-project/pull/200408
---
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 115 +++++++++++++-
 clang/lib/CodeGen/CGOpenMPRuntime.h           |   4 +
 ...taskgraph_firstprivate_saved_ast_print.cpp |  67 ++++++++
 .../OpenMP/taskgraph_task_clone_codegen.cpp   |  51 ++++++
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |   2 +-
 openmp/runtime/src/kmp.h                      |   2 +-
 openmp/runtime/src/kmp_tasking.cpp            |  57 ++++++-
 ...askgraph_firstprivate_saved_nontrivial.cpp | 146 ++++++++++++++++++
 ..._firstprivate_saved_nontrivial_selfref.cpp |  97 ++++++++++++
 .../taskgraph_firstprivate_saved_static.cpp   | 100 ++++++++++++
 ...taskgraph_replayable_saved_stack_depth.cpp |   1 -
 .../taskgraph_shared_stack_depth.cpp          |  16 +-
 12 files changed, 641 insertions(+), 17 deletions(-)
 create mode 100644 clang/test/OpenMP/taskgraph_task_clone_codegen.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_nontrivial.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_nontrivial_selfref.cpp
 create mode 100644 openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_static.cpp

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 95ac4b8aca572..022b2d87a3a5b 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -3679,14 +3679,34 @@ emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc,
 }
 
+/// Selects where \c emitPrivatesInit should read the initial value of each
+/// non-trivial firstprivate copy from.
+enum class PrivatesInitMode {
+  /// Initialize using the captured original lvalues in the caller IR (i.e.
+  /// at task-allocation time).
+  Normal,
+  /// Reading from the source task's \c shareds region. Used by the taskloop
+  /// task-dup function to seed sibling tasks.
+  ForDup,
+  /// Reading from the source task's \c .kmp_privates.t region (the field at
+  /// the same index as the destination). Used by the taskgraph clone
+  /// function to seed the persistent clone from the original task's
+  /// already-initialized snapshot. Works uniformly for captured and
+  /// static-storage firstprivates because no capture lookup is needed.
+  ForClone,
+};
+
 /// Emit initialization for private variables in task-based directives.
 static void emitPrivatesInit(CodeGenFunction &CGF,
                              const OMPExecutableDirective &D,
                              Address KmpTaskSharedsPtr, LValue TDBase,
                              const RecordDecl *KmpTaskTWithPrivatesQTyRD,
                              QualType SharedsTy, QualType SharedsPtrTy,
                              const OMPTaskDataTy &Data,
-                             ArrayRef<PrivateDataTy> Privates, bool ForDup) {
+                             ArrayRef<PrivateDataTy> Privates,
+                             PrivatesInitMode Mode, LValue SrcPrivatesBase) {
   ASTContext &C = CGF.getContext();
+  const bool ForDup = Mode == PrivatesInitMode::ForDup;
+  const bool ForClone = Mode == PrivatesInitMode::ForClone;
   auto FI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
   LValue PrivatesBase = CGF.EmitLValueForField(TDBase, *FI);
   OpenMPDirectiveKind Kind = isOpenMPTaskLoopDirective(D.getDirectiveKind())
@@ -3718,8 +3738,11 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
     }
     const VarDecl *VD = Pair.second.PrivateCopy;
     const Expr *Init = VD->getAnyInitializer();
-    if (Init && (!ForDup || (isa<CXXConstructExpr>(Init) &&
-                             !CGF.isTrivialInitializer(Init)))) {
+    // ForDup and ForClone only re-initialize non-trivial firstprivates; the
+    // surrounding runtime memcpy is sufficient for trivially-copyable ones.
+    const bool NonTrivialOnly = ForDup || ForClone;
+    if (Init && (!NonTrivialOnly || (isa<CXXConstructExpr>(Init) &&
+                                     !CGF.isTrivialInitializer(Init)))) {
       LValue PrivateLValue = CGF.EmitLValueForField(PrivatesBase, *FI);
       if (const VarDecl *Elem = Pair.second.PrivateElemInit) {
         const VarDecl *OriginalVD = Pair.second.Original;
@@ -3739,6 +3762,9 @@ static void emitPrivatesInit(CodeGenFunction &CGF,
                  "Expected artificial target data variable.");
           SharedRefLValue =
               CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(OriginalVD), Type);
+        } else if (ForClone) {
+          // Source is the same field on the origin task's privates record.
+          SharedRefLValue = CGF.EmitLValueForField(SrcPrivatesBase, *FI);
         } else if (ForDup) {
           SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField);
           SharedRefLValue = CGF.MakeAddrLValue(
@@ -3889,11 +3915,74 @@ emitTaskDupFunction(CodeGenModule &CGM, SourceLocation Loc,
         CGF.Int8Ty, CGM.getNaturalTypeAlignment(SharedsTy));
   }
   emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, TDBase, KmpTaskTWithPrivatesQTyRD,
-                   SharedsTy, SharedsPtrTy, Data, Privates, /*ForDup=*/true);
+                   SharedsTy, SharedsPtrTy, Data, Privates,
+                   PrivatesInitMode::ForDup, /*SrcPrivatesBase=*/LValue());
   CGF.FinishFunction();
   return TaskDup;
 }
 
+/// Emit task_clone function (for re-initializing non-trivially-copyable
+/// firstprivate copies when cloning a task into a taskgraph record).
+/// \code
+/// void __omp_task_clone(kmp_task_t *task_dst, const kmp_task_t *task_src,
+///                       int /*unused*/) {
+///   // copy-construct each non-trivial firstprivate from
+///   // task_src->.kmp_privates.t into task_dst->.kmp_privates.t.
+/// }
+/// \endcode
+/// The (unused) third parameter is present so that the function shares the
+/// same calling convention as the existing taskloop \c task_dup callback
+/// (\c p_task_dup_t in the runtime), letting the runtime invoke either via
+/// a single function-pointer type.
+static llvm::Value *emitTaskCloneFunction(
+    CodeGenModule &CGM, SourceLocation Loc, const OMPExecutableDirective &D,
+    QualType KmpTaskTWithPrivatesPtrQTy,
+    const RecordDecl *KmpTaskTWithPrivatesQTyRD, QualType SharedsTy,
+    QualType SharedsPtrTy, const OMPTaskDataTy &Data,
+    ArrayRef<PrivateDataTy> Privates) {
+  ASTContext &C = CGM.getContext();
+  auto *DstArg = ImplicitParamDecl::Create(
+      C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy,
+      ImplicitParamKind::Other);
+  auto *SrcArg = ImplicitParamDecl::Create(
+      C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, KmpTaskTWithPrivatesPtrQTy,
+      ImplicitParamKind::Other);
+  auto *UnusedArg =
+      ImplicitParamDecl::Create(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
+                                ImplicitParamKind::Other);
+  FunctionArgList Args{DstArg, SrcArg, UnusedArg};
+  const auto &FnInfo =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  llvm::FunctionType *FnTy = CGM.getTypes().GetFunctionType(FnInfo);
+  std::string Name = CGM.getOpenMPRuntime().getName({"omp_task_clone", ""});
+  auto *Fn = llvm::Function::Create(FnTy, llvm::GlobalValue::InternalLinkage,
+                                    Name, &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, FnInfo);
+  if (!CGM.getCodeGenOpts().SampleProfileFile.empty())
+    Fn->addFnAttr("sample-profile-suffix-elision-policy", "selected");
+  Fn->setDoesNotRecurse();
+  CodeGenFunction CGF(CGM);
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, FnInfo, Args, Loc, Loc);
+
+  LValue DstBase = CGF.EmitLoadOfPointerLValue(
+      CGF.GetAddrOfLocalVar(DstArg),
+      KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
+  LValue SrcBase = CGF.EmitLoadOfPointerLValue(
+      CGF.GetAddrOfLocalVar(SrcArg),
+      KmpTaskTWithPrivatesPtrQTy->castAs<PointerType>());
+  // Address the .kmp_privates.t sub-record of the source task; the
+  // destination's privates record is located via DstBase inside
+  // emitPrivatesInit.
+  auto PrivatesFI = std::next(KmpTaskTWithPrivatesQTyRD->field_begin());
+  LValue SrcPrivatesBase = CGF.EmitLValueForField(SrcBase, *PrivatesFI);
+
+  emitPrivatesInit(CGF, D, /*KmpTaskSharedsPtr=*/Address::invalid(), DstBase,
+                   KmpTaskTWithPrivatesQTyRD, SharedsTy, SharedsPtrTy, Data,
+                   Privates, PrivatesInitMode::ForClone, SrcPrivatesBase);
+  CGF.FinishFunction();
+  return Fn;
+}
+
 /// Checks if destructor function is required to be generated.
 /// \return true if cleanups are required, false otherwise.
 static bool
@@ -4395,7 +4484,7 @@ CGOpenMPRuntime::TaskResultTy CGOpenMPRuntime::emitTaskInit(
   if (!Privates.empty()) {
     emitPrivatesInit(CGF, D, KmpTaskSharedsPtr, Base, KmpTaskTWithPrivatesQTyRD,
                      SharedsTy, SharedsPtrTy, Data, Privates,
-                     /*ForDup=*/false);
+                     PrivatesInitMode::Normal, /*SrcPrivatesBase=*/LValue());
     if (isOpenMPTaskLoopDirective(D.getDirectiveKind()) &&
         (!Data.LastprivateVars.empty() || checkInitIsRequired(CGF, Privates))) {
       Result.TaskDupFn = emitTaskDupFunction(
@@ -4403,6 +4492,16 @@ CGOpenMPRuntime::TaskResultTy CGOpenMPRuntime::emitTaskInit(
           KmpTaskTQTyRD, SharedsTy, SharedsPtrTy, Data, Privates,
           /*WithLastIter=*/!Data.LastprivateVars.empty());
     }
+    // For plain tasks (not taskloops) that have at least one non-trivially
+    // copyable firstprivate, emit a clone function so that the runtime can
+    // re-initialize those fields when the task is recorded into a taskgraph.
+    // Taskloops already cover the same need via their TaskDupFn.
+    if (!isOpenMPTaskLoopDirective(D.getDirectiveKind()) &&
+        checkInitIsRequired(CGF, Privates)) {
+      Result.TaskCloneFn = emitTaskCloneFunction(
+          CGM, Loc, D, KmpTaskTWithPrivatesPtrQTy, KmpTaskTWithPrivatesQTyRD,
+          SharedsTy, SharedsPtrTy, Data, Privates);
+    }
   }
   // Fields of union "kmp_cmplrdata_t" for destructors and priority.
   enum { Priority = 0, Destructors = 1 };
@@ -4950,7 +5049,7 @@ void CGOpenMPRuntime::emitTaskCall(
                                                   PrePostActionTy &) {
     llvm::Value *ThreadId = getThreadID(CGF, Loc);
     llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
-    std::array<llvm::Value *, 9> TGTaskArgs;
+    std::array<llvm::Value *, 10> TGTaskArgs;
     std::array<llvm::Value *, 3> TaskAllocArgs;
     TaskResultTy Result = emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy,
                                        Shareds, Data, true, TaskAllocArgs);
@@ -4984,6 +5083,10 @@ void CGOpenMPRuntime::emitTaskCall(
     TGTaskArgs[8] = RelocFn ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
                                   RelocFn, CGM.VoidPtrTy)
                             : llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+    TGTaskArgs[9] = Result.TaskCloneFn
+                        ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+                              Result.TaskCloneFn, CGF.VoidPtrTy)
+                        : llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                             CGM.getModule(), OMPRTL___kmpc_taskgraph_task),
                         TGTaskArgs);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index d4dbbef5745a5..15fd273bd8936 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -557,6 +557,10 @@ class CGOpenMPRuntime {
     LValue TDBase;
     const RecordDecl *KmpTaskTQTyRD = nullptr;
     llvm::Value *TaskDupFn = nullptr;
+    /// Compiler-emitted helper that re-initializes any non-trivially-copyable
+    /// firstprivate fields after the runtime bitwise-clones a task into a
+    /// taskgraph record. Null when no such helper is required.
+    llvm::Value *TaskCloneFn = nullptr;
   };
   /// Emit task region for the task directive. The task region is emitted in
   /// several steps:
diff --git a/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp b/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp
index df358df14c82f..23cfbdddb3f68 100644
--- a/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp
+++ b/clang/test/OpenMP/taskgraph_firstprivate_saved_ast_print.cpp
@@ -76,4 +76,71 @@ void firstprivate_saved() {
   { (void)a; }
 }
 
+// Per OpenMP 6.0 [14.3], a 'firstprivate' clause with the 'saved' modifier on
+// a replayable construct may include variables with static storage duration;
+// they are copied into the saved data environment of the taskgraph record.
+// This covers file-scope statics, static-local variables, static data
+// members, and const-qualified statics, all of which Sema accepts and Clang
+// codegen places into the per-task '.kmp_privates.t' tail struct.
+
+static int FileScopeStatic = 100;
+static const int FileScopeConstStatic = 200;
+
+struct WithStaticMember {
+  static int StaticMember;
+  static const int StaticConstMember = 400;
+};
+int WithStaticMember::StaticMember = 0;
+
+void firstprivate_saved_statics() {
+  static int LocalStatic = 300;
+  static const int LocalConstStatic = 500;
+
+  // CHECK-LABEL: void firstprivate_saved_statics
+  // CHECK: #pragma omp task firstprivate(saved: FileScopeStatic)
+  #pragma omp task firstprivate(saved: FileScopeStatic)
+  { (void)FileScopeStatic; }
+
+  // CHECK: #pragma omp task firstprivate(saved: FileScopeConstStatic)
+  #pragma omp task firstprivate(saved: FileScopeConstStatic)
+  { (void)FileScopeConstStatic; }
+
+  // CHECK: #pragma omp task firstprivate(saved: LocalStatic)
+  #pragma omp task firstprivate(saved: LocalStatic)
+  { (void)LocalStatic; }
+
+  // CHECK: #pragma omp task firstprivate(saved: LocalConstStatic)
+  #pragma omp task firstprivate(saved: LocalConstStatic)
+  { (void)LocalConstStatic; }
+
+  // CHECK: #pragma omp task firstprivate(saved: WithStaticMember::StaticMember)
+  #pragma omp task firstprivate(saved: WithStaticMember::StaticMember)
+  { (void)WithStaticMember::StaticMember; }
+
+  // CHECK: #pragma omp task firstprivate(saved: WithStaticMember::StaticConstMember)
+  #pragma omp task firstprivate(saved: WithStaticMember::StaticConstMember)
+  { (void)WithStaticMember::StaticConstMember; }
+
+  // Multiple statics in a single clause, mixed with a non-static.
+  int local_int = 0;
+  // CHECK: #pragma omp task firstprivate(saved: FileScopeStatic,LocalStatic,WithStaticMember::StaticMember,local_int)
+  #pragma omp task firstprivate(saved:                                         \
+                                FileScopeStatic, LocalStatic,                  \
+                                WithStaticMember::StaticMember, local_int)
+  {
+    (void)FileScopeStatic;
+    (void)LocalStatic;
+    (void)WithStaticMember::StaticMember;
+    (void)local_int;
+  }
+
+  // Same on a 'taskloop' construct.
+  // CHECK: #pragma omp taskloop firstprivate(saved: FileScopeStatic,LocalConstStatic)
+  #pragma omp taskloop firstprivate(saved: FileScopeStatic, LocalConstStatic)
+  for (int i = 0; i < 4; ++i) {
+    (void)FileScopeStatic;
+    (void)LocalConstStatic;
+  }
+}
+
 #endif
diff --git a/clang/test/OpenMP/taskgraph_task_clone_codegen.cpp b/clang/test/OpenMP/taskgraph_task_clone_codegen.cpp
new file mode 100644
index 0000000000000..9451ad3ba9110
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_task_clone_codegen.cpp
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// Verifies that for a 'firstprivate' on a task inside a taskgraph whose
+// type has a non-trivial copy constructor, the compiler emits a dedicated
+// '.omp_task_clone.' helper and passes it to __kmpc_taskgraph_task in
+// the trailing argument slot.  The helper re-runs the copy constructor
+// from the origin task's '.kmp_privates.t' field into the clone's, so
+// that the runtime memcpy does not produce a torn copy of a non-
+// trivially-copyable object.
+
+struct NonTrivial {
+  int v;
+  int *self;
+  NonTrivial();
+  NonTrivial(const NonTrivial &other);
+  ~NonTrivial();
+};
+
+void run() {
+  NonTrivial nt;
+#pragma omp taskgraph
+  {
+#pragma omp task firstprivate(nt)
+    {
+      (void)nt.v;
+    }
+  }
+}
+
+// The clone helper is passed as the trailing pointer argument to
+// __kmpc_taskgraph_task (10 total: ident, gtid, task, flags, sizes...,
+// ndeps, deps, relocation, clone).
+// CHECK: call i32 @__kmpc_taskgraph_task(ptr {{[^,]+}}, i32 {{[^,]+}}, ptr {{[^,]+}}, i32 {{[^,]+}}, i64 {{[^,]+}}, i64 {{[^,]+}}, i32 {{[^,]+}}, ptr {{[^,]+}}, ptr {{[^,]+}}, ptr @.omp_task_clone.)
+
+// The clone helper has the same calling convention as the existing
+// taskloop task-dup callback so that the runtime can dispatch through
+// a single function-pointer type; the third parameter is unused here.
+// The body indexes the same .kmp_privates.t field on both source and
+// destination tasks and invokes NonTrivial's copy constructor.
+// CHECK: define internal void @.omp_task_clone.(ptr noundef %{{[^,]+}}, ptr noundef %{{[^,]+}}, i32 noundef %{{[^,]+}})
+// CHECK: getelementptr inbounds {{.*}} %struct.kmp_task_t_with_privates,
+// CHECK: getelementptr inbounds {{.*}} %struct.kmp_task_t_with_privates,
+// CHECK: getelementptr inbounds {{.*}} %struct..kmp_privates.t,
+// CHECK: getelementptr inbounds {{.*}} %struct..kmp_privates.t,
+// CHECK: call void @_ZN10NonTrivialC1ERKS_(
+
+#endif
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 02e3e1f98e969..85a2eb6f35f22 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -360,7 +360,7 @@ __OMP_RTL(__kmpc_taskgroup, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_taskgraph, false, Void, IdentPtr, Int32, VoidPtrPtr, SizeTy,
           Int32, Int32, VoidPtr, VoidPtr)
 __OMP_RTL(__kmpc_taskgraph_task, false, Int32, IdentPtr, Int32, VoidPtr, Int32,
-          SizeTy, SizeTy, Int32, VoidPtr, VoidPtr)
+          SizeTy, SizeTy, Int32, VoidPtr, VoidPtr, VoidPtr)
 __OMP_RTL(__kmpc_taskgraph_taskloop, false, Int32, IdentPtr, Int32, VoidPtr,
           Int32, Int32, Int64Ptr, Int64Ptr, Int64,
           Int32, Int32, Int64, Int32, VoidPtr, VoidPtr)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 27b2399ddbc01..826e0009bedbd 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -4511,7 +4511,7 @@ KMP_EXPORT void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
 KMP_EXPORT kmp_uint32 __kmpc_taskgraph_task(
     ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
     size_t sizeof_kmp_task_t, size_t sizeof_shareds, kmp_int32 ndeps,
-    kmp_depend_info_t *dep_list, kmp_task_relocate_t reloc);
+    kmp_depend_info_t *dep_list, kmp_task_relocate_t reloc, void *task_clone);
 KMP_EXPORT kmp_uint32 __kmpc_taskgraph_taskloop(
     ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
     kmp_int32 if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 08b0eda3dc08f..0073fee4cf4c1 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -838,8 +838,22 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task,
      placed here, since at this point other tasks might have been released
      hence overlapping the destructor invocations with some other work in the
      released tasks.  The OpenMP spec is not specific on when the destructors
-     are invoked, so we should be free to choose. */
-  if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
+     are invoked, so we should be free to choose.
+
+     For tasks owned by a taskgraph record, the same task descriptor (and
+     therefore the same .kmp_privates.t storage) is reused for every replay.
+     Firing the per-task destructor here would destruct the per-task
+     'firstprivate' copies (e.g. the snapshot used to realise the OpenMP 6.0
+     'firstprivate(saved: ...)' modifier) on the first replay completion,
+     leaving subsequent replays observing destructed state.  Defer the
+     destructor invocation to __kmp_taskgraph_free, which fires it exactly
+     once per task at end-of-taskgraph. */
+  bool defer_destructors_to_taskgraph_free = false;
+#if OMP_TASKGRAPH_EXPERIMENTAL
+  defer_destructors_to_taskgraph_free = is_taskgraph;
+#endif
+  if (UNLIKELY(taskdata->td_flags.destructors_thunk) &&
+      !defer_destructors_to_taskgraph_free) {
     kmp_routine_entry_t destr_thunk = task->data1.destructors;
     KMP_ASSERT(destr_thunk);
     destr_thunk(gtid, task);
@@ -5823,7 +5837,22 @@ static void __kmp_taskgraph_free(kmp_int32 gtid, kmp_taskgraph_record_t *rec,
   __kmp_taskgraph_free_region_metadata(thread, rec->root);
 
   for (size_t task = 0; task < rec->num_tasks; task++) {
-    kmp_taskdata *taskdata = KMP_TASK_TO_TASKDATA(rec->record_map[task].task);
+    // Skip entries that don't have an associated task (e.g. taskwait nodes
+    // recorded by __kmpc_taskgraph_taskwait).
+    if (rec->record_map[task].task == nullptr)
+      continue;
+    kmp_task_t *taskptr = rec->record_map[task].task;
+    kmp_taskdata *taskdata = KMP_TASK_TO_TASKDATA(taskptr);
+    // Fire the per-task destructor thunk exactly once here, at end-of-
+    // taskgraph.  __kmp_task_finish deliberately skips the thunk for
+    // taskgraph-owned tasks so that the per-replay state (in particular the
+    // 'firstprivate(saved: ...)' snapshot held in .kmp_privates.t) is not
+    // destructed between replays.
+    if (UNLIKELY(taskdata->td_flags.destructors_thunk)) {
+      kmp_routine_entry_t destr_thunk = taskptr->data1.destructors;
+      KMP_ASSERT(destr_thunk);
+      destr_thunk(gtid, taskptr);
+    }
     // Setting this here keeps an assertion in __kmp_free_task happy: the
     // clone may never have been replayed, in which case 'complete' will be
     // zero here, as initialized.
@@ -5862,14 +5891,18 @@ static kmp_taskgraph_header_t *__kmp_taskgraph_header_alloc(kmp_int32 gtid) {
 
 // Clone a (new) task that has had its private variables and shared variables
 // initialised already.
+//
+// The bitwise memcpy below is sufficient for trivially-copyable firstprivate
+// fields stored in the task's .kmp_privates.t region.  For
+// non-trivially-copyable firstprivates, the caller (\c __kmpc_taskgraph_task)
+// invokes a compiler-emitted \c task_clone thunk after we return, to re-run
+// the copy constructors with the source taken from the origin task's already
+// initialised privates record.
 static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
                                               kmp_taskgraph_record_t *taskgraph,
                                               kmp_task_t *orig,
                                               size_t sizeof_kmp_task_t,
                                               size_t sizeof_shareds) {
-  // FIXME: This should use a "taskdup" function like taskloops in cases where
-  // private variables are not trivially copyable.  For now, do it by plain
-  // bitwise copy.
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(orig);
   size_t shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
@@ -6025,7 +6058,8 @@ kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
                                  size_t sizeof_kmp_task_t,
                                  size_t sizeof_shareds, kmp_int32 ndeps,
                                  kmp_depend_info_t *dep_list,
-                                 kmp_task_relocate_t relocate) {
+                                 kmp_task_relocate_t relocate,
+                                 void *task_clone) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgroup_t *taskgroup = thread->th.th_current_task->td_taskgroup;
   kmp_taskgraph_record_t *rec = __kmp_taskgraph_or_parent_recording(taskgroup);
@@ -6035,6 +6069,15 @@ kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
     if (status == KMP_TDG_RECORDING) {
       kmp_task_t *cloned_task = __kmp_taskgraph_clone_task(
           thread, rec, new_task, sizeof_kmp_task_t, sizeof_shareds);
+      // If the compiler emitted a task-clone helper, run it now so that any
+      // non-trivially-copyable firstprivate fields are copy-constructed from
+      // the origin task's privates record instead of bitwise-copied.  Shares
+      // its calling convention with the taskloop \c task_dup callback; the
+      // \c lastpriv argument is unused for plain tasks.
+      if (task_clone) {
+        p_task_dup_t clone_fn = (p_task_dup_t)task_clone;
+        clone_fn(cloned_task, new_task, /*lastpriv=*/0);
+      }
       kmp_taskgraph_node_t *node = __kmp_taskgraph_node_alloc(rec, cloned_task);
       if (taskgroup->taskgraph.reduce_input) {
         node->reduce_input = taskgroup->taskgraph.reduce_input;
diff --git a/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_nontrivial.cpp b/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_nontrivial.cpp
new file mode 100644
index 0000000000000..a10f306d8d3cb
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_nontrivial.cpp
@@ -0,0 +1,146 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+// Verifies the destructor-lifecycle contract for a non-trivially-copyable
+// 'firstprivate(saved: ...)' list item on a replayable task.
+//
+// The saved snapshot is held in '.kmp_privates.t' of the task descriptor
+// that the taskgraph record persists across replays, so the
+// compiler-emitted per-task destructor thunk must NOT fire at the end of
+// each replay (otherwise subsequent replays would observe a destroyed
+// object).  This is verified observationally: the destructor here writes
+// a recognisable sentinel value (-1) into the slot, so any replay that
+// saw a destroyed snapshot would observe -1 rather than the original
+// saved value.
+//
+// Construction/destruction balance is verified at program exit via a
+// global guard.  The persistent clone's snapshot is only destroyed by
+// __kmp_taskgraph_free, which libomp does not yet run at program exit,
+// so the guard expects that one destructor call to be outstanding.
+
+#include <cstdio>
+#include <cstdlib>
+
+struct Tracker {
+  static int Ctors;
+  static int CopyCtors;
+  static int Dtors;
+
+  int Value;
+
+  explicit Tracker(int V) : Value(V) { ++Ctors; }
+  Tracker(const Tracker &Other) : Value(Other.Value) { ++CopyCtors; }
+  // Use a sentinel value on destruction so that any read-after-destroy of
+  // the saved snapshot becomes observable.
+  ~Tracker() {
+    ++Dtors;
+    Value = -1;
+  }
+};
+
+int Tracker::Ctors = 0;
+int Tracker::CopyCtors = 0;
+int Tracker::Dtors = 0;
+
+// 'Failed' is shared across tasks and threads so we can record a fail
+// status from within the task body and across the program-exit guard.
+static int Failed = 0;
+
+__attribute__((noinline)) static void run_taskgraph_nontrivial(int seed) {
+  Tracker Local(seed);
+  int observed = 0;
+
+#pragma omp taskgraph graph_id(927)
+  {
+#pragma omp task firstprivate(saved : Local) shared(observed)
+    {
+      // Each replay must observe the value captured at recording time
+      // (which was 11).  If the per-replay destructor deferral were
+      // missing, the second and subsequent replays would observe the
+      // sentinel -1 written by Tracker::~Tracker().  Do not mutate the
+      // snapshot here: we want each replay to read the same value.
+      observed = Local.Value;
+    }
+  }
+
+  if (observed != 11) {
+    std::fprintf(stderr, "FAIL replay observed=%d expected=11 seed=%d\n",
+                 observed, seed);
+    Failed = 1;
+  }
+}
+
+// Final accounting fires at program exit.  Two snapshots are
+// copy-constructed exactly once each at recording time:
+//   1. The "original" task's '.kmp_privates.t' snapshot (initialised in IR
+//      at task allocation, then destructed when the original task finishes).
+//   2. The "clone" task's '.kmp_privates.t' snapshot (initialised by the
+//      compiler-emitted task-clone helper invoked from
+//      __kmpc_taskgraph_task; its destructor is deferred to
+//      __kmp_taskgraph_free, which is not driven by program exit in
+//      libomp today, hence the asymmetric expected dtor count below).
+struct ExitGuard {
+  ~ExitGuard() {
+    bool ok = true;
+    if (Tracker::CopyCtors != 2) {
+      std::fprintf(stderr,
+                   "FAIL (exit) CopyCtors=%d expected=2 (one for the "
+                   "original task, one for the persistent taskgraph clone)\n",
+                   Tracker::CopyCtors);
+      ok = false;
+    }
+    // Expected destruction count at program exit:
+    //   - Local objects: one ctor + one dtor per call to
+    //     run_taskgraph_nontrivial.
+    //   - The original task's snapshot: destructed when the task body
+    //     finishes (its task is not taskgraph-owned).
+    //   - The clone task's snapshot is deferred to __kmp_taskgraph_free,
+    //     which is not yet driven at program exit, so it does not
+    //     contribute to the dtor count here.
+    int expected_dtors = Tracker::Ctors + /* orig task snapshot */ 1;
+    if (Tracker::Dtors != expected_dtors) {
+      std::fprintf(stderr,
+                   "FAIL (exit) ctor/dtor imbalance ctors=%d copyctors=%d "
+                   "dtors=%d expected_dtors=%d\n",
+                   Tracker::Ctors, Tracker::CopyCtors, Tracker::Dtors,
+                   expected_dtors);
+      ok = false;
+    }
+
+    if (!ok || Failed) {
+      std::fprintf(stderr, "FAIL firstprivate(saved) non-trivial lifecycle\n");
+      std::_Exit(1);
+    }
+
+    std::fprintf(stderr,
+                 "PASS firstprivate(saved) non-trivial lifecycle "
+                 "ctors=%d copyctors=%d dtors=%d\n",
+                 Tracker::Ctors, Tracker::CopyCtors, Tracker::Dtors);
+  }
+};
+
+// Global guard whose destructor fires at program exit.  libomp does not
+// currently free the taskgraph record by then (see ExitGuard above).
+static ExitGuard Guard;
+
+int main() {
+  // Recording run.  Local(11) is copy-constructed into the original task's
+  // slot and again into the persistent clone's slot (two copyctors in all).
+  run_taskgraph_nontrivial(11);
+
+  // Replay runs.  Each replay reuses the recorded task descriptor and its
+  // in-place saved snapshot.  No additional copy-construction occurs, and
+  // no destruction occurs until end-of-taskgraph.  The body observes the
+  // snapshot value (11) on every replay -- the per-call seed argument is
+  // intentionally ignored by the task body to demonstrate that the saved
+  // snapshot drives what the task sees, not the call-site argument.
+  for (int i = 0; i < 5; ++i)
+    run_taskgraph_nontrivial(42 + i);
+
+  return 0;
+}
+
+// CHECK: PASS firstprivate(saved) non-trivial lifecycle
diff --git a/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_nontrivial_selfref.cpp b/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_nontrivial_selfref.cpp
new file mode 100644
index 0000000000000..dfad59bb4c149
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_nontrivial_selfref.cpp
@@ -0,0 +1,97 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+// Verifies that taskgraph cloning copy-constructs non-trivially-copyable
+// 'firstprivate(saved: ...)' list items rather than bitwise-copying them.
+//
+// 'SelfRef' carries an internal pointer that the constructor wires back to
+// its own member.  Any bitwise clone of the task descriptor (the historical
+// behaviour) would leave the cloned object's internal pointer aimed at the
+// original (which goes out of scope when the recording call returns), so
+// the first replay would dereference dangling memory.  When the compiler-
+// emitted task-clone helper runs, it copy-constructs the cloned snapshot
+// in-place, repairing the internal pointer to refer to the clone's own
+// storage.  Subsequent replays therefore observe a consistent value.
+
+#include <cstdio>
+#include <cstdlib>
+
+struct SelfRef {
+  int Value;
+  // Self-pointer that the (copy) constructor steers at our own 'Value'
+  // member.  Bitwise duplication leaves this dangling.
+  int *Inside;
+
+  explicit SelfRef(int V) : Value(V), Inside(&Value) {}
+  SelfRef(const SelfRef &Other) : Value(Other.Value), Inside(&Value) {}
+  ~SelfRef() {
+    // Poison the self pointer so any dangling read after destruction is
+    // observable as a (likely) crash or wrong value.
+    Inside = nullptr;
+    Value = -1;
+  }
+};
+
+static int Failed = 0;
+
+__attribute__((noinline)) static void run_taskgraph_selfref(int seed) {
+  SelfRef Local(seed);
+  int observed_via_self = 0;
+  int observed_value = 0;
+
+#pragma omp taskgraph graph_id(3142)
+  {
+#pragma omp task firstprivate(saved : Local)                                   \
+    shared(observed_via_self, observed_value)
+    {
+      // Read through the self pointer.  This is exactly the operation that
+      // bitwise cloning would break: the cloned task's 'Inside' would point
+      // at the recording-time stack frame, not at the cloned 'Value'.
+      observed_via_self = *Local.Inside;
+      observed_value = Local.Value;
+    }
+  }
+
+  if (observed_value != 7 || observed_via_self != 7) {
+    std::fprintf(stderr,
+                 "FAIL seed=%d observed_value=%d observed_via_self=%d "
+                 "(expected both = 7)\n",
+                 seed, observed_value, observed_via_self);
+    Failed = 1;
+  }
+}
+
+struct ExitGuard {
+  ~ExitGuard() {
+    if (Failed) {
+      std::fprintf(stderr, "FAIL non-trivial taskgraph clone\n");
+      std::_Exit(1);
+    }
+    std::fprintf(stderr, "PASS non-trivial taskgraph clone\n");
+  }
+};
+
+static ExitGuard Guard;
+
+int main() {
+  // Recording captures Local(7) into the task's '.kmp_privates.t' slot, then
+  // the runtime clones that descriptor.  After this returns 'Local' is gone,
+  // so any cloned 'Inside' pointer that still aimed at &Local.Value would
+  // dangle.  The clone helper must copy-construct the clone in place to
+  // repair the self pointer.
+  run_taskgraph_selfref(7);
+
+  // Replays.  Each one must observe the saved 7 both directly and through
+  // the (repaired) self-referencing pointer.  Different seed values are
+  // intentionally ignored by the task body: only the saved snapshot drives
+  // what the task sees.
+  for (int i = 0; i < 5; ++i)
+    run_taskgraph_selfref(42 + i);
+
+  return 0;
+}
+
+// CHECK: PASS non-trivial taskgraph clone
diff --git a/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_static.cpp b/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_static.cpp
new file mode 100644
index 0000000000000..ae613e2bebef2
--- /dev/null
+++ b/openmp/runtime/test/taskgraph/taskgraph_firstprivate_saved_static.cpp
@@ -0,0 +1,100 @@
+// clang-format off
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
+// REQUIRES: omp_taskgraph_experimental
+// clang-format on
+
+// OpenMP 6.0 [14.3]: the 'saved' modifier on a 'firstprivate' clause of a
+// replayable construct extends the saved data environment to also include
+// copies of variables with static storage duration that appear in the
+// clause.  Within the current Clang/libomp implementation those snapshots
+// live in the per-task '.kmp_privates.t' tail struct, which is allocated
+// once at recording time and reused for every replay of the recorded task.
+// This test exercises five flavours of static-storage list items and
+// verifies that the saved snapshot is what each replay observes, regardless
+// of any subsequent mutation of the underlying static variable between
+// taskgraph encounters.
+
+#include <cstdio>
+
+static int FileScopeStaticInt = 100;
+static const int FileScopeConstStaticInt = 200;
+
+struct WithStaticMember {
+  static int StaticMember;
+  static const int StaticConstMember = 400;
+};
+int WithStaticMember::StaticMember = 300;
+// Out-of-line definition is required (pre-C++17) because the saved
+// firstprivate slot captures the static by-value via odr-use.
+const int WithStaticMember::StaticConstMember;
+
+__attribute__((noinline)) static void
+run_taskgraph_saved_static(int *out_fs, int *out_fsc, int *out_local,
+                           int *out_member, int *out_member_const) {
+  static int LocalStaticInt = 500;
+
+#pragma omp taskgraph graph_id(811)
+  {
+#pragma omp task firstprivate(saved : FileScopeStaticInt,                      \
+                                  FileScopeConstStaticInt, LocalStaticInt,     \
+                                  WithStaticMember::StaticMember,              \
+                                  WithStaticMember::StaticConstMember)         \
+    shared(out_fs, out_fsc, out_local, out_member, out_member_const)
+    {
+      *out_fs = FileScopeStaticInt;
+      *out_fsc = FileScopeConstStaticInt;
+      *out_local = LocalStaticInt;
+      *out_member = WithStaticMember::StaticMember;
+      *out_member_const = WithStaticMember::StaticConstMember;
+    }
+  }
+}
+
+int main() {
+  bool failed = false;
+  int fs = -1, fsc = -1, local = -1, member = -1, member_const = -1;
+
+  // First call: recording.  Each captured-by-saved static is snapshotted
+  // into the task's '.kmp_privates.t' slot at this point.
+  run_taskgraph_saved_static(&fs, &fsc, &local, &member, &member_const);
+  if (fs != 100 || fsc != 200 || local != 500 || member != 300 ||
+      member_const != 400) {
+    std::fprintf(stderr,
+                 "FAIL initial record fs=%d fsc=%d local=%d member=%d "
+                 "member_const=%d\n",
+                 fs, fsc, local, member, member_const);
+    failed = true;
+  }
+
+  // Mutate the underlying non-const statics.  Because the task's firstprivate
+  // slots were snapshotted with 'saved:' at recording, every subsequent
+  // replay must continue to observe the recorded values (100, 500, 300) for
+  // the non-const statics, not the mutated values.
+  FileScopeStaticInt = 11;
+  WithStaticMember::StaticMember = 13;
+  // LocalStaticInt is not visible here, so it is left unmodified; replays
+  // merely confirm that its recorded value (500) is still observed.
+
+  for (int i = 0; i < 4; ++i) {
+    fs = fsc = local = member = member_const = -1;
+    run_taskgraph_saved_static(&fs, &fsc, &local, &member, &member_const);
+    if (fs != 100 || fsc != 200 || local != 500 || member != 300 ||
+        member_const != 400) {
+      std::fprintf(stderr,
+                   "FAIL replay %d fs=%d fsc=%d local=%d member=%d "
+                   "member_const=%d\n",
+                   i, fs, fsc, local, member, member_const);
+      failed = true;
+    }
+  }
+
+  if (failed)
+    return 1;
+
+  std::fprintf(stderr,
+               "PASS firstprivate(saved) statics persist across replays\n");
+  return 0;
+}
+
+// CHECK: PASS firstprivate(saved) statics persist across replays
diff --git a/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth.cpp b/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth.cpp
index 922cb85a53eec..aa4eec17ad091 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_replayable_saved_stack_depth.cpp
@@ -1,7 +1,6 @@
 // clang-format off
 // RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
 // REQUIRES: omp_taskgraph_experimental
-// XFAIL: *
 // clang-format on
 
 #include <cstdio>
diff --git a/openmp/runtime/test/taskgraph/taskgraph_shared_stack_depth.cpp b/openmp/runtime/test/taskgraph/taskgraph_shared_stack_depth.cpp
index ad579e8ed1b3d..533cd929170fb 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_shared_stack_depth.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_shared_stack_depth.cpp
@@ -2,9 +2,23 @@
 // RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t
 // RUN: env OMP_NUM_THREADS=4 %libomp-run 2>&1 | FileCheck %s
 // REQUIRES: omp_taskgraph_experimental
-// XFAIL: *
 // clang-format on
 
+// This test exercises a non-lexically-nested replayable task that captures
+// 'payload' and 'seed' as 'firstprivate' (without the OpenMP 6.0 'saved'
+// modifier).  Per OpenMP 6.0 [14.3], when a variable referenced in a
+// replayable construct does not exist in the enclosing data environment
+// of the taskgraph construct, the saved data environment in the
+// taskgraph record is used as the enclosing data environment for that
+// variable.  Both 'payload' and 'seed' live in run_taskgraph's stack
+// frame, not in main's, so the spec mandates the saved-data-environment
+// fallback - i.e. the snapshot captured at recording time is what every
+// replay observes, independent of the call-stack depth of the replay.
+//
+// In the current libomp implementation that snapshot lives in the
+// task descriptor's '.kmp_privates.t' tail struct, which persists for
+// the lifetime of the taskgraph record, so this test passes naturally.
+
 #include <cstdio>
 
 static volatile int StackSink = 0;

>From 35316e4faa1d0cb9b69ca71e50292047732fbcfd Mon Sep 17 00:00:00 2001
From: Julian Brown <julian.brown at amd.com>
Date: Thu, 28 May 2026 05:03:38 -0500
Subject: [PATCH 24/24] [OpenMP] Add taskgraph template interaction codegen
 tests

Add two Clang codegen tests for the OpenMP 6.0 taskgraph
directive exercising its interaction with templates -- in
particular checking that different instantiations of the same
template get distinct taskgraph handles.

Assisted-By: Claude Opus 4.7

Pull Request: https://github.com/llvm/llvm-project/pull/200409
---
 .../taskgraph_template_handle_codegen.cpp     | 36 +++++++++
 .../OpenMP/taskgraph_templates_codegen.cpp    | 73 +++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100644 clang/test/OpenMP/taskgraph_template_handle_codegen.cpp
 create mode 100644 clang/test/OpenMP/taskgraph_templates_codegen.cpp

diff --git a/clang/test/OpenMP/taskgraph_template_handle_codegen.cpp b/clang/test/OpenMP/taskgraph_template_handle_codegen.cpp
new file mode 100644
index 0000000000000..0874fe58edb03
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_template_handle_codegen.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+template <typename T>
+void templ_handle(T &x) {
+#pragma omp taskgraph
+  {
+#pragma omp task shared(x)
+    {
+      x += 1;
+    }
+  }
+}
+
+int main() {
+  int i = 0;
+  long l = 0;
+  templ_handle(i);
+  templ_handle(l);
+  return 0;
+}
+
+// CHECK-DAG: @[[H1:.omp.taskgraph.handle[^ ]*]] = internal global ptr null
+// CHECK-DAG: @[[H2:.omp.taskgraph.handle[^ ]*]] = internal global ptr null
+
+// CHECK-LABEL: define linkonce_odr {{.*}} @_Z12templ_handleIiEvRT_(
+// CHECK: call void @__kmpc_taskgraph(ptr {{[^,]+}}, i32 {{[^,]+}}, ptr @[[H1]], i64 0, i32 0, i32 0, ptr {{[^,]+}}, ptr {{[^)]+}})
+
+// CHECK-LABEL: define linkonce_odr {{.*}} @_Z12templ_handleIlEvRT_(
+// CHECK-NOT: ptr @[[H1]]
+// CHECK: call void @__kmpc_taskgraph(ptr {{[^,]+}}, i32 {{[^,]+}}, ptr @[[H2]], i64 0, i32 0, i32 0, ptr {{[^,]+}}, ptr {{[^)]+}})
+
+#endif
diff --git a/clang/test/OpenMP/taskgraph_templates_codegen.cpp b/clang/test/OpenMP/taskgraph_templates_codegen.cpp
new file mode 100644
index 0000000000000..16765bb078f4e
--- /dev/null
+++ b/clang/test/OpenMP/taskgraph_templates_codegen.cpp
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// Exercises taskgraph codegen with orthogonal language/runtime features:
+//  - C++ templates + non-trivial firstprivate cloning.
+//  - taskgroup task_reduction / task in_reduction inside taskgraph.
+//  - taskwait depend(...) inside taskgraph (task-generating path).
+
+template <typename T>
+struct Box {
+  T v;
+  Box();
+  Box(const Box &);
+  ~Box();
+};
+
+template <typename T>
+void templated_clone(T seed) {
+  Box<T> B;
+  B.v = seed;
+#pragma omp taskgraph
+  {
+#pragma omp task firstprivate(B)
+    {
+      (void)B.v;
+    }
+
+#pragma omp task replayable(false) firstprivate(B)
+    {
+      (void)B.v;
+    }
+  }
+}
+
+template <typename T>
+T templated_task_reduction(T seed) {
+  T Acc = seed;
+#pragma omp taskgraph
+  {
+#pragma omp taskgroup task_reduction(+: Acc)
+    {
+#pragma omp task in_reduction(+: Acc)
+      {
+        Acc += seed;
+      }
+    }
+#pragma omp taskwait depend(in: Acc)
+  }
+  return Acc;
+}
+
+int main() {
+  templated_clone<int>(7);
+  return templated_task_reduction<int>(1);
+}
+
+// The taskgraph task entry uses the taskgraph runtime path and carries a
+// clone helper for non-trivial firstprivate copies.
+// CHECK: call i32 @__kmpc_taskgraph_task(
+// CHECK-SAME: ptr @.omp_task_clone.)
+// CHECK: define internal void @.omp_task_clone.(
+
+// task_reduction in a taskgraph uses the dedicated taskgraph reduction init.
+// CHECK: call ptr @__kmpc_taskgraph_taskred_init(
+
+// taskwait depend(...) inside taskgraph uses the dedicated taskgraph taskwait
+// entry point instead of the generic taskwait runtime path.
+// CHECK: call void @__kmpc_taskgraph_taskwait(
+
+#endif



More information about the Openmp-commits mailing list