[clang] [llvm] [openmp] [OpenMP] OpenMP 6.0 taskgraph support (WIP) (PR #188765)

via cfe-commits cfe-commits at lists.llvm.org
Thu Mar 26 08:09:47 PDT 2026


github-actions[bot] wrote:

<!--LLVM CODE FORMAT COMMENT: {clang-format}-->


:warning: C/C++ code formatter, clang-format found issues in your code. :warning:

<details>
<summary>
You can test this locally with the following command:
</summary>

``````````bash
git-clang-format --diff origin/main HEAD --extensions h,cpp -- clang/test/OpenMP/taskgraph_ast_print.cpp clang/test/OpenMP/taskgraph_codegen.cpp openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp openmp/runtime/test/tasking/omp_record_replay_random_id.cpp openmp/runtime/test/tasking/omp_record_replay_reset.cpp openmp/runtime/test/tasking/omp_taskgraph.cpp openmp/runtime/test/tasking/omp_taskgraph_deps.cpp openmp/runtime/test/tasking/omp_taskgraph_multiTDGs.cpp openmp/runtime/test/tasking/omp_taskgraph_taskloop.cpp clang/include/clang-c/Index.h clang/include/clang/AST/OpenMPClause.h clang/include/clang/AST/RecursiveASTVisitor.h 
clang/include/clang/AST/StmtOpenMP.h clang/include/clang/Sema/SemaOpenMP.h clang/include/clang/Serialization/ASTBitCodes.h clang/lib/AST/OpenMPClause.cpp clang/lib/AST/StmtOpenMP.cpp clang/lib/AST/StmtPrinter.cpp clang/lib/AST/StmtProfile.cpp clang/lib/Basic/OpenMPKinds.cpp clang/lib/CodeGen/CGOpenMPRuntime.cpp clang/lib/CodeGen/CGOpenMPRuntime.h clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp clang/lib/CodeGen/CGStmt.cpp clang/lib/CodeGen/CGStmtOpenMP.cpp clang/lib/CodeGen/CodeGenFunction.h clang/lib/Parse/ParseOpenMP.cpp clang/lib/Sema/SemaExceptionSpec.cpp clang/lib/Sema/SemaOpenMP.cpp clang/lib/Sema/TreeTransform.h clang/lib/Serialization/ASTReader.cpp clang/lib/Serialization/ASTReaderStmt.cpp clang/lib/Serialization/ASTWriter.cpp clang/lib/Serialization/ASTWriterStmt.cpp clang/lib/StaticAnalyzer/Core/ExprEngine.cpp clang/tools/libclang/CIndex.cpp clang/tools/libclang/CXCursor.cpp openmp/runtime/src/kmp.h openmp/runtime/src/kmp_debug.h openmp/runtime/src/kmp_global.cpp openmp/runtime/src/kmp_settings.cpp openmp/runtime/src/kmp_taskdeps.cpp openmp/runtime/src/kmp_taskdeps.h openmp/runtime/src/kmp_tasking.cpp --diff_from_common_commit
``````````

:warning:
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing `origin/main` to the base branch/commit you want to compare against.
:warning:

</details>

<details>
<summary>
View the diff from clang-format here.
</summary>

``````````diff
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index c327617c2..872ab0e18 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -3635,7 +3635,8 @@ bool RecursiveASTVisitor<Derived>::VisitOMPNowaitClause(OMPNowaitClause *C) {
 }
 
 template <typename Derived>
-bool RecursiveASTVisitor<Derived>::VisitOMPReplayableClause(OMPReplayableClause *C) {
+bool RecursiveASTVisitor<Derived>::VisitOMPReplayableClause(
+    OMPReplayableClause *C) {
   TRY_TO(TraverseStmt(C->getCondition()));
   return true;
 }
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 85342658e..00353ea12 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -2251,8 +2251,7 @@ void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
     return;
 
   // The nogroup clause doesn't support an argument yet.  FIXME.
-  const OMPNogroupClause *NoGroupClause =
-    D.getSingleClause<OMPNogroupClause>();
+  const OMPNogroupClause *NoGroupClause = D.getSingleClause<OMPNogroupClause>();
   llvm::Value *NoGroup;
   if (NoGroupClause) {
     NoGroup = CGF.Builder.getInt32(1);
@@ -2267,7 +2266,7 @@ void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
     const Expr *Cond = GraphResetClause->getCondition();
     llvm::Value *CondVal = CGF.EvaluateExprAsBool(Cond);
     GraphReset =
-      CGF.Builder.CreateIntCast(CondVal, CGF.IntTy, /*isSigned=*/true);
+        CGF.Builder.CreateIntCast(CondVal, CGF.IntTy, /*isSigned=*/true);
   } else {
     GraphReset = CGF.Builder.getInt32(0);
   }
@@ -2278,7 +2277,7 @@ void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
     const auto *E = GraphIdClause->getCondition();
     auto *GraphIdVal = CGF.EmitScalarExpr(E);
     GraphId =
-      CGF.Builder.CreateIntCast(GraphIdVal, CGM.Int32Ty, /*isSigned=*/false);
+        CGF.Builder.CreateIntCast(GraphIdVal, CGM.Int32Ty, /*isSigned=*/false);
   }
 
   CodeGenFunction OutlinedCGF(CGM, /*suppressNewContext=*/true);
@@ -2299,12 +2298,10 @@ void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
 
   // Create an internal-linkage global variable to hold the taskgraph handle.
   std::string GraphHandleName = getName({"omp", "taskgraph", "handle"});
-  auto *GraphHandle =
-    new llvm::GlobalVariable(CGM.getModule(), CGM.VoidPtrTy,
-                             /*IsConstant=*/false,
-                             llvm::GlobalValue::InternalLinkage,
-                             llvm::Constant::getNullValue(CGM.VoidPtrTy),
-                             GraphHandleName);
+  auto *GraphHandle = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.VoidPtrTy,
+      /*IsConstant=*/false, llvm::GlobalValue::InternalLinkage,
+      llvm::Constant::getNullValue(CGM.VoidPtrTy), GraphHandleName);
 
   std::array<llvm::Value *, 8> Args{
       emitUpdateLocation(CGF, Loc),
@@ -2319,19 +2316,19 @@ void CGOpenMPRuntime::emitTaskgraphCall(CodeGenFunction &CGF,
 
   auto &&ThenGen = [&CGF, this, &Args](CodeGenFunction &, PrePostActionTy &) {
     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-          CGM.getModule(), OMPRTL___kmpc_taskgraph),
-        Args);
+                            CGM.getModule(), OMPRTL___kmpc_taskgraph),
+                        Args);
   };
-  auto &&ElseGen = [&CGF, this, &FnT, &CapStruct, &Loc, &OutlinedCGF]
-    (CodeGenFunction &, PrePostActionTy &) {
-      llvm::Value *CapturedArgsPtr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-          CapStruct.getPointer(OutlinedCGF), CGM.VoidPtrTy);
+  auto &&ElseGen = [&CGF, this, &FnT, &CapStruct, &Loc,
+                    &OutlinedCGF](CodeGenFunction &, PrePostActionTy &) {
+    llvm::Value *CapturedArgsPtr =
+        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+            CapStruct.getPointer(OutlinedCGF), CGM.VoidPtrTy);
 
     auto &&CodeGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
       Action.Enter(CGF);
-      CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall(CGF, Loc,
-          FnT,
-          CapturedArgsPtr);
+      CGF.CGM.getOpenMPRuntime().emitOutlinedFunctionCall(CGF, Loc, FnT,
+                                                          CapturedArgsPtr);
     };
     RegionCodeGenTy RCG(CodeGen);
     RCG(CGF);
@@ -3878,13 +3875,11 @@ static void getKmpAffinityType(ASTContext &C, QualType &KmpTaskAffinityInfoTy) {
   }
 }
 
-CGOpenMPRuntime::TaskResultTy
-CGOpenMPRuntime::emitTaskInit(CodeGenFunction &CGF, SourceLocation Loc,
-                              const OMPExecutableDirective &D,
-                              llvm::Function *TaskFunction, QualType SharedsTy,
-                              Address Shareds, const OMPTaskDataTy &Data,
-                              bool ForTaskgraph,
-                              std::array<llvm::Value *, 3> &TaskAllocArgs) {
+CGOpenMPRuntime::TaskResultTy CGOpenMPRuntime::emitTaskInit(
+    CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D,
+    llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
+    const OMPTaskDataTy &Data, bool ForTaskgraph,
+    std::array<llvm::Value *, 3> &TaskAllocArgs) {
   ASTContext &C = CGM.getContext();
   llvm::SmallVector<PrivateDataTy, 4> Privates;
   // Aggregate privates and sort them by the alignment.
@@ -4784,35 +4779,31 @@ void CGOpenMPRuntime::emitUpdateClause(CodeGenFunction &CGF, LValue DepobjLVal,
   CGF.EmitBlock(DoneBB, /*IsFinished=*/true);
 }
 
-void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
-                                   const OMPExecutableDirective &D,
-                                   llvm::Function *TaskFunction,
-                                   QualType SharedsTy, Address Shareds,
-                                   const Expr *IfCond,
-                                   const Expr *ReplayableCond,
-                                   const OMPTaskDataTy &Data) {
+void CGOpenMPRuntime::emitTaskCall(
+    CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D,
+    llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
+    const Expr *IfCond, const Expr *ReplayableCond, const OMPTaskDataTy &Data) {
   if (!CGF.HaveInsertPoint())
     return;
 
-  auto &&TaskgraphTaskCodeGen =
-      [this, &Loc, &D, TaskFunction, &SharedsTy, &Shareds, &Data]
-        (CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&TaskgraphTaskCodeGen = [this, &Loc, &D, TaskFunction, &SharedsTy,
+                                 &Shareds, &Data](CodeGenFunction &CGF,
+                                                  PrePostActionTy &) {
     llvm::Value *ThreadId = getThreadID(CGF, Loc);
     llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
     std::array<llvm::Value *, 9> TGTaskArgs;
     std::array<llvm::Value *, 3> TaskAllocArgs;
-    TaskResultTy Result =
-        emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data, true,
-                      TaskAllocArgs);
+    TaskResultTy Result = emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy,
+                                       Shareds, Data, true, TaskAllocArgs);
     Address DependenciesArray = Address::invalid();
     llvm::Value *NumOfElements;
     std::tie(NumOfElements, DependenciesArray) =
         emitDependClause(CGF, Data.Dependences, Loc);
-    //llvm::dbgs() << "SharedsTy:\n";
+    // llvm::dbgs() << "SharedsTy:\n";
     TGTaskArgs[0] = UpLoc;
     TGTaskArgs[1] = ThreadId;
     TGTaskArgs[2] = Result.NewTask;
-    //TGTaskArgs[2] = TaskgraphRegion->getTaskgraphValue();
+    // TGTaskArgs[2] = TaskgraphRegion->getTaskgraphValue();
     TGTaskArgs[3] = TaskAllocArgs[0]; // TaskFlags
     TGTaskArgs[4] = TaskAllocArgs[1]; // KmpTaskTWithPrivatesTySize
     TGTaskArgs[5] = Shareds.emitRawPointer(CGF);
@@ -4832,16 +4823,17 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
       TGTaskArgs[8] = DependenciesArray.emitRawPointer(CGF);
     }
     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-          CGM.getModule(), OMPRTL___kmpc_taskgraph_task),
-        TGTaskArgs);
+                            CGM.getModule(), OMPRTL___kmpc_taskgraph_task),
+                        TGTaskArgs);
   };
 
-  auto &&NonTaskgraphTaskCodeGen =
-      [this, &Loc, &D, TaskFunction, &SharedsTy, &Shareds, IfCond, &Data]
-        (CodeGenFunction &CGF, PrePostActionTy &) {
-    std::array<llvm::Value*, 3> DummyArray;
-    TaskResultTy Result =
-        emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data, false, DummyArray);
+  auto &&NonTaskgraphTaskCodeGen = [this, &Loc, &D, TaskFunction, &SharedsTy,
+                                    &Shareds, IfCond,
+                                    &Data](CodeGenFunction &CGF,
+                                           PrePostActionTy &) {
+    std::array<llvm::Value *, 3> DummyArray;
+    TaskResultTy Result = emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy,
+                                       Shareds, Data, false, DummyArray);
     llvm::Value *NewTask = Result.NewTask;
     llvm::Function *TaskEntry = Result.TaskEntry;
     llvm::Value *NewTaskNewTaskTTy = Result.NewTaskNewTaskTTy;
@@ -4853,15 +4845,15 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
     std::tie(NumOfElements, DependenciesArray) =
         emitDependClause(CGF, Data.Dependences, Loc);
 
-    // NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
-    // libcall.
-    // Build kmp_int32 __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid,
-    // kmp_task_t *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
-    // kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence
-    // list is not empty
+    // NOTE: routine and part_id fields are initialized by
+    // __kmpc_omp_task_alloc() libcall. Build kmp_int32
+    // __kmpc_omp_task_with_deps(ident_t *, kmp_int32 gtid, kmp_task_t
+    // *new_task, kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
+    // ndeps_noalias, kmp_depend_info_t *noalias_dep_list) if dependence list is
+    // not empty
     llvm::Value *ThreadID = getThreadID(CGF, Loc);
     llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
-    llvm::Value *TaskArgs[] = { UpLoc, ThreadID, NewTask };
+    llvm::Value *TaskArgs[] = {UpLoc, ThreadID, NewTask};
     llvm::Value *DepTaskArgs[7];
     if (!Data.Dependences.empty()) {
       DepTaskArgs[0] = UpLoc;
@@ -4873,7 +4865,8 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
       DepTaskArgs[6] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
     }
     auto &&ThenCodeGen = [this, &Data, TDBase, KmpTaskTQTyRD, &TaskArgs,
-                          &DepTaskArgs](CodeGenFunction &CGF, PrePostActionTy &) {
+                          &DepTaskArgs](CodeGenFunction &CGF,
+                                        PrePostActionTy &) {
       if (!Data.Tied) {
         auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId);
         LValue PartIdLVal = CGF.EmitLValueForField(TDBase, *PartIdFI);
@@ -4969,13 +4962,10 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
   }
 }
 
-void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
-                                       const OMPLoopDirective &D,
-                                       llvm::Function *TaskFunction,
-                                       QualType SharedsTy, Address Shareds,
-                                       const Expr *IfCond,
-                                       const Expr *ReplayableCond,
-                                       const OMPTaskDataTy &Data) {
+void CGOpenMPRuntime::emitTaskLoopCall(
+    CodeGenFunction &CGF, SourceLocation Loc, const OMPLoopDirective &D,
+    llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
+    const Expr *IfCond, const Expr *ReplayableCond, const OMPTaskDataTy &Data) {
   if (!CGF.HaveInsertPoint())
     return;
 
@@ -4989,16 +4979,16 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
 
   enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 };
 
-  auto &&TaskgraphTaskloopCodeGen =
-      [this, &Loc, &D, TaskFunction, &SharedsTy, &Shareds, IfVal, &Data]
-        (CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&TaskgraphTaskloopCodeGen = [this, &Loc, &D, TaskFunction, &SharedsTy,
+                                     &Shareds, IfVal,
+                                     &Data](CodeGenFunction &CGF,
+                                            PrePostActionTy &) {
     llvm::Value *ThreadId = getThreadID(CGF, Loc);
     llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
     std::array<llvm::Value *, 16> TGTaskLoopArgs;
     std::array<llvm::Value *, 3> TaskAllocArgs;
-    TaskResultTy Result =
-        emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data, true,
-                     TaskAllocArgs);
+    TaskResultTy Result = emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy,
+                                       Shareds, Data, true, TaskAllocArgs);
 
     // This is all copy/pasted from below. Refactor!
     LValue LBLVal = CGF.EmitLValueForField(
@@ -5006,22 +4996,25 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
         *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound));
     const auto *LBVar =
         cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
-    CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(), LBLVal.getQuals(),
-                        /*IsInitializer=*/true);
+    CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(),
+                         LBLVal.getQuals(),
+                         /*IsInitializer=*/true);
     LValue UBLVal = CGF.EmitLValueForField(
         Result.TDBase,
         *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound));
     const auto *UBVar =
         cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
-    CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(), UBLVal.getQuals(),
-                        /*IsInitializer=*/true);
+    CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(),
+                         UBLVal.getQuals(),
+                         /*IsInitializer=*/true);
     LValue StLVal = CGF.EmitLValueForField(
         Result.TDBase,
         *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTStride));
     const auto *StVar =
         cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
-    CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(),
-                        /*IsInitializer=*/true);
+    CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(),
+                         StLVal.getQuals(),
+                         /*IsInitializer=*/true);
     // Store reductions address.
     LValue RedLVal = CGF.EmitLValueForField(
         Result.TDBase,
@@ -5030,7 +5023,7 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
       CGF.EmitStoreOfScalar(Data.Reductions, RedLVal);
     } else {
       CGF.EmitNullInitialization(RedLVal.getAddress(),
-                                CGF.getContext().VoidPtrTy);
+                                 CGF.getContext().VoidPtrTy);
     }
 
     TGTaskLoopArgs[0] = UpLoc;
@@ -5044,34 +5037,40 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
     TGTaskLoopArgs[8] = LBLVal.getPointer(CGF);
     TGTaskLoopArgs[9] = UBLVal.getPointer(CGF);
     TGTaskLoopArgs[10] = CGF.EmitLoadOfScalar(StLVal, Loc);
-    TGTaskLoopArgs[11] = llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0);
-    TGTaskLoopArgs[12] = llvm::ConstantInt::getSigned(CGF.IntTy, Data.Schedule.getPointer()
-                           ? Data.Schedule.getInt() ? NumTasks : Grainsize
-                           : NoSchedule);
-    TGTaskLoopArgs[13] = Data.Schedule.getPointer()
-                           ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty, /*isSigned=*/false)
-                           : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0);
-    TGTaskLoopArgs[14] = llvm::ConstantInt::getSigned(CGF.IntTy, Data.HasModifier ? 1 : 0);
+    TGTaskLoopArgs[11] =
+        llvm::ConstantInt::getSigned(CGF.IntTy, Data.Nogroup ? 1 : 0);
+    TGTaskLoopArgs[12] = llvm::ConstantInt::getSigned(
+        CGF.IntTy, Data.Schedule.getPointer()
+                       ? Data.Schedule.getInt() ? NumTasks : Grainsize
+                       : NoSchedule);
+    TGTaskLoopArgs[13] =
+        Data.Schedule.getPointer()
+            ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
+                                        /*isSigned=*/false)
+            : llvm::ConstantInt::get(CGF.Int64Ty, /*V=*/0);
+    TGTaskLoopArgs[14] =
+        llvm::ConstantInt::getSigned(CGF.IntTy, Data.HasModifier ? 1 : 0);
     TGTaskLoopArgs[15] = Result.TaskDupFn
-                           ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-                              Result.TaskDupFn, CGF.VoidPtrTy)
-                           : llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+                             ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+                                   Result.TaskDupFn, CGF.VoidPtrTy)
+                             : llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-        CGM.getModule(), OMPRTL___kmpc_taskgraph_taskloop),
-        TGTaskLoopArgs);
+                            CGM.getModule(), OMPRTL___kmpc_taskgraph_taskloop),
+                        TGTaskLoopArgs);
   };
 
-  auto &&NonTaskgraphTaskloopCodeGen =
-      [this, &Loc, &D, TaskFunction, &SharedsTy, &Shareds, IfVal, &Data]
-        (CodeGenFunction &CGF, PrePostActionTy &) {
-    std::array<llvm::Value*, 3> DummyArray;
-    TaskResultTy Result =
-        emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy, Shareds, Data, false, DummyArray);
-    // NOTE: routine and part_id fields are initialized by __kmpc_omp_task_alloc()
-    // libcall.
-    // Call to void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int
-    // if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, int
-    // sched, kmp_uint64 grainsize, void *task_dup);
+  auto &&NonTaskgraphTaskloopCodeGen = [this, &Loc, &D, TaskFunction,
+                                        &SharedsTy, &Shareds, IfVal,
+                                        &Data](CodeGenFunction &CGF,
+                                               PrePostActionTy &) {
+    std::array<llvm::Value *, 3> DummyArray;
+    TaskResultTy Result = emitTaskInit(CGF, Loc, D, TaskFunction, SharedsTy,
+                                       Shareds, Data, false, DummyArray);
+    // NOTE: routine and part_id fields are initialized by
+    // __kmpc_omp_task_alloc() libcall. Call to void __kmpc_taskloop(ident_t
+    // *loc, int gtid, kmp_task_t *task, int if_val, kmp_uint64 *lb, kmp_uint64
+    // *ub, kmp_int64 st, int nogroup, int sched, kmp_uint64 grainsize, void
+    // *task_dup);
     llvm::Value *ThreadID = getThreadID(CGF, Loc);
     llvm::Value *UpLoc = emitUpdateLocation(CGF, Loc);
 
@@ -5080,22 +5079,25 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
         *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound));
     const auto *LBVar =
         cast<VarDecl>(cast<DeclRefExpr>(D.getLowerBoundVariable())->getDecl());
-    CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(), LBLVal.getQuals(),
-                        /*IsInitializer=*/true);
+    CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(),
+                         LBLVal.getQuals(),
+                         /*IsInitializer=*/true);
     LValue UBLVal = CGF.EmitLValueForField(
         Result.TDBase,
         *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound));
     const auto *UBVar =
         cast<VarDecl>(cast<DeclRefExpr>(D.getUpperBoundVariable())->getDecl());
-    CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(), UBLVal.getQuals(),
-                        /*IsInitializer=*/true);
+    CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(),
+                         UBLVal.getQuals(),
+                         /*IsInitializer=*/true);
     LValue StLVal = CGF.EmitLValueForField(
         Result.TDBase,
         *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTStride));
     const auto *StVar =
         cast<VarDecl>(cast<DeclRefExpr>(D.getStrideVariable())->getDecl());
-    CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(),
-                        /*IsInitializer=*/true);
+    CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(),
+                         StLVal.getQuals(),
+                         /*IsInitializer=*/true);
     // Store reductions address.
     LValue RedLVal = CGF.EmitLValueForField(
         Result.TDBase,
@@ -5104,7 +5106,7 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
       CGF.EmitStoreOfScalar(Data.Reductions, RedLVal);
     } else {
       CGF.EmitNullInitialization(RedLVal.getAddress(),
-                                CGF.getContext().VoidPtrTy);
+                                 CGF.getContext().VoidPtrTy);
     }
     llvm::SmallVector<llvm::Value *, 12> TaskArgs{
         UpLoc,
@@ -5118,8 +5120,8 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
             CGF.IntTy, 1), // Always 1 because taskgroup emitted by the compiler
         llvm::ConstantInt::getSigned(
             CGF.IntTy, Data.Schedule.getPointer()
-                          ? Data.Schedule.getInt() ? NumTasks : Grainsize
-                          : NoSchedule),
+                           ? Data.Schedule.getInt() ? NumTasks : Grainsize
+                           : NoSchedule),
         Data.Schedule.getPointer()
             ? CGF.Builder.CreateIntCast(Data.Schedule.getPointer(), CGF.Int64Ty,
                                         /*isSigned=*/false)
@@ -5128,13 +5130,13 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
       TaskArgs.push_back(llvm::ConstantInt::get(CGF.Int32Ty, 1));
 
     TaskArgs.push_back(Result.TaskDupFn
-                          ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-                                Result.TaskDupFn, CGF.VoidPtrTy)
-                          : llvm::ConstantPointerNull::get(CGF.VoidPtrTy));
+                           ? CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
+                                 Result.TaskDupFn, CGF.VoidPtrTy)
+                           : llvm::ConstantPointerNull::get(CGF.VoidPtrTy));
     CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                             CGM.getModule(), Data.HasModifier
-                                                ? OMPRTL___kmpc_taskloop_5
-                                                : OMPRTL___kmpc_taskloop),
+                                                 ? OMPRTL___kmpc_taskloop_5
+                                                 : OMPRTL___kmpc_taskloop),
                         TaskArgs);
   };
 
@@ -6290,8 +6292,8 @@ llvm::Value *CGOpenMPRuntime::emitTaskReductionInit(
         Args);
   else
     return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                                  CGM.getModule(), OMPRTL___kmpc_taskred_init),
-                              Args);
+                                   CGM.getModule(), OMPRTL___kmpc_taskred_init),
+                               Args);
 }
 
 void CGOpenMPRuntime::emitTaskReductionFini(CodeGenFunction &CGF,
@@ -6368,57 +6370,59 @@ void CGOpenMPRuntime::emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
         emitDependClause(CGF, Data.Dependences, Loc);
 
     auto &&TaskgraphTaskwaitCodeGen =
-        [this, UpLoc, ThreadID, NumOfElements, &DependenciesArray, &Data]
-          (CodeGenFunction &CGF, PrePostActionTy &) {
-      llvm::Value *TGTaskWaitArgs[5];
-      TGTaskWaitArgs[0] = UpLoc;
-      TGTaskWaitArgs[1] = ThreadID;
-      TGTaskWaitArgs[2] = NumOfElements;
-      if (Data.Dependences.empty()) {
-        // This should be a proper error
-        fprintf(stderr, "*** Taskwait inside taskgraph with no depend clause is not task-generating\n");
-        exit(1);
-      }
-      TGTaskWaitArgs[3] = DependenciesArray.emitRawPointer(CGF);
-      TGTaskWaitArgs[4] =
-          llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
-      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-          CGM.getModule(), OMPRTL___kmpc_taskgraph_taskwait),
-          TGTaskWaitArgs);
-    };
+        [this, UpLoc, ThreadID, NumOfElements, &DependenciesArray,
+         &Data](CodeGenFunction &CGF, PrePostActionTy &) {
+          llvm::Value *TGTaskWaitArgs[5];
+          TGTaskWaitArgs[0] = UpLoc;
+          TGTaskWaitArgs[1] = ThreadID;
+          TGTaskWaitArgs[2] = NumOfElements;
+          if (Data.Dependences.empty()) {
+            // This should be a proper error
+            fprintf(stderr, "*** Taskwait inside taskgraph with no depend "
+                            "clause is not task-generating\n");
+            exit(1);
+          }
+          TGTaskWaitArgs[3] = DependenciesArray.emitRawPointer(CGF);
+          TGTaskWaitArgs[4] =
+              llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
+          CGF.EmitRuntimeCall(
+              OMPBuilder.getOrCreateRuntimeFunction(
+                  CGM.getModule(), OMPRTL___kmpc_taskgraph_taskwait),
+              TGTaskWaitArgs);
+        };
     auto &&NonTaskgraphTaskwaitCodeGen =
-        [this, UpLoc, ThreadID, NumOfElements, &DependenciesArray, &M, &Data]
-          (CodeGenFunction &CGF, PrePostActionTy &) {
-      if (!Data.Dependences.empty()) {
-        llvm::Value *DepWaitTaskArgs[7];
-        DepWaitTaskArgs[0] = UpLoc;
-        DepWaitTaskArgs[1] = ThreadID;
-        DepWaitTaskArgs[2] = NumOfElements;
-        DepWaitTaskArgs[3] = DependenciesArray.emitRawPointer(CGF);
-        DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
-        DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
-        DepWaitTaskArgs[6] =
-            llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
-
-        CodeGenFunction::RunCleanupsScope LocalScope(CGF);
-
-        // Build void __kmpc_omp_taskwait_deps_51(ident_t *, kmp_int32 gtid,
-        // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
-        // ndeps_noalias, kmp_depend_info_t *noalias_dep_list,
-        // kmp_int32 has_no_wait); if dependence info is specified.
-        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                                M, OMPRTL___kmpc_omp_taskwait_deps_51),
-                            DepWaitTaskArgs);
-      } else {
-        // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
-        // global_tid);
-        llvm::Value *Args[] = {UpLoc, ThreadID};
-        // Ignore return result until untied tasks are supported.
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_omp_taskwait),
-            Args);
-      }
-    };
+        [this, UpLoc, ThreadID, NumOfElements, &DependenciesArray, &M,
+         &Data](CodeGenFunction &CGF, PrePostActionTy &) {
+          if (!Data.Dependences.empty()) {
+            llvm::Value *DepWaitTaskArgs[7];
+            DepWaitTaskArgs[0] = UpLoc;
+            DepWaitTaskArgs[1] = ThreadID;
+            DepWaitTaskArgs[2] = NumOfElements;
+            DepWaitTaskArgs[3] = DependenciesArray.emitRawPointer(CGF);
+            DepWaitTaskArgs[4] = CGF.Builder.getInt32(0);
+            DepWaitTaskArgs[5] = llvm::ConstantPointerNull::get(CGF.VoidPtrTy);
+            DepWaitTaskArgs[6] =
+                llvm::ConstantInt::get(CGF.Int32Ty, Data.HasNowaitClause);
+
+            CodeGenFunction::RunCleanupsScope LocalScope(CGF);
+
+            // Build void __kmpc_omp_taskwait_deps_51(ident_t *, kmp_int32 gtid,
+            // kmp_int32 ndeps, kmp_depend_info_t *dep_list, kmp_int32
+            // ndeps_noalias, kmp_depend_info_t *noalias_dep_list,
+            // kmp_int32 has_no_wait); if dependence info is specified.
+            CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                    M, OMPRTL___kmpc_omp_taskwait_deps_51),
+                                DepWaitTaskArgs);
+          } else {
+            // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
+            // global_tid);
+            llvm::Value *Args[] = {UpLoc, ThreadID};
+            // Ignore return result until untied tasks are supported.
+            CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                    M, OMPRTL___kmpc_omp_taskwait),
+                                Args);
+          }
+        };
 
     if (CGF.getOMPWithinTaskgraph()) {
       // Lexically within taskgraph, always replayable.
@@ -13587,13 +13591,10 @@ void CGOpenMPSIMDRuntime::emitFlush(CodeGenFunction &CGF,
   llvm_unreachable("Not supported in SIMD-only mode");
 }
 
-void CGOpenMPSIMDRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc,
-                                       const OMPExecutableDirective &D,
-                                       llvm::Function *TaskFunction,
-                                       QualType SharedsTy, Address Shareds,
-                                       const Expr *IfCond,
-                                       const Expr *ReplayableCond,
-                                       const OMPTaskDataTy &Data) {
+void CGOpenMPSIMDRuntime::emitTaskCall(
+    CodeGenFunction &CGF, SourceLocation Loc, const OMPExecutableDirective &D,
+    llvm::Function *TaskFunction, QualType SharedsTy, Address Shareds,
+    const Expr *IfCond, const Expr *ReplayableCond, const OMPTaskDataTy &Data) {
   llvm_unreachable("Not supported in SIMD-only mode");
 }
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 7ac06547a..abcc86fcb 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -585,7 +585,7 @@ protected:
                             llvm::Function *TaskFunction, QualType SharedsTy,
                             Address Shareds, const OMPTaskDataTy &Data,
                             bool ForTaskgraph,
-                            std::array<llvm::Value*, 3> &TaskAllocArgs);
+                            std::array<llvm::Value *, 3> &TaskAllocArgs);
 
   /// Emit update for lastprivate conditional data.
   void emitLastprivateConditionalUpdate(CodeGenFunction &CGF, LValue IVLVal,
@@ -1388,7 +1388,8 @@ public:
 
   /// Emit code for 'taskgraph' directive.
   virtual void emitTaskgraphCall(CodeGenFunction &CGF, SourceLocation Loc,
-                                 const OMPExecutableDirective &D, const Expr *IfCond);
+                                 const OMPExecutableDirective &D,
+                                 const Expr *IfCond);
 
   /// Emit code for 'cancellation point' construct.
   /// \param CancelRegion Region kind for which the cancellation point must be
@@ -2097,8 +2098,8 @@ public:
   void emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc,
                         const OMPLoopDirective &D, llvm::Function *TaskFunction,
                         QualType SharedsTy, Address Shareds, const Expr *IfCond,
-                        const Expr *ReplayableCond, const OMPTaskDataTy &Data)
-                        override;
+                        const Expr *ReplayableCond,
+                        const OMPTaskDataTy &Data) override;
 
   /// Emit a code for reduction clause. Next code should be emitted for
   /// reduction:
@@ -2218,16 +2219,15 @@ public:
 
   /// Emit code for 'taskwait' directive.
   void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc,
-                        const Expr *ReplayableCond, const OMPTaskDataTy &Data)
-                        override;
+                        const Expr *ReplayableCond,
+                        const OMPTaskDataTy &Data) override;
 
   /// Emit code for 'taskgraph' directive.
   /// \param IfCond Expression evaluated in if clause associated with the target
   /// \param D Directive to emit.
   void emitTaskgraphCall(CodeGenFunction &CGF, SourceLocation Loc,
                          const OMPExecutableDirective &D,
-                         const Expr *IfCond
-                        ) override;
+                         const Expr *IfCond) override;
 
   /// Emit code for 'cancellation point' construct.
   /// \param CancelRegion Region kind for which the cancellation point must be
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 30728e5ad..98eed360f 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -5707,9 +5707,9 @@ void CodeGenFunction::EmitOMPTargetTaskBasedDirective(
   IntegerLiteral IfCond(getContext(), TrueOrFalse,
                         getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
                         SourceLocation());
-  IntegerLiteral ReplayableCond(getContext(), llvm::APInt(32, 1),
-                                getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
-                                SourceLocation());
+  IntegerLiteral ReplayableCond(
+      getContext(), llvm::APInt(32, 1),
+      getContext().getIntTypeForBitwidth(32, /*Signed=*/0), SourceLocation());
   CGM.getOpenMPRuntime().emitTaskCall(*this, S.getBeginLoc(), S, OutlinedFn,
                                       SharedsTy, CapturedStruct, &IfCond,
                                       &ReplayableCond, Data);
@@ -5825,20 +5825,18 @@ void CodeGenFunction::EmitOMPTaskDirective(const OMPTaskDirective &S) {
   if (auto *RC = S.getSingleClause<OMPReplayableClause>()) {
     ReplayableCond = RC->getCondition();
     if (!ReplayableCond) {
-      ReplayableCond =
-        IntegerLiteral::Create(
-            getContext(), llvm::APInt(32, 1),
-            getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
-            SourceLocation());
+      ReplayableCond = IntegerLiteral::Create(
+          getContext(), llvm::APInt(32, 1),
+          getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
+          SourceLocation());
     }
   }
   auto &&BodyGen = [CS](CodeGenFunction &CGF, PrePostActionTy &) {
     CGF.EmitStmt(CS->getCapturedStmt());
   };
-  auto &&TaskGen = [&S, SharedsTy, CapturedStruct,
-                    IfCond, ReplayableCond](CodeGenFunction &CGF,
-                            llvm::Function *OutlinedFn,
-                            const OMPTaskDataTy &Data) {
+  auto &&TaskGen = [&S, SharedsTy, CapturedStruct, IfCond, ReplayableCond](
+                       CodeGenFunction &CGF, llvm::Function *OutlinedFn,
+                       const OMPTaskDataTy &Data) {
     CGF.CGM.getOpenMPRuntime().emitTaskCall(CGF, S.getBeginLoc(), S, OutlinedFn,
                                             SharedsTy, CapturedStruct, IfCond,
                                             ReplayableCond, Data);
@@ -5876,11 +5874,10 @@ void CodeGenFunction::EmitOMPTaskwaitDirective(const OMPTaskwaitDirective &S) {
   if (auto *RC = S.getSingleClause<OMPReplayableClause>()) {
     ReplayableCond = RC->getCondition();
     if (!ReplayableCond) {
-      ReplayableCond =
-        IntegerLiteral::Create(
-            getContext(), llvm::APInt(32, 1),
-            getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
-            SourceLocation());
+      ReplayableCond = IntegerLiteral::Create(
+          getContext(), llvm::APInt(32, 1),
+          getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
+          SourceLocation());
     }
   }
   CGM.getOpenMPRuntime().emitTaskwaitCall(*this, S.getBeginLoc(),
@@ -8243,11 +8240,10 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) {
   if (auto *RC = S.getSingleClause<OMPReplayableClause>()) {
     ReplayableCond = RC->getCondition();
     if (!ReplayableCond) {
-      ReplayableCond =
-        IntegerLiteral::Create(
-            getContext(), llvm::APInt(32, 1),
-            getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
-            SourceLocation());
+      ReplayableCond = IntegerLiteral::Create(
+          getContext(), llvm::APInt(32, 1),
+          getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
+          SourceLocation());
     }
   }
 
@@ -8370,18 +8366,16 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) {
                                (*LIP)->getType(), S.getBeginLoc()));
     });
   };
-  auto &&TaskGen =
-      [&S, SharedsTy, CapturedStruct, IfCond, ReplayableCond]
-        (CodeGenFunction &CGF, llvm::Function *OutlinedFn,
-         const OMPTaskDataTy &Data) {
-    auto &&CodeGen =
-        [&S, OutlinedFn, SharedsTy, CapturedStruct, IfCond, ReplayableCond,
-         &Data](CodeGenFunction &CGF, PrePostActionTy &) {
+  auto &&TaskGen = [&S, SharedsTy, CapturedStruct, IfCond, ReplayableCond](
+                       CodeGenFunction &CGF, llvm::Function *OutlinedFn,
+                       const OMPTaskDataTy &Data) {
+    auto &&CodeGen = [&S, OutlinedFn, SharedsTy, CapturedStruct, IfCond,
+                      ReplayableCond,
+                      &Data](CodeGenFunction &CGF, PrePostActionTy &) {
       OMPLoopScope PreInitScope(CGF, S);
-      CGF.CGM.getOpenMPRuntime().emitTaskLoopCall(CGF, S.getBeginLoc(), S,
-                                                  OutlinedFn, SharedsTy,
-                                                  CapturedStruct, IfCond,
-                                                  ReplayableCond, Data);
+      CGF.CGM.getOpenMPRuntime().emitTaskLoopCall(
+          CGF, S.getBeginLoc(), S, OutlinedFn, SharedsTy, CapturedStruct,
+          IfCond, ReplayableCond, Data);
     };
     CGF.CGM.getOpenMPRuntime().emitInlinedDirective(CGF, OMPD_taskloop,
                                                     CodeGen);
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 0ac35ebe3..b199a1355 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -819,13 +819,12 @@ public:
 
   class OMPWithinTaskgraphRAII {
     CodeGenFunction &CGF;
+
   public:
     OMPWithinTaskgraphRAII(CodeGenFunction &CGF_) : CGF(CGF_) {
       CGF.setOMPWithinTaskgraph(true);
     }
-    ~OMPWithinTaskgraphRAII() {
-      CGF.setOMPWithinTaskgraph(false);
-    }
+    ~OMPWithinTaskgraphRAII() { CGF.setOMPWithinTaskgraph(false); }
   };
 
   template <class T>
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index da1555f02..f98f29fe0 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -3343,8 +3343,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind,
     }
 
     if ((CKind == OMPC_nowait || CKind == OMPC_replayable) &&
-        PP.LookAhead(/*N=*/0).is(tok::l_paren) &&
-        getLangOpts().OpenMP >= 60)
+        PP.LookAhead(/*N=*/0).is(tok::l_paren) && getLangOpts().OpenMP >= 60)
       Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective);
     else
       Clause = ParseOpenMPClause(CKind, WrongDirective);
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 19dc278e1..81f77c080 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -10843,9 +10843,8 @@ TreeTransform<Derived>::TransformOMPReplayableClause(OMPReplayableClause *C) {
     if (Cond.isInvalid())
       return nullptr;
   }
-  return getDerived().RebuildOMPReplayableClause(Cond.get(), C->getBeginLoc(),
-                                                 C->getLParenLoc(),
-                                                 C->getEndLoc());
+  return getDerived().RebuildOMPReplayableClause(
+      Cond.get(), C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc());
 }
 
 template <typename Derived>
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 7c0a7ad58..0643df1b2 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -3127,7 +3127,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   kmp_uint8 th_task_state; // alternating 0/1 for task team identification
   kmp_uint32 th_reap_state; // Non-zero indicates thread is not
   // tasking, thus safe to reap
-  //kmp_taskgraph_record_t *th_taskgraph_recording;
+  // kmp_taskgraph_record_t *th_taskgraph_recording;
 
   /* More stuff for keeping track of active/sleeping threads (this part is
      written by the worker thread) */
@@ -4486,37 +4486,25 @@ KMP_EXPORT void __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid,
                                                 uintptr_t hint);
 #if OMP_TASKGRAPH_EXPERIMENTAL
 KMP_EXPORT void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
-                                 std::atomic<void*> *tdg_handle,
+                                 std::atomic<void *> *tdg_handle,
                                  kmp_uint32 graph_id, kmp_int32 graph_reset,
                                  kmp_int32 nogroup, void (*entry)(void *),
                                  void *args);
-KMP_EXPORT kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
-                                            kmp_task_t *new_task,
-                                            kmp_int32 flags,
-                                            size_t sizeof_kmp_task_t,
-                                            void* shareds,
-                                            size_t sizeof_shareds,
-                                            kmp_int32 ndeps,
-                                            kmp_depend_info_t *dep_list);
-KMP_EXPORT kmp_uint32 __kmpc_taskgraph_taskloop(ident_t *loc_ref,
-                                                kmp_int32 gtid,
-                                                kmp_task_t *new_task,
-                                                kmp_int32 flags,
-                                                size_t sizeof_kmp_task_t,
-                                                void *shareds,
-                                                size_t sizeof_shareds,
-                                                kmp_int32 if_val,
-                                                kmp_uint64 *lb, kmp_uint64 *ub,
-                                                kmp_int64 st, kmp_int32 nogroup,
-                                                kmp_int32 sched,
-                                                kmp_uint64 grainsize,
-                                                kmp_int32 modifier,
-                                                void *task_dup);
+KMP_EXPORT kmp_uint32 __kmpc_taskgraph_task(
+    ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
+    size_t sizeof_kmp_task_t, void *shareds, size_t sizeof_shareds,
+    kmp_int32 ndeps, kmp_depend_info_t *dep_list);
+KMP_EXPORT kmp_uint32 __kmpc_taskgraph_taskloop(
+    ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task, kmp_int32 flags,
+    size_t sizeof_kmp_task_t, void *shareds, size_t sizeof_shareds,
+    kmp_int32 if_val, kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+    kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize,
+    kmp_int32 modifier, void *task_dup);
 KMP_EXPORT void __kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid,
                                           kmp_int32 ndeps,
                                           kmp_depend_info_t *dep_list,
                                           kmp_int32 has_no_wait);
-KMP_EXPORT void* __kmpc_taskgraph_taskred_init(kmp_int32 gtid, kmp_int32 num,
+KMP_EXPORT void *__kmpc_taskgraph_taskred_init(kmp_int32 gtid, kmp_int32 num,
                                                void *data);
 #endif
 /* Interface to fast scalable reduce methods routines */
diff --git a/openmp/runtime/src/kmp_taskdeps.cpp b/openmp/runtime/src/kmp_taskdeps.cpp
index 1f28f747c..0113e002f 100644
--- a/openmp/runtime/src/kmp_taskdeps.cpp
+++ b/openmp/runtime/src/kmp_taskdeps.cpp
@@ -299,7 +299,7 @@ __kmp_depnode_link_successor(kmp_int32 gtid, kmp_info_t *thread,
         if (!dep->dn.successors || dep->dn.successors->node != node) {
           __kmp_track_dependence(gtid, dep, node, task);
           dep->dn.successors =
-            __kmp_add_node<true>(thread, dep->dn.successors, node);
+              __kmp_add_node<true>(thread, dep->dn.successors, node);
           KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                         "%p\n",
                         gtid, KMP_TASK_TO_TASKDATA(dep->dn.task),
@@ -328,7 +328,8 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
     if (sink->dn.task) {
       if (!sink->dn.successors || sink->dn.successors->node != source) {
         __kmp_track_dependence(gtid, sink, source, task);
-        sink->dn.successors = __kmp_add_node<true>(thread, sink->dn.successors, source);
+        sink->dn.successors =
+            __kmp_add_node<true>(thread, sink->dn.successors, source);
         KA_TRACE(40, ("__kmp_process_deps: T#%d adding dependence from %p to "
                     "%p\n",
                     gtid, KMP_TASK_TO_TASKDATA(sink->dn.task),
@@ -341,21 +342,23 @@ static inline kmp_int32 __kmp_depnode_link_successor(kmp_int32 gtid,
   return npredecessors;
 }
 
-kmp_taskgraph_region_dep_t *__kmp_region_deplist_add(kmp_info_t *thread,
-    kmp_taskgraph_region_dep_t **recycled_deps, kmp_taskgraph_region_t *region,
-    kmp_taskgraph_region_dep_t *list) {
+kmp_taskgraph_region_dep_t *__kmp_region_deplist_add(
+    kmp_info_t *thread, kmp_taskgraph_region_dep_t **recycled_deps,
+    kmp_taskgraph_region_t *region, kmp_taskgraph_region_dep_t *list) {
   kmp_taskgraph_region_dep_t *head;
   if (*recycled_deps) {
     head = *recycled_deps;
     *recycled_deps = (*recycled_deps)->next;
   } else
-    head = (kmp_taskgraph_region_dep_t *)__kmp_fast_allocate(thread, sizeof(kmp_taskgraph_region_dep_t));
+    head = (kmp_taskgraph_region_dep_t *)__kmp_fast_allocate(
+        thread, sizeof(kmp_taskgraph_region_dep_t));
   head->region = region;
   head->next = list;
   return head;
 }
 
-kmp_taskgraph_region_t *__kmp_region_worklist_reverse(kmp_taskgraph_region_t *list) {
+kmp_taskgraph_region_t *
+__kmp_region_worklist_reverse(kmp_taskgraph_region_t *list) {
   kmp_taskgraph_region_t *last = nullptr;
   while (list) {
     kmp_taskgraph_region_t *next = list->next;
@@ -366,7 +369,8 @@ kmp_taskgraph_region_t *__kmp_region_worklist_reverse(kmp_taskgraph_region_t *li
   return last;
 }
 
-static kmp_depnode_t *__kmp_find_in_depnode_list(kmp_depnode_t *node, kmp_depnode_list_t *list) {
+static kmp_depnode_t *__kmp_find_in_depnode_list(kmp_depnode_t *node,
+                                                 kmp_depnode_list_t *list) {
   for (; list; list = list->next)
     if (list->node == node)
       return list->node;
@@ -381,38 +385,36 @@ typedef struct kmp_bitset {
   kmp_size_t num_chunks;
 } kmp_bitset_t;
 
-static kmp_bitset_t *
-__kmp_bitset_alloc(kmp_info_t *thread, kmp_size_t bitsize) {
+static kmp_bitset_t *__kmp_bitset_alloc(kmp_info_t *thread,
+                                        kmp_size_t bitsize) {
   kmp_size_t bytesize = (bitsize + 7) / 8;
-  kmp_size_t num_chunks = (bytesize + sizeof(kmp_uint64) - 1) / sizeof(kmp_uint64);
-  kmp_bitset_t *bitset = (kmp_bitset_t *) __kmp_fast_allocate(thread, sizeof(kmp_bitset_t) + sizeof(kmp_uint64) * num_chunks);
-  bitset->bits = (kmp_uint64*) &bitset[1];
+  kmp_size_t num_chunks =
+      (bytesize + sizeof(kmp_uint64) - 1) / sizeof(kmp_uint64);
+  kmp_bitset_t *bitset = (kmp_bitset_t *)__kmp_fast_allocate(
+      thread, sizeof(kmp_bitset_t) + sizeof(kmp_uint64) * num_chunks);
+  bitset->bits = (kmp_uint64 *)&bitset[1];
   memset(bitset->bits, 0, sizeof(kmp_uint64) * num_chunks);
   bitset->bitsize = bitsize;
   bitset->num_chunks = num_chunks;
   return bitset;
 }
 
-static void
-__kmp_bitset_free(kmp_info_t *thread, kmp_bitset_t *bitset) {
+static void __kmp_bitset_free(kmp_info_t *thread, kmp_bitset_t *bitset) {
   __kmp_fast_free(thread, bitset);
 }
 
-static void
-__kmp_bitset_set(kmp_bitset_t *bitset, kmp_size_t bitnum) {
+static void __kmp_bitset_set(kmp_bitset_t *bitset, kmp_size_t bitnum) {
   kmp_size_t chunk = bitnum / (8 * sizeof(kmp_uint64));
   if (bitnum < bitset->bitsize)
     bitset->bits[chunk] |= (kmp_uint64)1 << (bitnum & 63);
 }
 
-static void
-__kmp_bitset_clearall(kmp_bitset_t *bitset) {
+static void __kmp_bitset_clearall(kmp_bitset_t *bitset) {
   if (bitset)
     memset(bitset->bits, 0, sizeof(kmp_int64) * bitset->num_chunks);
 }
 
-static void
-__kmp_bitset_setall(kmp_bitset_t *bitset) {
+static void __kmp_bitset_setall(kmp_bitset_t *bitset) {
   for (kmp_int32 chunk = 0; chunk < bitset->num_chunks - 1; chunk++)
     bitset->bits[chunk] = ~(kmp_uint64)0;
   kmp_int32 last_chunk_numbits = bitset->bitsize & 63;
@@ -422,8 +424,7 @@ __kmp_bitset_setall(kmp_bitset_t *bitset) {
   }
 }
 
-static void
-__kmp_bitset_copy(kmp_bitset_t *dst, const kmp_bitset_t *src) {
+static void __kmp_bitset_copy(kmp_bitset_t *dst, const kmp_bitset_t *src) {
   assert(dst->num_chunks == src->num_chunks);
   assert(dst->bitsize == src->bitsize);
   memcpy(dst->bits, src->bits, sizeof(kmp_uint64) * dst->num_chunks);
@@ -431,8 +432,8 @@ __kmp_bitset_copy(kmp_bitset_t *dst, const kmp_bitset_t *src) {
 
 /// Return TRUE if \c b is a subset of \c a.
 
-static bool
-__kmp_bitset_subset_p(const kmp_bitset_t *a, const kmp_bitset_t *b) {
+static bool __kmp_bitset_subset_p(const kmp_bitset_t *a,
+                                  const kmp_bitset_t *b) {
   if (!b)
     return true;
   kmp_size_t chunk_max = std::max(a->num_chunks, b->num_chunks);
@@ -445,8 +446,8 @@ __kmp_bitset_subset_p(const kmp_bitset_t *a, const kmp_bitset_t *b) {
   return true;
 }
 
-static void
-__kmp_bitset_and(kmp_bitset_t *a, kmp_bitset_t *b, kmp_bitset_t *c) {
+static void __kmp_bitset_and(kmp_bitset_t *a, kmp_bitset_t *b,
+                             kmp_bitset_t *c) {
   kmp_size_t chunk_max = std::max(b->num_chunks, c->num_chunks);
   for (kmp_size_t chunk = 0; chunk < chunk_max; chunk++) {
     kmp_uint64 b_bits = chunk < b->num_chunks ? b->bits[chunk] : 0;
@@ -455,8 +456,8 @@ __kmp_bitset_and(kmp_bitset_t *a, kmp_bitset_t *b, kmp_bitset_t *c) {
   }
 }
 
-static void
-__kmp_bitset_and_not(kmp_bitset_t *a, kmp_bitset_t *b, kmp_bitset_t *c) {
+static void __kmp_bitset_and_not(kmp_bitset_t *a, kmp_bitset_t *b,
+                                 kmp_bitset_t *c) {
   if (!c)
     __kmp_bitset_copy(a, b);
   else {
@@ -469,8 +470,7 @@ __kmp_bitset_and_not(kmp_bitset_t *a, kmp_bitset_t *b, kmp_bitset_t *c) {
   }
 }
 
-static void
-__kmp_bitset_or(kmp_bitset_t *a, kmp_bitset_t *b, kmp_bitset_t *c) {
+static void __kmp_bitset_or(kmp_bitset_t *a, kmp_bitset_t *b, kmp_bitset_t *c) {
   if (!b && !c)
     __kmp_bitset_clearall(a);
   else if (!b)
@@ -487,8 +487,7 @@ __kmp_bitset_or(kmp_bitset_t *a, kmp_bitset_t *b, kmp_bitset_t *c) {
   }
 }
 
-static bool
-__kmp_bitset_empty_p(kmp_bitset_t *bitset) {
+static bool __kmp_bitset_empty_p(kmp_bitset_t *bitset) {
   if (!bitset)
     return true;
   for (kmp_size_t chunk = 0; chunk < bitset->num_chunks; chunk++) {
@@ -501,8 +500,7 @@ __kmp_bitset_empty_p(kmp_bitset_t *bitset) {
 /// Test two bitsets for equality.  Note that any unused bits at the end of the
 /// last chunk are kept as zero.
 
-static bool
-__kmp_bitset_equal(kmp_bitset_t *a, kmp_bitset_t *b) {
+static bool __kmp_bitset_equal(kmp_bitset_t *a, kmp_bitset_t *b) {
   if (!b)
     return __kmp_bitset_empty_p(a);
   kmp_size_t chunk_max = std::max(a->num_chunks, b->num_chunks);
@@ -515,8 +513,7 @@ __kmp_bitset_equal(kmp_bitset_t *a, kmp_bitset_t *b) {
   return true;
 }
 
-static bool
-__kmp_bitset_intersect_p(kmp_bitset_t *a, kmp_bitset_t *b) {
+static bool __kmp_bitset_intersect_p(kmp_bitset_t *a, kmp_bitset_t *b) {
   if (!a || !b)
     return false;
   kmp_size_t chunk_max = std::max(a->num_chunks, b->num_chunks);
@@ -529,8 +526,7 @@ __kmp_bitset_intersect_p(kmp_bitset_t *a, kmp_bitset_t *b) {
   return false;
 }
 
-static kmp_int32
-__kmp_bitset_popcount(kmp_bitset_t *bitset) {
+static kmp_int32 __kmp_bitset_popcount(kmp_bitset_t *bitset) {
   if (!bitset)
     return 0;
   kmp_int32 accum = 0;
@@ -546,9 +542,10 @@ static kmp_int32 __kmp_taskgraph_add_dep(kmp_info_t *thread,
   kmp_int32 npredecessors = 0;
   for (; plist; plist = plist->next) {
     kmp_depnode_t *dep = plist->node;
-    if (!dep->dn.successors || !__kmp_find_in_depnode_list(node, dep->dn.successors)) {
+    if (!dep->dn.successors ||
+        !__kmp_find_in_depnode_list(node, dep->dn.successors)) {
       dep->dn.successors =
-        __kmp_add_node<false>(thread, dep->dn.successors, node);
+          __kmp_add_node<false>(thread, dep->dn.successors, node);
       npredecessors++;
     }
   }
@@ -563,21 +560,21 @@ static kmp_int32 __kmp_taskgraph_add_dep(kmp_info_t *thread,
   kmp_int32 npredecessors = 0;
   if (!sink->dn.successors || sink->dn.successors->node != source) {
     if (!__kmp_find_in_depnode_list(source, sink->dn.successors)) {
-      sink->dn.successors = __kmp_add_node<false>(thread, sink->dn.successors,
-                                                  source);
+      sink->dn.successors =
+          __kmp_add_node<false>(thread, sink->dn.successors, source);
       npredecessors++;
     }
   }
   return npredecessors;
 }
 
-template<typename T>
+template <typename T>
 static inline kmp_int32
 __kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h,
                       bool dep_barrier, kmp_task_t *task) {
   KA_TRACE(30, ("__kmp_process_dep_all<%s>: T#%d processing dep_all, "
-                "dep_barrier = %d\n", T::name,
-                gtid, dep_barrier));
+                "dep_barrier = %d\n",
+                T::name, gtid, dep_barrier));
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_int32 npredecessors = 0;
 
@@ -622,19 +619,19 @@ __kmp_process_dep_all(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *h,
     }
   }
   KA_TRACE(30, ("__kmp_process_dep_all<%s>: T#%d found %d predecessors\n",
-           T::name, gtid, npredecessors));
+                T::name, gtid, npredecessors));
   return npredecessors;
 }
 
-template<typename T>
+template <typename T>
 static inline kmp_int32
 __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
                    bool dep_barrier, kmp_int32 ndeps,
                    kmp_depend_info_t *dep_list, kmp_task_t *task,
                    kmp_int32 &next_mutex_set, bool filter = true) {
   KA_TRACE(30, ("__kmp_process_deps<%s>: T#%d processing %d dependences : "
-                "dep_barrier = %d, filter = %d\n", T::name,
-                gtid, ndeps, dep_barrier, filter));
+                "dep_barrier = %d, filter = %d\n",
+                T::name, gtid, ndeps, dep_barrier, filter));
 
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_int32 npredecessors = 0;
@@ -713,8 +710,9 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t **hash,
       }
     }
   }
-  KA_TRACE(30, ("__kmp_process_deps<%s>: T#%d found %d predecessors (filter: %d)\n",
-                T::name, gtid, npredecessors, filter));
+  KA_TRACE(30,
+           ("__kmp_process_deps<%s>: T#%d found %d predecessors (filter: %d)\n",
+            T::name, gtid, npredecessors, filter));
   return npredecessors;
 }
 
@@ -789,7 +787,7 @@ struct taskgraph_deps {
                                   kmp_task_t *task, kmp_depnode_t *node,
                                   kmp_depnode_list_t *plist);
   static kmp_depnode_t *ref(kmp_depnode_t *node) { return node; }
-  static void deref(kmp_info_t *thread, kmp_depnode_t *node) { }
+  static void deref(kmp_info_t *thread, kmp_depnode_t *node) {}
   static void mutex_dep(kmp_info_t *thread, kmp_dephash_entry_t *info,
                         kmp_depnode_t *node, kmp_int32 &next_mutex_set);
 };
@@ -877,15 +875,13 @@ static size_t __kmp_round_up_to_val(size_t size, size_t val) {
 } // __kmp_round_up_to_val
 
 // FIXME: C++-ify this.
-static kmp_taskgraph_region_t *
-__kmp_taskgraph_region_alloc(kmp_info_t *thread,
-                             kmp_taskgraph_record_t *taskgraph,
-                             kmp_taskgraph_region_t **&alloc_chain,
-                             kmp_taskgraph_node_t *node,
-                             kmp_taskgraph_region_t *parent) {
+static kmp_taskgraph_region_t *__kmp_taskgraph_region_alloc(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_node_t *node,
+    kmp_taskgraph_region_t *parent) {
   kmp_taskgraph_region_t *region =
-    (kmp_taskgraph_region_t *)__kmp_fast_allocate(thread,
-        sizeof(kmp_taskgraph_region_t));
+      (kmp_taskgraph_region_t *)__kmp_fast_allocate(
+          thread, sizeof(kmp_taskgraph_region_t));
   region->owner = taskgraph;
   region->type = node ? TASKGRAPH_REGION_NODE : TASKGRAPH_REGION_WAIT;
   region->task.node = node;
@@ -905,22 +901,18 @@ __kmp_taskgraph_region_alloc(kmp_info_t *thread,
 }
 
 // FIXME: This too.
-static kmp_taskgraph_region_t *
-__kmp_taskgraph_region_alloc(kmp_info_t *thread,
-                             kmp_taskgraph_record_t *taskgraph,
-                             kmp_taskgraph_region_t **&alloc_chain,
-                             enum kmp_taskgraph_region_type type,
-                             kmp_int32 num_nodes,
-                             kmp_taskgraph_region_t *parent) {
-  kmp_size_t size =
-      sizeof(kmp_taskgraph_region_t) +
-        num_nodes * sizeof(kmp_taskgraph_region_t *);
+static kmp_taskgraph_region_t *__kmp_taskgraph_region_alloc(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, enum kmp_taskgraph_region_type type,
+    kmp_int32 num_nodes, kmp_taskgraph_region_t *parent) {
+  kmp_size_t size = sizeof(kmp_taskgraph_region_t) +
+                    num_nodes * sizeof(kmp_taskgraph_region_t *);
   size = __kmp_round_up_to_val(size, sizeof(kmp_taskgraph_region_t *));
   kmp_taskgraph_region_t *region =
-    (kmp_taskgraph_region_t *)__kmp_fast_allocate(thread, size);
+      (kmp_taskgraph_region_t *)__kmp_fast_allocate(thread, size);
   region->owner = taskgraph;
   region->type = type;
-  region->inner.children = (kmp_taskgraph_region**)&region[1];
+  region->inner.children = (kmp_taskgraph_region **)&region[1];
   region->inner.num_children = num_nodes;
   region->mark = TASKGRAPH_UNMARKED;
   region->level = -1;
@@ -937,40 +929,36 @@ __kmp_taskgraph_region_alloc(kmp_info_t *thread,
   return region;
 }
 
-// This makes a mostly-deep copy of a region.  The region itself and children nodes are
-// created new, but node pointers are shared.
-static kmp_taskgraph_region_t *
-__kmp_taskgraph_region_clone(kmp_info_t *thread,
-                             kmp_taskgraph_record_t *taskgraph,
-                             kmp_taskgraph_region_t **&alloc_chain,
-                             kmp_taskgraph_region_t *from,
-                             kmp_taskgraph_region_t *parent,
-                             kmp_int32 indent = 0) {
+// This makes a mostly-deep copy of a region.  The region itself and children
+// nodes are created new, but node pointers are shared.
+static kmp_taskgraph_region_t *__kmp_taskgraph_region_clone(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t *from,
+    kmp_taskgraph_region_t *parent, kmp_int32 indent = 0) {
   kmp_taskgraph_region_t *clone = nullptr;
   switch (from->type) {
-    case TASKGRAPH_REGION_ENTRY:
-    case TASKGRAPH_REGION_EXIT:
-      clone = __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
-                                           nullptr, parent);
-      clone->type = from->type;
-      break;
-    case TASKGRAPH_REGION_NODE:
-    case TASKGRAPH_REGION_WAIT:
-      clone = __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
-                                           from->task.node, parent);
-      break;
-    default: {
-      clone = __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
-                                           from->type, from->inner.num_children,
-                                           parent);
-      for (kmp_int32 n = 0; n < from->inner.num_children; n++) {
-        clone->inner.children[n] =
-          __kmp_taskgraph_region_clone(thread, taskgraph, alloc_chain,
-                                       from->inner.children[n], clone,
-                                       indent + 2);
-      }
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+    clone = __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
+                                         nullptr, parent);
+    clone->type = from->type;
+    break;
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT:
+    clone = __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
+                                         from->task.node, parent);
+    break;
+  default: {
+    clone =
+        __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain, from->type,
+                                     from->inner.num_children, parent);
+    for (kmp_int32 n = 0; n < from->inner.num_children; n++) {
+      clone->inner.children[n] = __kmp_taskgraph_region_clone(
+          thread, taskgraph, alloc_chain, from->inner.children[n], clone,
+          indent + 2);
     }
   }
+  }
   TGDBG("%*scloned region %p from region %p\n", indent, "", clone, from);
   return clone;
 }
@@ -987,11 +975,9 @@ __kmp_taskgraph_topological_order(kmp_taskgraph_region_t *region,
   region->mark = TASKGRAPH_TEMP_MARK;
 
   kmp_int32 max_level = -1;
-  for (kmp_taskgraph_region_dep_t *s = region->predecessors;
-       s;
-       s = s->next) {
+  for (kmp_taskgraph_region_dep_t *s = region->predecessors; s; s = s->next) {
     kmp_int32 pred_level =
-      __kmp_taskgraph_topological_order(s->region, order_out, outidx);
+        __kmp_taskgraph_topological_order(s->region, order_out, outidx);
     max_level = pred_level > max_level ? pred_level : max_level;
   }
 
@@ -1038,9 +1024,8 @@ static kmp_int32 __kmp_region_deplist_len(kmp_taskgraph_region_dep_t *list) {
   return len;
 }
 
-static void
-__kmp_region_deplist_free(kmp_info_t *thread,
-                          kmp_taskgraph_region_dep_t *list) {
+static void __kmp_region_deplist_free(kmp_info_t *thread,
+                                      kmp_taskgraph_region_dep_t *list) {
   while (list) {
     kmp_taskgraph_region_dep_t *next = list->next;
     __kmp_fast_free(thread, list);
@@ -1063,13 +1048,10 @@ static void __kmp_region_deplist_recycle(kmp_taskgraph_region_dep_t **recycled,
   }
 }
 
-static bool
-__kmp_taskgraph_collapse_sequence(kmp_info_t *thread,
-                                  kmp_taskgraph_record_t *taskgraph,
-                                  kmp_taskgraph_region_t **&alloc_chain,
-                                  kmp_taskgraph_region_t **region_p,
-                                  kmp_taskgraph_region_t *parent,
-                                  kmp_int32 &stamp) {
+static bool __kmp_taskgraph_collapse_sequence(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t **region_p,
+    kmp_taskgraph_region_t *parent, kmp_int32 &stamp) {
   kmp_taskgraph_region_t *region = *region_p;
   kmp_taskgraph_region_t *chain_start = region;
   kmp_taskgraph_region_t *chain_end = region;
@@ -1094,10 +1076,9 @@ __kmp_taskgraph_collapse_sequence(kmp_info_t *thread,
   if (chain_len <= 1)
     return false;
 
-  kmp_taskgraph_region_t *seq_region =
-    __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
-                                  TASKGRAPH_REGION_SEQUENTIAL, chain_len,
-                                  parent);
+  kmp_taskgraph_region_t *seq_region = __kmp_taskgraph_region_alloc(
+      thread, taskgraph, alloc_chain, TASKGRAPH_REGION_SEQUENTIAL, chain_len,
+      parent);
   TGDBG("allocated new seq region: %p (length %d)\n", seq_region, chain_len);
   kmp_taskgraph_region_t **worklist_p = region_p;
   *worklist_p = seq_region;
@@ -1121,7 +1102,7 @@ __kmp_taskgraph_collapse_sequence(kmp_info_t *thread,
   seq_region->level = level;
   seq_region->predecessors = seq_region->inner.children[0]->predecessors;
   seq_region->successors =
-    seq_region->inner.children[chain_len - 1]->successors;
+      seq_region->inner.children[chain_len - 1]->successors;
   seq_region->inner.children[0]->predecessors = nullptr;
   seq_region->inner.children[chain_len - 1]->successors = nullptr;
 
@@ -1150,21 +1131,20 @@ __kmp_taskgraph_collapse_sequence(kmp_info_t *thread,
   return true;
 }
 
-static const char*
+static const char *
 __kmp_taskgraph_region_type_name(kmp_taskgraph_region_type type);
 
-static void
-__kmp_taskgraph_region_dfs(kmp_taskgraph_region_t *region,
-                           kmp_taskgraph_region_t **order,
-                           kmp_int32 &idx, bool use_preds) {
+static void __kmp_taskgraph_region_dfs(kmp_taskgraph_region_t *region,
+                                       kmp_taskgraph_region_t **order,
+                                       kmp_int32 &idx, bool use_preds) {
   if (order) {
     region->timestamp = --idx;
     order[idx] = region;
   }
   region->mark = TASKGRAPH_TEMP_MARK;
   for (kmp_taskgraph_region_dep_t *reg = use_preds ? region->predecessors
-                                                   : region->successors; reg;
-       reg = reg->next) {
+                                                   : region->successors;
+       reg; reg = reg->next) {
     if (reg->region->mark == TASKGRAPH_UNMARKED)
       __kmp_taskgraph_region_dfs(reg->region, order, idx, use_preds);
   }
@@ -1172,12 +1152,10 @@ __kmp_taskgraph_region_dfs(kmp_taskgraph_region_t *region,
 
 #if defined(DEBUG_TASKGRAPH) && defined(CHECK_WORKLIST)
 
-static void
-__kmp_taskgraph_region_gather_deps(kmp_info_t *thread,
-                                   kmp_taskgraph_record_t *taskgraph,
-                                   kmp_taskgraph_region_t *region,
-                                   kmp_taskgraph_region_dep_t **deplist,
-                                   bool &ok) {
+static void __kmp_taskgraph_region_gather_deps(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, kmp_taskgraph_region_dep_t **deplist,
+    bool &ok) {
   for (kmp_taskgraph_region_dep_t *dep = *deplist; dep; dep = dep->next) {
     if (dep->region == region)
       return;
@@ -1187,14 +1165,14 @@ __kmp_taskgraph_region_gather_deps(kmp_info_t *thread,
                                       *deplist);
 
   for (kmp_taskgraph_region_dep_t *pred = region->predecessors; pred;
-      pred = pred->next) {
+       pred = pred->next) {
     if (pred->region->mark == TASKGRAPH_DELETED) {
       fprintf(stderr, "*** Region %p's predecessor %p is a deleted node\n",
               region, pred->region);
       ok = false;
     }
-    __kmp_taskgraph_region_gather_deps(thread, taskgraph, pred->region,
-                                       deplist, ok);
+    __kmp_taskgraph_region_gather_deps(thread, taskgraph, pred->region, deplist,
+                                       ok);
   }
 
   for (kmp_taskgraph_region_dep_t *succ = region->successors; succ;
@@ -1204,16 +1182,14 @@ __kmp_taskgraph_region_gather_deps(kmp_info_t *thread,
               region, succ->region);
       ok = false;
     }
-    __kmp_taskgraph_region_gather_deps(thread, taskgraph, succ->region,
-                                       deplist, ok);
+    __kmp_taskgraph_region_gather_deps(thread, taskgraph, succ->region, deplist,
+                                       ok);
   }
 }
 
-static bool
-__kmp_taskgraph_region_worklist_check(kmp_info_t *thread,
-                                      kmp_taskgraph_record_t *taskgraph,
-                                      kmp_taskgraph_region_t *region,
-                                      const char *where) {
+static bool __kmp_taskgraph_region_worklist_check(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, const char *where) {
   kmp_taskgraph_region_dep_t *collected_nodes = nullptr;
   bool ok = true;
   __kmp_taskgraph_region_gather_deps(thread, taskgraph, region,
@@ -1246,8 +1222,8 @@ __kmp_taskgraph_region_worklist_check(kmp_info_t *thread,
     }
     if (!in_list) {
       fprintf(stderr,
-              "*** Region %p is in worklist but not dependency graph (%s)\n",
-              r, where);
+              "*** Region %p is in worklist but not dependency graph (%s)\n", r,
+              where);
       ok = false;
     }
   }
@@ -1257,20 +1233,16 @@ __kmp_taskgraph_region_worklist_check(kmp_info_t *thread,
   return ok;
 }
 #else
-static bool
-__kmp_taskgraph_region_worklist_check(kmp_info_t *thread,
-                                      kmp_taskgraph_record_t *taskgraph,
-                                      kmp_taskgraph_region_t *region,
-                                      const char *where) {
+static bool __kmp_taskgraph_region_worklist_check(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, const char *where) {
   return true;
 }
 #endif
 
-static kmp_taskgraph_region_t *
-__kmp_taskgraph_region_dom_intersect(kmp_taskgraph_region_t **order,
-                                     kmp_taskgraph_region_t **doms,
-                                     kmp_taskgraph_region_t *b1,
-                                     kmp_taskgraph_region_t *b2) {
+static kmp_taskgraph_region_t *__kmp_taskgraph_region_dom_intersect(
+    kmp_taskgraph_region_t **order, kmp_taskgraph_region_t **doms,
+    kmp_taskgraph_region_t *b1, kmp_taskgraph_region_t *b2) {
   kmp_int32 finger1 = b1->timestamp;
   kmp_int32 finger2 = b2->timestamp;
   while (finger1 != finger2) {
@@ -1282,10 +1254,10 @@ __kmp_taskgraph_region_dom_intersect(kmp_taskgraph_region_t **order,
   return order[finger1];
 }
 
-static void
-__kmp_taskgraph_region_doms(kmp_taskgraph_region_t **order,
-                            kmp_taskgraph_region_t **doms,
-                            kmp_int32 worklist_length, bool postdom) {
+static void __kmp_taskgraph_region_doms(kmp_taskgraph_region_t **order,
+                                        kmp_taskgraph_region_t **doms,
+                                        kmp_int32 worklist_length,
+                                        bool postdom) {
   bool changed = true;
   // Set doms[start_node] <- start_node
   doms[worklist_length - 1] = order[worklist_length - 1];
@@ -1296,22 +1268,21 @@ __kmp_taskgraph_region_doms(kmp_taskgraph_region_t **order,
       kmp_taskgraph_region_t *b = order[n];
       kmp_taskgraph_region_t *new_idom = nullptr;
       for (kmp_taskgraph_region_dep_t *pred = postdom ? b->successors
-                                                      : b->predecessors; pred;
-           pred = pred->next) {
+                                                      : b->predecessors;
+           pred; pred = pred->next) {
         if (pred->region->mark == TASKGRAPH_PERMANENT_MARK) {
           new_idom = pred->region;
           break;
         }
       }
       for (kmp_taskgraph_region_dep_t *pred = postdom ? b->successors
-                                                      : b->predecessors; pred;
-           pred = pred->next) {
+                                                      : b->predecessors;
+           pred; pred = pred->next) {
         if (pred->region == new_idom)
           continue;
         if (doms[pred->region->timestamp]) {
-          new_idom =
-            __kmp_taskgraph_region_dom_intersect(order, doms, pred->region,
-                                                 new_idom);
+          new_idom = __kmp_taskgraph_region_dom_intersect(
+              order, doms, pred->region, new_idom);
         }
       }
       if (doms[b->timestamp] != new_idom) {
@@ -1323,8 +1294,7 @@ __kmp_taskgraph_region_doms(kmp_taskgraph_region_t **order,
   }
 }
 
-static bool
-__kmp_taskgraph_region_mutex_p(kmp_taskgraph_region_t *reg) {
+static bool __kmp_taskgraph_region_mutex_p(kmp_taskgraph_region_t *reg) {
   if (reg->type == TASKGRAPH_REGION_NODE)
     return reg->mutexset != nullptr;
   return false;
@@ -1349,19 +1319,16 @@ __kmp_taskgraph_region_mutex_p(kmp_taskgraph_region_t *reg) {
 // We choose the pp the the highest level ("furthest down the graph"), and
 // collapse the subgraph into a parallel region.
 
-static bool
-__kmp_taskgraph_collapse_par_exclusive(kmp_info_t *thread,
-                                       kmp_taskgraph_record_t *taskgraph,
-                                       kmp_taskgraph_region_t **&alloc_chain,
-                                       kmp_taskgraph_region_t **region_p,
-                                       kmp_taskgraph_region_t *parent,
-                                       kmp_int32 &stamp) {
+static bool __kmp_taskgraph_collapse_par_exclusive(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t **region_p,
+    kmp_taskgraph_region_t *parent, kmp_int32 &stamp) {
   kmp_taskgraph_region_t *region = *region_p;
   kmp_int32 num_predecessors = __kmp_region_deplist_len(region->predecessors);
 
   TGDBG("predecessors %d, successors %d\n",
-         __kmp_region_deplist_len(region->predecessors),
-         __kmp_region_deplist_len(region->successors));
+        __kmp_region_deplist_len(region->predecessors),
+        __kmp_region_deplist_len(region->successors));
 
   if (num_predecessors <= 1)
     return false;
@@ -1371,7 +1338,7 @@ __kmp_taskgraph_collapse_par_exclusive(kmp_info_t *thread,
   kmp_int32 highest_level = -1;
 
   for (kmp_taskgraph_region_dep_t *pred = region->predecessors; pred;
-        pred = pred->next) {
+       pred = pred->next) {
     TGDBG("consider predecessor: %p\n", pred->region);
     TGDBG("-- successors %d, predecessors %d\n",
           __kmp_region_deplist_len(pred->region->successors),
@@ -1385,7 +1352,8 @@ __kmp_taskgraph_collapse_par_exclusive(kmp_info_t *thread,
       continue;
     bool in_list = false;
     TGDBG("pp region: %p (%s)\n", pred_region->predecessors->region,
-          __kmp_taskgraph_region_type_name(pred_region->predecessors->region->type));
+          __kmp_taskgraph_region_type_name(
+              pred_region->predecessors->region->type));
     kmp_taskgraph_region_t *pp_region = pred_region->predecessors->region;
     for (kmp_taskgraph_region_dep_t *pp = pred_preds; pp; pp = pp->next) {
       if (pp->region == pp_region) {
@@ -1440,10 +1408,9 @@ __kmp_taskgraph_collapse_par_exclusive(kmp_info_t *thread,
     if (preds_for_pp < 2)
       continue;
     kmp_taskgraph_region_type region_type =
-      any_mutex_p ? TASKGRAPH_REGION_EXCLUSIVE : TASKGRAPH_REGION_PARALLEL;
-    kmp_taskgraph_region_t *par_region =
-      __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain, region_type,
-                                   preds_for_pp, parent);
+        any_mutex_p ? TASKGRAPH_REGION_EXCLUSIVE : TASKGRAPH_REGION_PARALLEL;
+    kmp_taskgraph_region_t *par_region = __kmp_taskgraph_region_alloc(
+        thread, taskgraph, alloc_chain, region_type, preds_for_pp, parent);
     changed = true;
     TGDBG("allocated %s region: %p\n",
           region_type == TASKGRAPH_REGION_EXCLUSIVE ? "exclusive" : "parallel",
@@ -1467,7 +1434,7 @@ __kmp_taskgraph_collapse_par_exclusive(kmp_info_t *thread,
         TGDBG("bailing (non-unit pred/succ list length)\n");
         continue;
       }
-      TGDBG("process region %p (%d/%d), level %d\n", pred->region, i+1,
+      TGDBG("process region %p (%d/%d), level %d\n", pred->region, i + 1,
             preds_for_pp, pred_region->level);
       par_region->inner.children[i] = pred_region;
       pred_region->mark = TASKGRAPH_COMBINED;
@@ -1506,8 +1473,7 @@ __kmp_taskgraph_collapse_par_exclusive(kmp_info_t *thread,
     par_region->predecessors = par_preds;
     par_region->successors = par_succs;
 
-    if (region->type == TASKGRAPH_REGION_WAIT &&
-        !found_reduction_data) {
+    if (region->type == TASKGRAPH_REGION_WAIT && !found_reduction_data) {
       // If we have no reduction data, we will not create a taskgroup for this
       // parallel region at replay time, so we don't need to terminate/discard
       // that region when we're done.  Clear the taskloop_task flag.
@@ -1579,8 +1545,8 @@ __kmp_taskgraph_collapse_par_exclusive(kmp_info_t *thread,
   return changed;
 }
 
-static void
-__kmp_taskgraph_region_dot(kmp_taskgraph_region_t *region, const char *name) {
+static void __kmp_taskgraph_region_dot(kmp_taskgraph_region_t *region,
+                                       const char *name) {
   fprintf(stderr, "digraph %s {\n", name);
   for (kmp_taskgraph_region_t *r = region; r; r = r->next) {
     if (r->mark == TASKGRAPH_DELETED) {
@@ -1677,16 +1643,13 @@ __kmp_taskgraph_count_edges_to_dominator(kmp_taskgraph_region_t *reg,
 // critical point is what it means to clone a task node in this way: that is
 // discussed in the commentary of __kmp_taskgraph_rewrite_irreducible.
 
-static void
-__kmp_taskgraph_clone_subgraph(kmp_info_t *thread,
-                               kmp_taskgraph_record_t *taskgraph,
-                               kmp_taskgraph_region_t **&alloc_chain,
-                               kmp_taskgraph_region_t *cloned_nodes[],
-                               kmp_taskgraph_region_t *orig_region,
-                               kmp_taskgraph_region_t *doms[],
-                               kmp_taskgraph_region_dep_t *preds_with_dom,
-                               kmp_taskgraph_region_t *region_dom,
-                               kmp_taskgraph_region_t ***added_worklist) {
+static void __kmp_taskgraph_clone_subgraph(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain,
+    kmp_taskgraph_region_t *cloned_nodes[], kmp_taskgraph_region_t *orig_region,
+    kmp_taskgraph_region_t *doms[], kmp_taskgraph_region_dep_t *preds_with_dom,
+    kmp_taskgraph_region_t *region_dom,
+    kmp_taskgraph_region_t ***added_worklist) {
   for (kmp_taskgraph_region_dep_t *pred = preds_with_dom; pred;
        pred = pred->next) {
     kmp_taskgraph_region_t *pred_region = pred->region;
@@ -1700,9 +1663,8 @@ __kmp_taskgraph_clone_subgraph(kmp_info_t *thread,
         pred->region = cloned_nodes[pred_region->timestamp];
         continue;
       }
-      kmp_taskgraph_region_t *cloned_region =
-        __kmp_taskgraph_region_clone(thread, taskgraph, alloc_chain,
-                                     pred_region, nullptr);
+      kmp_taskgraph_region_t *cloned_region = __kmp_taskgraph_region_clone(
+          thread, taskgraph, alloc_chain, pred_region, nullptr);
       cloned_nodes[pred_region->timestamp] = cloned_region;
 
       **added_worklist = cloned_region;
@@ -1713,9 +1675,8 @@ __kmp_taskgraph_clone_subgraph(kmp_info_t *thread,
       kmp_taskgraph_region_dep_t *cloned_preds = nullptr;
       for (kmp_taskgraph_region_dep_t *p = pred_region->predecessors; p;
            p = p->next) {
-        cloned_preds =
-          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                   p->region, cloned_preds);
+        cloned_preds = __kmp_region_deplist_add(
+            thread, &taskgraph->recycled_deps, p->region, cloned_preds);
       }
       cloned_region->predecessors = cloned_preds;
       // Note pred_region is the original predecessor region here, not the
@@ -1785,12 +1746,10 @@ __kmp_taskgraph_clone_subgraph(kmp_info_t *thread,
 //
 // For host execution, this is handled by __kmp_exec_descr_link_instances, etc.
 
-static bool
-__kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
-                                    kmp_taskgraph_record_t *taskgraph,
-                                    kmp_taskgraph_region_t **alloc_chain,
-                                    kmp_taskgraph_region_t **region_p,
-                                    kmp_taskgraph_region_t *exitregion) {
+static bool __kmp_taskgraph_rewrite_irreducible(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **alloc_chain, kmp_taskgraph_region_t **region_p,
+    kmp_taskgraph_region_t *exitregion) {
   kmp_taskgraph_region_t *entryregion = *region_p;
   bool changed = false;
 
@@ -1811,11 +1770,11 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
 #endif
 
   kmp_taskgraph_region_t **order =
-    (kmp_taskgraph_region_t **)__kmp_fast_allocate(thread,
-        worklist_length * sizeof(kmp_taskgraph_region_t *));
+      (kmp_taskgraph_region_t **)__kmp_fast_allocate(
+          thread, worklist_length * sizeof(kmp_taskgraph_region_t *));
   kmp_taskgraph_region_t **doms =
-    (kmp_taskgraph_region_t **)__kmp_fast_allocate(thread,
-        worklist_length * sizeof(kmp_taskgraph_region_t *));
+      (kmp_taskgraph_region_t **)__kmp_fast_allocate(
+          thread, worklist_length * sizeof(kmp_taskgraph_region_t *));
   memset(doms, 0, worklist_length * sizeof(kmp_taskgraph_region_t *));
   kmp_int32 cursor = worklist_length;
   assert(entryregion->type == TASKGRAPH_REGION_ENTRY);
@@ -1879,8 +1838,8 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
 
       if (passes_pred) {
         // We can drop this predecessor.
-        TGDBG("dropping pred %p from region %p, dom %p\n",
-              pred->region, region, doms[pred->region->timestamp]);
+        TGDBG("dropping pred %p from region %p, dom %p\n", pred->region, region,
+              doms[pred->region->timestamp]);
         kmp_taskgraph_region_dep_t *next = pred->next;
         kmp_taskgraph_region_dep_t **succp = &pred->region->successors;
         while (*succp) {
@@ -1925,7 +1884,7 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
         kmp_taskgraph_region_t *this_dom = doms[pred_region->timestamp];
 #ifdef DEBUG_TASKGRAPH
         kmp_int32 edges_to_dom =
-          __kmp_taskgraph_count_edges_to_dominator(pred_region, this_dom);
+            __kmp_taskgraph_count_edges_to_dominator(pred_region, this_dom);
         TGDBG("this pred: %p, edges_to_dom=%d\n", pred_region, edges_to_dom);
 #endif
         bool found = false;
@@ -1947,10 +1906,10 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
         TGDBG("region %p: all predecessors have a single dominator\n", region);
 
         if (!pred_bitsets) {
-          pred_bitsets = (kmp_bitset_t **) __kmp_fast_allocate(thread,
-              sizeof(kmp_bitset_t *) * worklist_length);
-          succ_bitsets = (kmp_bitset_t **) __kmp_fast_allocate(thread,
-              sizeof(kmp_bitset_t *) * worklist_length);
+          pred_bitsets = (kmp_bitset_t **)__kmp_fast_allocate(
+              thread, sizeof(kmp_bitset_t *) * worklist_length);
+          succ_bitsets = (kmp_bitset_t **)__kmp_fast_allocate(
+              thread, sizeof(kmp_bitset_t *) * worklist_length);
 
           for (kmp_int32 i = 0; i < worklist_length; i++) {
             pred_bitsets[i] = __kmp_bitset_alloc(thread, worklist_length);
@@ -1985,21 +1944,19 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
             TGDBG("regions %p and %p share all predecessors/successors\n",
                   order[i], order[j]);
             same_preds_and_succs++;
-            equal_deps_chain =
-              __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                       order[j], equal_deps_chain);
+            equal_deps_chain = __kmp_region_deplist_add(
+                thread, &taskgraph->recycled_deps, order[j], equal_deps_chain);
             if (__kmp_taskgraph_region_mutex_p(order[j]))
               any_mutex_p = true;
           }
         }
         if (same_preds_and_succs > 1) {
           kmp_taskgraph_region_type region_type =
-            any_mutex_p ? TASKGRAPH_REGION_EXCLUSIVE
-                        : TASKGRAPH_REGION_PARALLEL;
-          kmp_taskgraph_region_t *par_region =
-            __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
-                                         region_type, same_preds_and_succs,
-                                         nullptr);
+              any_mutex_p ? TASKGRAPH_REGION_EXCLUSIVE
+                          : TASKGRAPH_REGION_PARALLEL;
+          kmp_taskgraph_region_t *par_region = __kmp_taskgraph_region_alloc(
+              thread, taskgraph, alloc_chain, region_type, same_preds_and_succs,
+              nullptr);
           par_region->inner.children[0] = region;
           region->mark = TASKGRAPH_COMBINED;
           region->parent = par_region;
@@ -2013,7 +1970,7 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
             equal_deps_chain = next;
           }
           par_region->predecessors =
-            par_region->inner.children[0]->predecessors;
+              par_region->inner.children[0]->predecessors;
           par_region->inner.children[0]->predecessors = nullptr;
           par_region->successors = par_region->inner.children[0]->successors;
           par_region->inner.children[0]->successors = nullptr;
@@ -2084,11 +2041,10 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
       if (regions_combined_p)
         continue;
 
-      assert (num_groups >= 1);
+      assert(num_groups >= 1);
 
       TGDBG("should split region %p (%d)\n", region, region->timestamp);
-      TGDBG("clone graph to dominator: %p (%d, %s)\n",
-            doms[region->timestamp],
+      TGDBG("clone graph to dominator: %p (%d, %s)\n", doms[region->timestamp],
             doms[region->timestamp]->timestamp,
             __kmp_taskgraph_region_type_name(doms[region->timestamp]->type));
       kmp_taskgraph_region_t *region_dom = doms[region->timestamp];
@@ -2134,9 +2090,8 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
         region->predecessors = preds_with_dom;
         for (kmp_taskgraph_region_dep_t **rp = &region->predecessors; *rp;
              rp = &(*rp)->next) {
-          kmp_int32 count =
-            __kmp_taskgraph_count_edges_to_dominator((*rp)->region,
-                                                     dom_groups[grp].dom);
+          kmp_int32 count = __kmp_taskgraph_count_edges_to_dominator(
+              (*rp)->region, dom_groups[grp].dom);
           TGDBG("for pred %p, outgoing edges to dom = %d\n", (*rp)->region,
                 count);
           if (count > highest) {
@@ -2166,8 +2121,8 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
             __kmp_region_dep_recycle(&taskgraph->recycled_deps, succ);
             TGDBG("unlinking successor %p -> %p\n", pred->region, region);
             unlinked_successors =
-              __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                       pred->region, unlinked_successors);
+                __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
+                                         pred->region, unlinked_successors);
             *succp = next;
           } else {
             succp = &succ->next;
@@ -2194,8 +2149,8 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
              pred; pred = pred->next) {
           kmp_taskgraph_region_t *pred_region = pred->region;
           pred_region->successors =
-            __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                     cloned_region, pred_region->successors);
+              __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
+                                       cloned_region, pred_region->successors);
         }
       }
 
@@ -2228,13 +2183,12 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
       // the region.
       for (kmp_taskgraph_region_dep_t *succ = unlinked_successors; succ;) {
         kmp_taskgraph_region_t *cloned_reg =
-          cloned_nodes[succ->region->timestamp];
+            cloned_nodes[succ->region->timestamp];
         kmp_taskgraph_region_dep_t *next = succ->next;
         __kmp_region_dep_recycle(&taskgraph->recycled_deps, succ);
         TGDBG("add successor to cloned region: %p -> %p\n", cloned_reg, region);
-        cloned_reg->successors =
-          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, region,
-                                   cloned_reg->successors);
+        cloned_reg->successors = __kmp_region_deplist_add(
+            thread, &taskgraph->recycled_deps, region, cloned_reg->successors);
         succ = next;
       }
 
@@ -2330,7 +2284,7 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
   for (kmp_int32 i = 0; i < worklist_length; i++, r = r->next) {
     if (r->mark == TASKGRAPH_UNMARKED) {
       kmp_int32 level =
-        __kmp_taskgraph_topological_order(r, order_out, &outidx);
+          __kmp_taskgraph_topological_order(r, order_out, &outidx);
       max_level = level > max_level ? level : max_level;
     }
   }
@@ -2367,12 +2321,10 @@ __kmp_taskgraph_rewrite_irreducible(kmp_info_t *thread,
 // much of the heavier processing involved in step (2), so the common case
 // should be relatively fast.
 
-static kmp_taskgraph_region_t *
-__kmp_taskgraph_build_regions(kmp_info_t *thread,
-                              kmp_taskgraph_record_t *taskgraph,
-                              kmp_taskgraph_region_t **&alloc_chain,
-                              kmp_taskgraph_region_t *entryregion,
-                              kmp_taskgraph_region_t *exitregion) {
+static kmp_taskgraph_region_t *__kmp_taskgraph_build_regions(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t *entryregion,
+    kmp_taskgraph_region_t *exitregion) {
   bool changed;
   kmp_int32 phase = 0;
 
@@ -2387,17 +2339,17 @@ __kmp_taskgraph_build_regions(kmp_info_t *thread,
       changed = false;
       TGDBG("starting seq pass\n");
       for (kmp_taskgraph_region_t **seq_head = &entryregion; *seq_head;
-          seq_head = &(*seq_head)->next) {
+           seq_head = &(*seq_head)->next) {
         TGDBG("consider %s region: %p\n",
               __kmp_taskgraph_region_type_name((*seq_head)->type), *seq_head);
         if ((*seq_head)->mark == TASKGRAPH_COMBINED) {
           TGDBG("already combined\n");
           continue;
         }
-        changed |=
-          __kmp_taskgraph_collapse_sequence(thread, taskgraph, alloc_chain, seq_head,
-                                            /*parent=*/nullptr, phase);
-       TGDBG("changed: %s\n", changed ? "true" : "false");
+        changed |= __kmp_taskgraph_collapse_sequence(thread, taskgraph,
+                                                     alloc_chain, seq_head,
+                                                     /*parent=*/nullptr, phase);
+        TGDBG("changed: %s\n", changed ? "true" : "false");
       }
       ++phase;
       __kmp_taskgraph_region_chain_prune(&entryregion);
@@ -2405,17 +2357,16 @@ __kmp_taskgraph_build_regions(kmp_info_t *thread,
                                             "after seq collapse");
       TGDBG("starting par/unordered pass\n");
       for (kmp_taskgraph_region_t **par_head = &entryregion; *par_head;
-          par_head = &(*par_head)->next) {
+           par_head = &(*par_head)->next) {
         TGDBG("consider %s region: %p\n",
               __kmp_taskgraph_region_type_name((*par_head)->type), *par_head);
         if ((*par_head)->mark == TASKGRAPH_COMBINED) {
           TGDBG("already combined\n");
           continue;
         }
-        changed |=
-          __kmp_taskgraph_collapse_par_exclusive(thread, taskgraph, alloc_chain,
-                                                 par_head, /*parent=*/nullptr,
-                                                 phase);
+        changed |= __kmp_taskgraph_collapse_par_exclusive(
+            thread, taskgraph, alloc_chain, par_head, /*parent=*/nullptr,
+            phase);
         TGDBG("changed: %s\n", changed ? "true" : "false");
       }
       ++phase;
@@ -2440,9 +2391,8 @@ __kmp_taskgraph_build_regions(kmp_info_t *thread,
 
     TGDBG("attempting to collapse irreducible regions\n");
 
-    changed |=
-      __kmp_taskgraph_rewrite_irreducible(thread, taskgraph, alloc_chain,
-                                          &entryregion, exitregion);
+    changed |= __kmp_taskgraph_rewrite_irreducible(
+        thread, taskgraph, alloc_chain, &entryregion, exitregion);
 
     if (!changed) {
       fprintf(stderr, "FIXME: Failed to transform irreducible graph\n");
@@ -2453,114 +2403,109 @@ __kmp_taskgraph_build_regions(kmp_info_t *thread,
   return entryregion;
 }
 
-static void
-__kmp_taskgraph_count_nodes(kmp_taskgraph_region_t *region) {
+static void __kmp_taskgraph_count_nodes(kmp_taskgraph_region_t *region) {
   switch (region->type) {
-    case TASKGRAPH_REGION_ENTRY:
-    case TASKGRAPH_REGION_EXIT:
-      return;
-    case TASKGRAPH_REGION_NODE:
-    case TASKGRAPH_REGION_WAIT: {
-      TGDBG("process region %p\n", region);
-      region->task.node->u.resolved.count++;
-      kmp_taskgraph_region_t *last_region =
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+    return;
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT: {
+    TGDBG("process region %p\n", region);
+    region->task.node->u.resolved.count++;
+    kmp_taskgraph_region_t *last_region =
         region->task.node->u.resolved.last_region;
-      TGDBG("last region: %p\n", last_region);
-      if (last_region) {
-        kmp_taskgraph_region_t *next = last_region->task.next_instance;
-        TGDBG("next: %p\n", next);
-        last_region->task.next_instance = region;
-        region->task.next_instance = next;
-      }
-      region->task.node->u.resolved.last_region = region;
-      return;
+    TGDBG("last region: %p\n", last_region);
+    if (last_region) {
+      kmp_taskgraph_region_t *next = last_region->task.next_instance;
+      TGDBG("next: %p\n", next);
+      last_region->task.next_instance = region;
+      region->task.next_instance = next;
+    }
+    region->task.node->u.resolved.last_region = region;
+    return;
+  }
+  default:
+    for (kmp_int32 n = 0; n < region->inner.num_children; n++) {
+      __kmp_taskgraph_count_nodes(region->inner.children[n]);
     }
-    default:
-      for (kmp_int32 n = 0; n < region->inner.num_children; n++) {
-        __kmp_taskgraph_count_nodes(region->inner.children[n]);
-      }
   }
 }
 
-static void
-__kmp_taskgraph_gather_mutex_sets(kmp_info_t *thread,
-                                  kmp_taskgraph_region_t *region,
-                                  const kmp_bitset_t *held) {
+static void __kmp_taskgraph_gather_mutex_sets(kmp_info_t *thread,
+                                              kmp_taskgraph_region_t *region,
+                                              const kmp_bitset_t *held) {
   switch (region->type) {
-    case TASKGRAPH_REGION_ENTRY:
-    case TASKGRAPH_REGION_EXIT:
-    case TASKGRAPH_REGION_WAIT:
-      return;
-    case TASKGRAPH_REGION_NODE: {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+  case TASKGRAPH_REGION_WAIT:
+    return;
+  case TASKGRAPH_REGION_NODE: {
 #ifdef DEBUG_TASKGRAPH
-      if (region->mutexset && __kmp_bitset_subset_p(held, region->mutexset)) {
-        TGDBG("node is mutually exclusive with held: 0x%llx <: 0x%llx\n",
-              (unsigned long long)region->mutexset->bits[0],
-              (unsigned long long)held->bits[0]);
-      }
-#endif
-      return;
+    if (region->mutexset && __kmp_bitset_subset_p(held, region->mutexset)) {
+      TGDBG("node is mutually exclusive with held: 0x%llx <: 0x%llx\n",
+            (unsigned long long)region->mutexset->bits[0],
+            (unsigned long long)held->bits[0]);
     }
-    case TASKGRAPH_REGION_SEQUENTIAL: {
-      kmp_bitset_t *seq_held = __kmp_bitset_alloc(thread, held->bitsize);
-      __kmp_bitset_clearall(seq_held);
+#endif
+    return;
+  }
+  case TASKGRAPH_REGION_SEQUENTIAL: {
+    kmp_bitset_t *seq_held = __kmp_bitset_alloc(thread, held->bitsize);
+    __kmp_bitset_clearall(seq_held);
+    for (kmp_int32 child = 0; child < region->inner.num_children; child++) {
+      __kmp_taskgraph_gather_mutex_sets(thread, region->inner.children[child],
+                                        held);
+      if (region->inner.children[child]->mutexset)
+        __kmp_bitset_or(seq_held, seq_held,
+                        region->inner.children[child]->mutexset);
+    }
+    region->mutexset = seq_held;
+    return;
+  }
+  case TASKGRAPH_REGION_PARALLEL:
+  case TASKGRAPH_REGION_EXCLUSIVE: {
+    kmp_bitset_t *par_held = __kmp_bitset_alloc(thread, held->bitsize);
+    kmp_bitset_t *conflicts = __kmp_bitset_alloc(thread, held->bitsize);
+    while (true) {
+      __kmp_bitset_clearall(par_held);
       for (kmp_int32 child = 0; child < region->inner.num_children; child++) {
+        __kmp_bitset_clearall(conflicts);
+        for (kmp_int32 other = 0; other < region->inner.num_children; other++) {
+          if (other != child) {
+            if (!region->inner.children[other]->mutexset)
+              __kmp_taskgraph_gather_mutex_sets(
+                  thread, region->inner.children[other], held);
+            if (region->inner.children[other]->mutexset)
+              __kmp_bitset_or(conflicts, conflicts,
+                              region->inner.children[other]->mutexset);
+          }
+        }
         __kmp_taskgraph_gather_mutex_sets(thread, region->inner.children[child],
-                                          held);
+                                          conflicts);
         if (region->inner.children[child]->mutexset)
-          __kmp_bitset_or(seq_held, seq_held,
+          __kmp_bitset_or(par_held, par_held,
                           region->inner.children[child]->mutexset);
       }
-      region->mutexset = seq_held;
-      return;
-    }
-    case TASKGRAPH_REGION_PARALLEL:
-    case TASKGRAPH_REGION_EXCLUSIVE: {
-      kmp_bitset_t *par_held = __kmp_bitset_alloc(thread, held->bitsize);
-      kmp_bitset_t *conflicts = __kmp_bitset_alloc(thread, held->bitsize);
-      while (true) {
-        __kmp_bitset_clearall(par_held);
-        for (kmp_int32 child = 0; child < region->inner.num_children; child++) {
-          __kmp_bitset_clearall(conflicts);
-          for (kmp_int32 other = 0; other < region->inner.num_children; other++) {
-            if (other != child) {
-              if (!region->inner.children[other]->mutexset)
-                __kmp_taskgraph_gather_mutex_sets(thread,
-                                                  region->inner.children[other],
-                                                  held);
-              if (region->inner.children[other]->mutexset)
-                __kmp_bitset_or(conflicts, conflicts,
-                                region->inner.children[other]->mutexset);
-            }
-          }
-          __kmp_taskgraph_gather_mutex_sets(thread,
-                                            region->inner.children[child],
-                                            conflicts);
-          if (region->inner.children[child]->mutexset)
-            __kmp_bitset_or(par_held, par_held,
-                            region->inner.children[child]->mutexset);
-        }
-        if (!region->mutexset) {
-          region->mutexset = par_held;
-        } else if (__kmp_bitset_equal(region->mutexset, par_held)) {
-          TGDBG("par mutexes stabilized, exiting loop\n");
-          break;
-        } else {
-          TGDBG("par mutexes not stable, iterating\n");
-          __kmp_bitset_copy(region->mutexset, par_held);
-          __kmp_bitset_free(thread, par_held);
-        }
+      if (!region->mutexset) {
+        region->mutexset = par_held;
+      } else if (__kmp_bitset_equal(region->mutexset, par_held)) {
+        TGDBG("par mutexes stabilized, exiting loop\n");
+        break;
+      } else {
+        TGDBG("par mutexes not stable, iterating\n");
+        __kmp_bitset_copy(region->mutexset, par_held);
+        __kmp_bitset_free(thread, par_held);
       }
-      __kmp_bitset_free(thread, conflicts);
-      return;
     }
+    __kmp_bitset_free(thread, conflicts);
+    return;
+  }
   }
 }
 
-static int
-__kmp_popcount_cmp(const void *a, const void *b) {
-  const kmp_taskgraph_region_t *reg_a = *(kmp_taskgraph_region_t **) a;
-  const kmp_taskgraph_region_t *reg_b = *(kmp_taskgraph_region_t **) b;
+static int __kmp_popcount_cmp(const void *a, const void *b) {
+  const kmp_taskgraph_region_t *reg_a = *(kmp_taskgraph_region_t **)a;
+  const kmp_taskgraph_region_t *reg_b = *(kmp_taskgraph_region_t **)b;
   kmp_int32 popc_a = 0, popc_b = 0;
   if (reg_a->mutexset)
     popc_a = __kmp_bitset_popcount(reg_a->mutexset);
@@ -2576,179 +2521,172 @@ __kmp_popcount_cmp(const void *a, const void *b) {
 /// Find "mutexinoutset" regions that can be represented without explicit
 // mutexes, i.e. using "TASKGRAPH_REGION_EXCLUSIVE".
 
-static void
-__kmp_taskgraph_find_exclusive_regions(kmp_info_t *thread,
-                                       kmp_taskgraph_record_t *taskgraph,
-                                       kmp_taskgraph_region_t **&alloc_chain,
-                                       kmp_taskgraph_region_t **region_p) {
+static void __kmp_taskgraph_find_exclusive_regions(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t **region_p) {
   kmp_taskgraph_region_t *region = *region_p;
   switch (region->type) {
-    case TASKGRAPH_REGION_ENTRY:
-    case TASKGRAPH_REGION_EXIT:
-    case TASKGRAPH_REGION_NODE:
-    case TASKGRAPH_REGION_WAIT:
-      break;
-    case TASKGRAPH_REGION_SEQUENTIAL:
-    case TASKGRAPH_REGION_PARALLEL: {
-      for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
-        __kmp_taskgraph_find_exclusive_regions(thread, taskgraph, alloc_chain,
-                                               &region->inner.children[c]);
-      }
-      break;
-    }
-    case TASKGRAPH_REGION_EXCLUSIVE: {
-      qsort(region->inner.children, region->inner.num_children,
-            sizeof(kmp_taskgraph_region_t *), __kmp_popcount_cmp);
-      for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
-        TGDBG("building tree: region mutexset = 0x%llx\n",
-              (unsigned long long) region->inner.children[c]->mutexset
-                ? region->inner.children[c]->mutexset->bits[0] : 0);
-        region->inner.children[c]->mark = TASKGRAPH_UNMARKED;
-      }
-      kmp_bitset_t *conflicts =
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT:
+    break;
+  case TASKGRAPH_REGION_SEQUENTIAL:
+  case TASKGRAPH_REGION_PARALLEL: {
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      __kmp_taskgraph_find_exclusive_regions(thread, taskgraph, alloc_chain,
+                                             &region->inner.children[c]);
+    }
+    break;
+  }
+  case TASKGRAPH_REGION_EXCLUSIVE: {
+    qsort(region->inner.children, region->inner.num_children,
+          sizeof(kmp_taskgraph_region_t *), __kmp_popcount_cmp);
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      TGDBG("building tree: region mutexset = 0x%llx\n",
+            (unsigned long long)region->inner.children[c]->mutexset
+                ? region->inner.children[c]->mutexset->bits[0]
+                : 0);
+      region->inner.children[c]->mark = TASKGRAPH_UNMARKED;
+    }
+    kmp_bitset_t *conflicts =
         __kmp_bitset_alloc(thread, region->mutexset->bitsize);
-      kmp_bitset_t *subsets_cover =
+    kmp_bitset_t *subsets_cover =
         __kmp_bitset_alloc(thread, region->mutexset->bitsize);
-      __kmp_bitset_copy(conflicts, region->mutexset);
-      bool irregular = false;
-      kmp_int32 combined_children = 0;
-      for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
-        kmp_bitset_t *candidate = region->inner.children[c]->mutexset;
-        if (__kmp_bitset_empty_p(candidate))
-          continue;
-        __kmp_bitset_clearall(subsets_cover);
-        bool found_subset = false;
-        bool other_overlaps = false;
-        for (kmp_int32 d = c + 1; d < region->inner.num_children; d++) {
-          // This could test for a subset in some cases, but that adds
-          // complication for later processing.  Maybe revisit later if it
-          // seems worthwhile.
-          // E.g. if we have deps like this:
-          //
-          // #pragma omp task depend(mutexinoutset: deps[0], deps[1]) { /*a*/ }
-          // #pragma omp task depend(mutexinoutset: deps[0]) { /*b*/ }
-          // #pragma omp task depend(mutexinoutset: deps[1]) { /*c*/ }
-          //
-          // This could be represented as:
-          //
-          // exclusive {
-          //   node: a
-          //   parallel {
-          //     node: b
-          //     node: c
-          //   }
-          // }
-          //
-          // We're not doing that yet though.
-          if (__kmp_bitset_equal(candidate,
-                                 region->inner.children[d]->mutexset)) {
-            found_subset = true;
-            __kmp_bitset_or(subsets_cover, subsets_cover,
-                            region->inner.children[d]->mutexset);
-          } else if (__kmp_bitset_intersect_p(
+    __kmp_bitset_copy(conflicts, region->mutexset);
+    bool irregular = false;
+    kmp_int32 combined_children = 0;
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      kmp_bitset_t *candidate = region->inner.children[c]->mutexset;
+      if (__kmp_bitset_empty_p(candidate))
+        continue;
+      __kmp_bitset_clearall(subsets_cover);
+      bool found_subset = false;
+      bool other_overlaps = false;
+      for (kmp_int32 d = c + 1; d < region->inner.num_children; d++) {
+        // This could test for a subset in some cases, but that adds
+        // complication for later processing.  Maybe revisit later if it
+        // seems worthwhile.
+        // E.g. if we have deps like this:
+        //
+        // #pragma omp task depend(mutexinoutset: deps[0], deps[1]) { /*a*/ }
+        // #pragma omp task depend(mutexinoutset: deps[0]) { /*b*/ }
+        // #pragma omp task depend(mutexinoutset: deps[1]) { /*c*/ }
+        //
+        // This could be represented as:
+        //
+        // exclusive {
+        //   node: a
+        //   parallel {
+        //     node: b
+        //     node: c
+        //   }
+        // }
+        //
+        // We're not doing that yet though.
+        if (__kmp_bitset_equal(candidate,
+                               region->inner.children[d]->mutexset)) {
+          found_subset = true;
+          __kmp_bitset_or(subsets_cover, subsets_cover,
+                          region->inner.children[d]->mutexset);
+        } else if (__kmp_bitset_intersect_p(
                        candidate, region->inner.children[d]->mutexset)) {
-            other_overlaps = true;
-            break;
-          }
-        }
-        if (!found_subset || other_overlaps)
-          continue;
-        if (!__kmp_bitset_equal(subsets_cover, candidate)) {
-          TGDBG("subsets cover: 0x%llx, candidate: 0x%llx\n",
-                (unsigned long long)subsets_cover->bits[0],
-                (unsigned long long)candidate->bits[0]);
-          irregular = true;
+          other_overlaps = true;
           break;
         }
-        for (kmp_int32 d = c + 1; d < region->inner.num_children; d++) {
-          if (region->inner.children[d]->mutexset_parent)
-            continue;
-          // As above wrt. subsets.
-          if (__kmp_bitset_equal(candidate,
-                                 region->inner.children[d]->mutexset)) {
-            TGDBG("set index %d's parent to index %d\n", d, c);
-            region->inner.children[d]->mutexset_parent =
+      }
+      if (!found_subset || other_overlaps)
+        continue;
+      if (!__kmp_bitset_equal(subsets_cover, candidate)) {
+        TGDBG("subsets cover: 0x%llx, candidate: 0x%llx\n",
+              (unsigned long long)subsets_cover->bits[0],
+              (unsigned long long)candidate->bits[0]);
+        irregular = true;
+        break;
+      }
+      for (kmp_int32 d = c + 1; d < region->inner.num_children; d++) {
+        if (region->inner.children[d]->mutexset_parent)
+          continue;
+        // As above wrt. subsets.
+        if (__kmp_bitset_equal(candidate,
+                               region->inner.children[d]->mutexset)) {
+          TGDBG("set index %d's parent to index %d\n", d, c);
+          region->inner.children[d]->mutexset_parent =
               region->inner.children[c];
-            combined_children++;
-            __kmp_bitset_and_not(conflicts, conflicts, candidate);
-          }
+          combined_children++;
+          __kmp_bitset_and_not(conflicts, conflicts, candidate);
         }
       }
-      TGDBG("irregular: %s\n", irregular ? "true" : "false");
-      TGDBG("final conflicts: 0x%llx\n",
-            (unsigned long long)conflicts->bits[0]);
-      __kmp_bitset_free(thread, subsets_cover);
-      region->type = TASKGRAPH_REGION_PARALLEL;
-      if (!irregular && __kmp_bitset_empty_p(conflicts)) {
-        TGDBG("transforming exclusive region %p\n", region);
-        TGDBG("orig region children: %d\n", region->inner.num_children);
-        TGDBG("combined children: %d\n", combined_children);
-        if (region->inner.num_children == combined_children + 1) {
-          region->type = TASKGRAPH_REGION_EXCLUSIVE;
-        } else {
-          kmp_taskgraph_region_t *new_par =
-            __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
-                                         TASKGRAPH_REGION_PARALLEL,
-                                         region->inner.num_children -
-                                           combined_children,
-                                         nullptr);
-          for (kmp_int32 c = region->inner.num_children - 1; c >= 0; c--) {
-            kmp_taskgraph_region_t *child = region->inner.children[c];
-            // Make mutex set into a circular list.
-            if (child->mutexset_parent && child->mark != TASKGRAPH_TEMP_MARK) {
-              if (!child->mutexset_parent->mutexset_parent) {
-                // child <-> parent
-                child->mutexset_parent->mutexset_parent = child;
-                child->mutexset_parent->mark = TASKGRAPH_TEMP_MARK;
-              } else {
-                kmp_taskgraph_region_t *parent = child->mutexset_parent;
-                child->mutexset_parent = parent->mutexset_parent;
-                parent->mutexset_parent = child;
-                parent->mark = TASKGRAPH_TEMP_MARK;
-              }
+    }
+    TGDBG("irregular: %s\n", irregular ? "true" : "false");
+    TGDBG("final conflicts: 0x%llx\n", (unsigned long long)conflicts->bits[0]);
+    __kmp_bitset_free(thread, subsets_cover);
+    region->type = TASKGRAPH_REGION_PARALLEL;
+    if (!irregular && __kmp_bitset_empty_p(conflicts)) {
+      TGDBG("transforming exclusive region %p\n", region);
+      TGDBG("orig region children: %d\n", region->inner.num_children);
+      TGDBG("combined children: %d\n", combined_children);
+      if (region->inner.num_children == combined_children + 1) {
+        region->type = TASKGRAPH_REGION_EXCLUSIVE;
+      } else {
+        kmp_taskgraph_region_t *new_par = __kmp_taskgraph_region_alloc(
+            thread, taskgraph, alloc_chain, TASKGRAPH_REGION_PARALLEL,
+            region->inner.num_children - combined_children, nullptr);
+        for (kmp_int32 c = region->inner.num_children - 1; c >= 0; c--) {
+          kmp_taskgraph_region_t *child = region->inner.children[c];
+          // Make mutex set into a circular list.
+          if (child->mutexset_parent && child->mark != TASKGRAPH_TEMP_MARK) {
+            if (!child->mutexset_parent->mutexset_parent) {
+              // child <-> parent
+              child->mutexset_parent->mutexset_parent = child;
+              child->mutexset_parent->mark = TASKGRAPH_TEMP_MARK;
+            } else {
+              kmp_taskgraph_region_t *parent = child->mutexset_parent;
+              child->mutexset_parent = parent->mutexset_parent;
+              parent->mutexset_parent = child;
+              parent->mark = TASKGRAPH_TEMP_MARK;
             }
           }
-          kmp_int32 idx = 0;
-          for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
-            kmp_taskgraph_region_t *child = region->inner.children[c];
-            TGDBG("process child: %p\n", child);
-            if (child->mutexset_parent && child->mark != TASKGRAPH_COMBINED) {
-              kmp_int32 elems = 0;
-              kmp_taskgraph_region_t *next = child;
-              do {
-                elems++;
-                next = next->mutexset_parent;
-              } while (next != child);
-              TGDBG("make exclusive region with %d children\n", elems);
-              kmp_taskgraph_region_t *excl_region =
-                __kmp_taskgraph_region_alloc(thread, taskgraph, alloc_chain,
-                                             TASKGRAPH_REGION_EXCLUSIVE, elems,
-                                             nullptr);
-              kmp_int32 excl_child = 0;
-              next = child;
-              do {
-                excl_region->inner.children[excl_child++] = next;
-                next->mark = TASKGRAPH_COMBINED;
-                next = next->mutexset_parent;
-              } while (next != child);
-              assert(excl_child == excl_region->inner.num_children);
-              new_par->inner.children[idx++] = excl_region;
-            } else if (!child->mutexset_parent) {
-              new_par->inner.children[idx++] = child;
-            }
+        }
+        kmp_int32 idx = 0;
+        for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+          kmp_taskgraph_region_t *child = region->inner.children[c];
+          TGDBG("process child: %p\n", child);
+          if (child->mutexset_parent && child->mark != TASKGRAPH_COMBINED) {
+            kmp_int32 elems = 0;
+            kmp_taskgraph_region_t *next = child;
+            do {
+              elems++;
+              next = next->mutexset_parent;
+            } while (next != child);
+            TGDBG("make exclusive region with %d children\n", elems);
+            kmp_taskgraph_region_t *excl_region = __kmp_taskgraph_region_alloc(
+                thread, taskgraph, alloc_chain, TASKGRAPH_REGION_EXCLUSIVE,
+                elems, nullptr);
+            kmp_int32 excl_child = 0;
+            next = child;
+            do {
+              excl_region->inner.children[excl_child++] = next;
+              next->mark = TASKGRAPH_COMBINED;
+              next = next->mutexset_parent;
+            } while (next != child);
+            assert(excl_child == excl_region->inner.num_children);
+            new_par->inner.children[idx++] = excl_region;
+          } else if (!child->mutexset_parent) {
+            new_par->inner.children[idx++] = child;
           }
-          TGDBG("idx=%d, supposed to be %d\n", idx,
-                new_par->inner.num_children);
-          assert(idx == new_par->inner.num_children);
-          *region_p = new_par;
-          region->mark = TASKGRAPH_DELETED;
         }
+        TGDBG("idx=%d, supposed to be %d\n", idx, new_par->inner.num_children);
+        assert(idx == new_par->inner.num_children);
+        *region_p = new_par;
+        region->mark = TASKGRAPH_DELETED;
       }
-      __kmp_bitset_free(thread, conflicts);
-      break;
     }
-    default:
-      assert(false && "unreachable");
+    __kmp_bitset_free(thread, conflicts);
+    break;
+  }
+  default:
+    assert(false && "unreachable");
   }
 }
 
@@ -2760,59 +2698,55 @@ __kmp_taskgraph_strip_mutex_sets(kmp_info_t *thread,
                                  bool in_exclusive = false) {
   kmp_int32 mutexes_needed = 0;
   switch (region->type) {
-    case TASKGRAPH_REGION_ENTRY:
-    case TASKGRAPH_REGION_EXIT:
-    case TASKGRAPH_REGION_WAIT:
-      assert(!region->mutexset);
-      break;
-    case TASKGRAPH_REGION_NODE:
-      if (region->mutexset) {
-        if (in_exclusive) {
-          __kmp_bitset_free(thread, region->mutexset);
-          region->mutexset = nullptr;
-        } else {
-          // FIXME: This might be pessimistic -- the remaining mutex sets might
-          // have holes or duplicates.  We could compact them.
-          kmp_int32 m = region->mutexset->bitsize;
-          mutexes_needed = std::max(mutexes_needed, m);
-        }
-      } 
-      break;
-    case TASKGRAPH_REGION_EXCLUSIVE: {
-      if (region->mutexset) {
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+  case TASKGRAPH_REGION_WAIT:
+    assert(!region->mutexset);
+    break;
+  case TASKGRAPH_REGION_NODE:
+    if (region->mutexset) {
+      if (in_exclusive) {
         __kmp_bitset_free(thread, region->mutexset);
         region->mutexset = nullptr;
-      }
-      for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
-        kmp_int32 m =
-          __kmp_taskgraph_strip_mutex_sets(thread, region->inner.children[c],
-                                           true);
+      } else {
+        // FIXME: This might be pessimistic -- the remaining mutex sets might
+        // have holes or duplicates.  We could compact them.
+        kmp_int32 m = region->mutexset->bitsize;
         mutexes_needed = std::max(mutexes_needed, m);
       }
-      break;
     }
-    default: {
-      if (region->mutexset) {
-        __kmp_bitset_free(thread, region->mutexset);
-        region->mutexset = nullptr;
-      }
-      for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
-        kmp_int32 m =
-          __kmp_taskgraph_strip_mutex_sets(thread, region->inner.children[c],
-                                           in_exclusive);
-        mutexes_needed = std::max(mutexes_needed, m);
-      }
+    break;
+  case TASKGRAPH_REGION_EXCLUSIVE: {
+    if (region->mutexset) {
+      __kmp_bitset_free(thread, region->mutexset);
+      region->mutexset = nullptr;
+    }
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      kmp_int32 m = __kmp_taskgraph_strip_mutex_sets(
+          thread, region->inner.children[c], true);
+      mutexes_needed = std::max(mutexes_needed, m);
+    }
+    break;
+  }
+  default: {
+    if (region->mutexset) {
+      __kmp_bitset_free(thread, region->mutexset);
+      region->mutexset = nullptr;
     }
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      kmp_int32 m = __kmp_taskgraph_strip_mutex_sets(
+          thread, region->inner.children[c], in_exclusive);
+      mutexes_needed = std::max(mutexes_needed, m);
+    }
+  }
   }
   return mutexes_needed;
 }
 
-static void
-__kmp_taskgraph_exclusive_regions(kmp_info_t *thread,
-                                  kmp_taskgraph_record_t *taskgraph,
-                                  kmp_taskgraph_region_t **&alloc_chain,
-                                  kmp_taskgraph_region_t **region_p,
-                                  kmp_int32 max_mutex) {
+static void __kmp_taskgraph_exclusive_regions(
+    kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t **&alloc_chain, kmp_taskgraph_region_t **region_p,
+    kmp_int32 max_mutex) {
   kmp_bitset_t *top = __kmp_bitset_alloc(thread, max_mutex);
   __kmp_bitset_clearall(top);
   __kmp_taskgraph_gather_mutex_sets(thread, *region_p, top);
@@ -2822,64 +2756,72 @@ __kmp_taskgraph_exclusive_regions(kmp_info_t *thread,
   taskgraph->num_mutexes = num_mutexes;
 }
 
-static const char*
+static const char *
 __kmp_taskgraph_region_type_name(kmp_taskgraph_region_type type) {
   switch (type) {
-    case TASKGRAPH_REGION_ENTRY: return "entry";
-    case TASKGRAPH_REGION_EXIT: return "exit";
-    case TASKGRAPH_REGION_NODE: return "node";
-    case TASKGRAPH_REGION_WAIT: return "wait";
-    case TASKGRAPH_REGION_PARALLEL: return "parallel";
-    case TASKGRAPH_REGION_EXCLUSIVE: return "exclusive";
-    case TASKGRAPH_REGION_SEQUENTIAL: return "sequential";
-    case TASKGRAPH_REGION_IRREDUCIBLE: return "irreducible";
-    default: return "<unknown>";
+  case TASKGRAPH_REGION_ENTRY:
+    return "entry";
+  case TASKGRAPH_REGION_EXIT:
+    return "exit";
+  case TASKGRAPH_REGION_NODE:
+    return "node";
+  case TASKGRAPH_REGION_WAIT:
+    return "wait";
+  case TASKGRAPH_REGION_PARALLEL:
+    return "parallel";
+  case TASKGRAPH_REGION_EXCLUSIVE:
+    return "exclusive";
+  case TASKGRAPH_REGION_SEQUENTIAL:
+    return "sequential";
+  case TASKGRAPH_REGION_IRREDUCIBLE:
+    return "irreducible";
+  default:
+    return "<unknown>";
   }
 }
 
 #if defined(KMP_DEBUG) || defined(DEBUG_TASKGRAPH)
-static void
-__kmp_dump_taskgraph_regions(FILE *f, kmp_taskgraph_region_t *region,
-                             int indent = 0) {
+static void __kmp_dump_taskgraph_regions(FILE *f,
+                                         kmp_taskgraph_region_t *region,
+                                         int indent = 0) {
   switch (region->type) {
-    case TASKGRAPH_REGION_ENTRY:
-    case TASKGRAPH_REGION_EXIT:
-      fprintf(f, "%*s%s node\n", indent, "",
-              __kmp_taskgraph_region_type_name(region->type));
-      break;
-    case TASKGRAPH_REGION_NODE:
-    case TASKGRAPH_REGION_WAIT: {
-      char set_membership[40];
-      if (region->mutexset)
-        sprintf(set_membership, " [sets: 0x%llx]",
-                (unsigned long long) region->mutexset->bits[0]);
-      else
-        strcpy(set_membership, "");
-      if (region->task.node->u.resolved.count > 1)
-        fprintf(f, "%*s%s: %p (* %d)%s\n", indent, "",
-                __kmp_taskgraph_region_type_name(region->type),
-                region->task.node, region->task.node->u.resolved.count,
-                set_membership);
-      else
-        fprintf(f, "%*s%s: %p%s\n", indent, "",
-                __kmp_taskgraph_region_type_name(region->type),
-                region->task.node, set_membership);
-      break;
-    }
-    default: {
-      char set_membership[40];
-      if (region->mutexset)
-        sprintf(set_membership, " [sets: 0x%llx]",
-                (unsigned long long) region->mutexset->bits[0]);
-      else
-        strcpy(set_membership, "");
-      fprintf(f, "%*s%s%s {\n", indent, "",
-              __kmp_taskgraph_region_type_name (region->type), set_membership);
-      for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
-        __kmp_dump_taskgraph_regions(f, region->inner.children[c], indent + 2);
-      }
-      fprintf(f, "%*s}\n", indent, "");
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+    fprintf(f, "%*s%s node\n", indent, "",
+            __kmp_taskgraph_region_type_name(region->type));
+    break;
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT: {
+    char set_membership[40];
+    if (region->mutexset)
+      sprintf(set_membership, " [sets: 0x%llx]",
+              (unsigned long long)region->mutexset->bits[0]);
+    else
+      strcpy(set_membership, "");
+    if (region->task.node->u.resolved.count > 1)
+      fprintf(f, "%*s%s: %p (* %d)%s\n", indent, "",
+              __kmp_taskgraph_region_type_name(region->type), region->task.node,
+              region->task.node->u.resolved.count, set_membership);
+    else
+      fprintf(f, "%*s%s: %p%s\n", indent, "",
+              __kmp_taskgraph_region_type_name(region->type), region->task.node,
+              set_membership);
+    break;
+  }
+  default: {
+    char set_membership[40];
+    if (region->mutexset)
+      sprintf(set_membership, " [sets: 0x%llx]",
+              (unsigned long long)region->mutexset->bits[0]);
+    else
+      strcpy(set_membership, "");
+    fprintf(f, "%*s%s%s {\n", indent, "",
+            __kmp_taskgraph_region_type_name(region->type), set_membership);
+    for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
+      __kmp_dump_taskgraph_regions(f, region->inner.children[c], indent + 2);
     }
+    fprintf(f, "%*s}\n", indent, "");
+  }
   }
 }
 #endif
@@ -2903,18 +2845,17 @@ __kmp_dump_find_parent_regions(kmp_info *thd, kmp_taskgraph_record_t *taskgraph,
     if (!in_list) {
       list = __kmp_region_deplist_add(thd, &taskgraph->recycled_deps,
                                       region[r].parent, list);
-      list = __kmp_dump_find_parent_regions(thd, taskgraph, region[r].parent,
-                                            1, list);
+      list = __kmp_dump_find_parent_regions(thd, taskgraph, region[r].parent, 1,
+                                            list);
     }
   }
   return list;
 }
 
-static void
-__kmp_dump_raw_taskgraph_regions(FILE *f, kmp_info *thd,
-                                 kmp_taskgraph_record_t *taskgraph,
-                                 kmp_taskgraph_region_t *region,
-                                 int numregions, int indent = 0) {
+static void __kmp_dump_raw_taskgraph_regions(FILE *f, kmp_info *thd,
+                                             kmp_taskgraph_record_t *taskgraph,
+                                             kmp_taskgraph_region_t *region,
+                                             int numregions, int indent = 0) {
   kmp_taskgraph_region_dep_t *parentlist = nullptr;
   kmp_taskgraph_region_dep_t *printedlist = nullptr;
   for (int r = 0; r < numregions; r++) {
@@ -2924,24 +2865,23 @@ __kmp_dump_raw_taskgraph_regions(FILE *f, kmp_info *thd,
         region[r].type == TASKGRAPH_REGION_EXCLUSIVE ||
         region[r].type == TASKGRAPH_REGION_IRREDUCIBLE)
       children = region[r].inner.num_children;
-    fprintf(f,
-            "%*sregion %d (%p): %s%s (%d children) parent %p succs %d preds %d\n",
-            indent, "", r, &region[r],
-            __kmp_taskgraph_region_type_name(region[r].type),
-            region[r].mark == TASKGRAPH_COMBINED ? " (combined)" : "",
-            children, region[r].parent,
-            __kmp_region_deplist_len(region[r].successors),
-            __kmp_region_deplist_len(region[r].predecessors));
+    fprintf(
+        f,
+        "%*sregion %d (%p): %s%s (%d children) parent %p succs %d preds %d\n",
+        indent, "", r, &region[r],
+        __kmp_taskgraph_region_type_name(region[r].type),
+        region[r].mark == TASKGRAPH_COMBINED ? " (combined)" : "", children,
+        region[r].parent, __kmp_region_deplist_len(region[r].successors),
+        __kmp_region_deplist_len(region[r].predecessors));
     if (children > 0) {
       for (int c = 0; c < children; c++)
-        __kmp_dump_raw_taskgraph_regions(f, thd, taskgraph,
-                                         region->inner.children[c], 1,
-                                         indent + 2);
+        __kmp_dump_raw_taskgraph_regions(
+            f, thd, taskgraph, region->inner.children[c], 1, indent + 2);
     }
   }
   if (indent == 0) {
-    parentlist = __kmp_dump_find_parent_regions(thd, taskgraph, region,
-                                                numregions);
+    parentlist =
+        __kmp_dump_find_parent_regions(thd, taskgraph, region, numregions);
     fprintf(stderr, "%*sfound %d parent region(s):\n", indent, "",
             __kmp_region_deplist_len(parentlist));
     for (kmp_taskgraph_region_dep_t *p = parentlist; p; p = p->next) {
@@ -3079,7 +3019,7 @@ __kmp_dump_raw_taskgraph_regions(FILE *f, kmp_info *thd,
 // number of places in the graph.  Care must be taken at replay time that all
 // nodes preceding a multiply-instantiated node execute before the node, and
 // that all nodes succeeding each "instantiation point" are executed once the
-// task has executed.  
+// task has executed.
 //
 // The final region type is "exclusive", which arises for "mutexinoutset"
 // dependencies that are able to be abstracted away (we can't do this in all
@@ -3134,9 +3074,9 @@ __kmp_dump_raw_taskgraph_regions(FILE *f, kmp_info *thd,
 // annotated with a set of mutexes that must be held while executing the task.
 // (Shown with [sets: 0xN] in dump output).
 
-kmp_int32
-__kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
-                      kmp_taskgraph_record_t *taskgraph) {
+kmp_int32 __kmp_build_taskgraph(kmp_int32 gtid,
+                                kmp_taskdata_t *current_taskdata,
+                                kmp_taskgraph_record_t *taskgraph) {
   kmp_int32 numnodes = taskgraph->num_tasks;
   kmp_int32 numregions = numnodes + 2;
   kmp_taskgraph_node_t *nodes = taskgraph->record_map;
@@ -3144,9 +3084,8 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
   kmp_dephash_t *hash = __kmp_dephash_create(thread, current_taskdata);
   bool dep_barrier = false;
 
-  kmp_depnode_t *all_depnodes =
-    (kmp_depnode_t *)__kmp_thread_malloc(thread,
-                                         numregions * sizeof(kmp_depnode_t));
+  kmp_depnode_t *all_depnodes = (kmp_depnode_t *)__kmp_thread_malloc(
+      thread, numregions * sizeof(kmp_depnode_t));
 
   kmp_int32 next_mutex_set = 0;
 
@@ -3162,10 +3101,9 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
     node->dn.task = nodes[i].task;
     dep_barrier = !nodes[i].task && nodes[i].taskloop_task;
     if (!dep_all) {
-      __kmp_process_deps<taskgraph_deps>(gtid, node, &hash, dep_barrier,
-                                         nodes[i].u.unresolved.ndeps,
-                                         nodes[i].u.unresolved.dep_list,
-                                         nodes[i].task, next_mutex_set);
+      __kmp_process_deps<taskgraph_deps>(
+          gtid, node, &hash, dep_barrier, nodes[i].u.unresolved.ndeps,
+          nodes[i].u.unresolved.dep_list, nodes[i].task, next_mutex_set);
     } else {
       __kmp_process_dep_all<taskgraph_deps>(gtid, node, hash, dep_barrier,
                                             nodes[i].task);
@@ -3176,16 +3114,16 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
   kmp_int32 outidx = 0;
 
   kmp_taskgraph_region_t *initial_regions =
-    (kmp_taskgraph_region_t *)__kmp_fast_allocate(thread,
-                                  sizeof(kmp_taskgraph_region_t) * numregions);
+      (kmp_taskgraph_region_t *)__kmp_fast_allocate(
+          thread, sizeof(kmp_taskgraph_region_t) * numregions);
   // FIXME: Something like 'placement new' here?
   memset(initial_regions, 0, sizeof(kmp_taskgraph_region_t) * numregions);
 
   kmp_taskgraph_region_t *cfg_barrier = nullptr;
 
   for (kmp_int32 i = 0; i < numnodes; i++) {
-    initial_regions[i].type = nodes[i].task ? TASKGRAPH_REGION_NODE
-                                            : TASKGRAPH_REGION_WAIT;
+    initial_regions[i].type =
+        nodes[i].task ? TASKGRAPH_REGION_NODE : TASKGRAPH_REGION_WAIT;
     initial_regions[i].task.node = &nodes[i];
     initial_regions[i].task.next_instance = &initial_regions[i];
     initial_regions[i].parent = nullptr;
@@ -3196,17 +3134,16 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
     }
     kmp_depnode_t *depnode = &all_depnodes[i];
     initial_regions[i].mutexset = depnode->dn.set_membership;
-    for (kmp_depnode_list_t *succ = depnode->dn.successors;
-         succ;
+    for (kmp_depnode_list_t *succ = depnode->dn.successors; succ;
          succ = succ->next) {
       kmp_int32 succ_idx = succ->node - all_depnodes;
       kmp_taskgraph_region_t *tg_succ = &initial_regions[succ_idx];
       tg_succ->predecessors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                 &initial_regions[i], tg_succ->predecessors);
+          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
+                                   &initial_regions[i], tg_succ->predecessors);
       initial_regions[i].successors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, tg_succ,
-                                 initial_regions[i].successors);
+          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, tg_succ,
+                                   initial_regions[i].successors);
     }
     // Handle control flow dependencies.  If a node (e.g. a taskloop task) has
     // a wait after it corresponding to the end of an implicit taskgroup, join
@@ -3214,24 +3151,22 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
     // it will depend on the barrier.
     if (nodes[i].u.unresolved.cfg_successor != -1) {
       kmp_int32 cfg_succ = nodes[i].u.unresolved.cfg_successor;
-      initial_regions[i].successors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                  &initial_regions[cfg_succ],
-                                  initial_regions[i].successors);
-      initial_regions[cfg_succ].predecessors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                  &initial_regions[i],
-                                  initial_regions[cfg_succ].predecessors);
+      initial_regions[i].successors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[cfg_succ],
+          initial_regions[i].successors);
+      initial_regions[cfg_succ].predecessors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[i],
+          initial_regions[cfg_succ].predecessors);
     }
     if (nodes[i].taskloop_task && !nodes[i].task) {
       cfg_barrier = &initial_regions[i];
     } else if (cfg_barrier) {
-      cfg_barrier->successors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                 &initial_regions[i], cfg_barrier->successors);
-      initial_regions[i].predecessors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                 cfg_barrier, initial_regions[i].predecessors);
+      cfg_barrier->successors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[i],
+          cfg_barrier->successors);
+      initial_regions[i].predecessors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, cfg_barrier,
+          initial_regions[i].predecessors);
     }
   }
 
@@ -3261,21 +3196,19 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
     kmp_int32 nsuccs = __kmp_region_deplist_len(region->successors);
     if (npreds == 0) {
       initial_regions[entryregion].successors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, region,
-                                 initial_regions[entryregion].successors);
-      region->predecessors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                 &initial_regions[entryregion],
-                                 region->predecessors);
+          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, region,
+                                   initial_regions[entryregion].successors);
+      region->predecessors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[entryregion],
+          region->predecessors);
     }
     if (nsuccs == 0) {
       initial_regions[exitregion].predecessors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, region,
-                                 initial_regions[exitregion].predecessors);
-      region->successors =
-        __kmp_region_deplist_add(thread, &taskgraph->recycled_deps,
-                                 &initial_regions[exitregion],
-                                 region->successors);
+          __kmp_region_deplist_add(thread, &taskgraph->recycled_deps, region,
+                                   initial_regions[exitregion].predecessors);
+      region->successors = __kmp_region_deplist_add(
+          thread, &taskgraph->recycled_deps, &initial_regions[exitregion],
+          region->successors);
     }
     region->owner = taskgraph;
   }
@@ -3287,9 +3220,8 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
 
   for (kmp_int32 i = 0; i < numregions; i++) {
     if (initial_regions[i].mark == TASKGRAPH_UNMARKED) {
-      kmp_int32 level =
-        __kmp_taskgraph_topological_order(&initial_regions[i], order_out,
-                                          &outidx);
+      kmp_int32 level = __kmp_taskgraph_topological_order(&initial_regions[i],
+                                                          order_out, &outidx);
       max_level = level > max_level ? level : max_level;
     }
   }
@@ -3307,10 +3239,9 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
 
   kmp_taskgraph_region_t **alloc_chain = &initial_regions[0].alloc_chain;
 
-  kmp_taskgraph_region_t *root_region =
-    __kmp_taskgraph_build_regions(thread, taskgraph, alloc_chain,
-                                  &initial_regions[entryregion],
-                                  &initial_regions[exitregion]);
+  kmp_taskgraph_region_t *root_region = __kmp_taskgraph_build_regions(
+      thread, taskgraph, alloc_chain, &initial_regions[entryregion],
+      &initial_regions[exitregion]);
 
   __kmp_taskgraph_count_nodes(root_region);
 
@@ -3351,14 +3282,14 @@ __kmp_build_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
   taskgraph->recycled_deps = nullptr;
 
   KG_TRACE(10, ("Processed taskgraph %p (graph_id %" PRIx64 "):\n", taskgraph,
-           taskgraph->graph_id));
+                taskgraph->graph_id));
   KG_DUMP(10, __kmp_dump_taskgraph_regions(stderr, root_region));
 
-  #ifdef DEBUG_TASKGRAPH
-  //__kmp_dump_taskgraph_regions(stderr, root_region);
-  //__kmp_dump_raw_taskgraph_regions(stderr, thread, taskgraph,
-  //                                 &initial_regions[0], numregions);
-  #endif
+#ifdef DEBUG_TASKGRAPH
+//__kmp_dump_taskgraph_regions(stderr, root_region);
+//__kmp_dump_raw_taskgraph_regions(stderr, thread, taskgraph,
+//                                 &initial_regions[0], numregions);
+#endif
 
   KMP_ATOMIC_ST_REL(&taskgraph->status, KMP_TDG_READY);
 
@@ -3395,16 +3326,14 @@ static bool __kmp_check_deps(kmp_int32 gtid, kmp_depnode_t *node,
   kmp_int32 next_mutex = 0;
 
   if (!dep_all) { // regular dependences
-    npredecessors =
-      __kmp_process_deps<normal_deps>(gtid, node, hash, dep_barrier,
-                                      ndeps, dep_list, task, next_mutex);
-    npredecessors +=
-      __kmp_process_deps<normal_deps>(gtid, node, hash, dep_barrier,
-                                      ndeps_noalias, noalias_dep_list, task,
-                                      next_mutex, false);
+    npredecessors = __kmp_process_deps<normal_deps>(
+        gtid, node, hash, dep_barrier, ndeps, dep_list, task, next_mutex);
+    npredecessors += __kmp_process_deps<normal_deps>(
+        gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list, task,
+        next_mutex, false);
   } else { // omp_all_memory dependence
-    npredecessors =
-      __kmp_process_dep_all<normal_deps>(gtid, node, *hash, dep_barrier, task);
+    npredecessors = __kmp_process_dep_all<normal_deps>(gtid, node, *hash,
+                                                       dep_barrier, task);
   }
 
   node->dn.task = task;
diff --git a/openmp/runtime/src/kmp_taskdeps.h b/openmp/runtime/src/kmp_taskdeps.h
index e7df68c3f..e7f237459 100644
--- a/openmp/runtime/src/kmp_taskdeps.h
+++ b/openmp/runtime/src/kmp_taskdeps.h
@@ -130,8 +130,8 @@ static inline void __kmp_release_deps(kmp_int32 gtid, kmp_taskdata_t *task) {
                 gtid, task));
 
   KMP_ACQUIRE_DEPNODE(gtid, node);
-    node->dn.task =
-        NULL; // mark this task as finished, so no new dependencies are generated
+  node->dn.task =
+      NULL; // mark this task as finished, so no new dependencies are generated
   KMP_RELEASE_DEPNODE(gtid, node);
 
   kmp_depnode_list_t *next;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 962609e53..8d625cd93 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -755,9 +755,9 @@ static bool __kmp_track_children_task(kmp_taskdata_t *taskdata) {
   return ret;
 }
 
-static bool __kmp_taskgraph_exec_descr_finish(kmp_int32 gtid,
-                                              kmp_info_t *thread,
-                                              kmp_taskgraph_exec_descr_t *descr);
+static bool
+__kmp_taskgraph_exec_descr_finish(kmp_int32 gtid, kmp_info_t *thread,
+                                  kmp_taskgraph_exec_descr_t *descr);
 
 // __kmp_task_finish: bookkeeping to do when a task finishes execution
 //
@@ -2109,43 +2109,35 @@ __kmp_fill_exec_descr(kmp_int32, kmp_info_t *, kmp_taskgraph_record_t *,
                       kmp_taskgraph_exec_descr_t *, kmp_size_t &,
                       kmp_taskgraph_exec_descr_t **);
 
-static kmp_int32
-__kmp_pred_list_length(kmp_taskgraph_exec_descr_t *desc) {
+static kmp_int32 __kmp_pred_list_length(kmp_taskgraph_exec_descr_t *desc) {
   kmp_int32 res = 0;
   for (; desc; desc = desc->predecessor_chain)
     ++res;
   return res;
 }
 
-static kmp_taskgraph_exec_descr_t *
-__kmp_fill_sequential_descr(kmp_int32 gtid, kmp_info_t *thread,
-                               kmp_taskgraph_record_t *taskgraph,
-                               kmp_taskgraph_region_t *region,
-                               kmp_taskdata_t *parent_taskdata,
-                               kmp_taskgraph_exec_descr_t *exec_descrs,
-                               kmp_size_t &next_idx,
-                               kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
+static kmp_taskgraph_exec_descr_t *__kmp_fill_sequential_descr(
+    kmp_int32 gtid, kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, kmp_taskdata_t *parent_taskdata,
+    kmp_taskgraph_exec_descr_t *exec_descrs, kmp_size_t &next_idx,
+    kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
   assert(region->type == TASKGRAPH_REGION_SEQUENTIAL);
   kmp_taskgraph_exec_descr_t *first_node = nullptr;
   for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
-    kmp_taskgraph_exec_descr *descr =
-      __kmp_fill_exec_descr(gtid, thread, taskgraph, region->inner.children[c],
-                            parent_taskdata, exec_descrs, next_idx,
-                            succs_to_fill_p);
+    kmp_taskgraph_exec_descr *descr = __kmp_fill_exec_descr(
+        gtid, thread, taskgraph, region->inner.children[c], parent_taskdata,
+        exec_descrs, next_idx, succs_to_fill_p);
     if (!first_node)
       first_node = descr;
   }
   return first_node;
 }
 
-static kmp_taskgraph_exec_descr_t *
-__kmp_fill_par_or_excl_descr(kmp_int32 gtid, kmp_info_t *thread,
-                              kmp_taskgraph_record_t *taskgraph,
-                              kmp_taskgraph_region_t *region,
-                              kmp_taskdata_t *parent_taskdata,
-                              kmp_taskgraph_exec_descr_t *exec_descrs,
-                              kmp_size_t &next_idx,
-                              kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
+static kmp_taskgraph_exec_descr_t *__kmp_fill_par_or_excl_descr(
+    kmp_int32 gtid, kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, kmp_taskdata_t *parent_taskdata,
+    kmp_taskgraph_exec_descr_t *exec_descrs, kmp_size_t &next_idx,
+    kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
   assert(region->type == TASKGRAPH_REGION_PARALLEL ||
          region->type == TASKGRAPH_REGION_EXCLUSIVE);
 
@@ -2168,10 +2160,9 @@ __kmp_fill_par_or_excl_descr(kmp_int32 gtid, kmp_info_t *thread,
 
   for (kmp_int32 c = 0; c < region->inner.num_children; c++) {
     kmp_taskgraph_exec_descr_t *succs_to_fill = nullptr;
-    kmp_taskgraph_exec_descr_t *head =
-      __kmp_fill_exec_descr(gtid, thread, taskgraph, region->inner.children[c],
-                            parent_taskdata, exec_descrs, next_idx,
-                            &succs_to_fill);
+    kmp_taskgraph_exec_descr_t *head = __kmp_fill_exec_descr(
+        gtid, thread, taskgraph, region->inner.children[c], parent_taskdata,
+        exec_descrs, next_idx, &succs_to_fill);
     if (!sibling_list) {
       sibling_list = head;
       sibling_list->sibling = head;
@@ -2205,106 +2196,101 @@ __kmp_fill_par_or_excl_descr(kmp_int32 gtid, kmp_info_t *thread,
   return exec_descr;
 }
 
-static kmp_taskgraph_exec_descr_t *
-__kmp_fill_exec_descr(kmp_int32 gtid, kmp_info_t *thread,
-                              kmp_taskgraph_record_t *taskgraph,
-                              kmp_taskgraph_region_t *region,
-                              kmp_taskdata_t *parent_taskdata,
-                              kmp_taskgraph_exec_descr_t *exec_descrs,
-                              kmp_size_t &next_idx,
-                              kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
+static kmp_taskgraph_exec_descr_t *__kmp_fill_exec_descr(
+    kmp_int32 gtid, kmp_info_t *thread, kmp_taskgraph_record_t *taskgraph,
+    kmp_taskgraph_region_t *region, kmp_taskdata_t *parent_taskdata,
+    kmp_taskgraph_exec_descr_t *exec_descrs, kmp_size_t &next_idx,
+    kmp_taskgraph_exec_descr_t **succs_to_fill_p) {
   switch (region->type) {
-    case TASKGRAPH_REGION_ENTRY:
-    case TASKGRAPH_REGION_EXIT:
-      break;
-    case TASKGRAPH_REGION_NODE:
-    case TASKGRAPH_REGION_WAIT: {
-      kmp_taskgraph_exec_descr_t *incoming_succs_to_fill = *succs_to_fill_p;
-      kmp_taskgraph_exec_descr_t *exec_descr = &exec_descrs[next_idx++];
-      exec_descr->region = region;
-      exec_descr->region->exec_descr = exec_descr;
-      exec_descr->nblocks = region->task.node->u.resolved.count - 1;
-      exec_descr->npredecessors = __kmp_pred_list_length(incoming_succs_to_fill);
-      exec_descr->sibling = exec_descr;
-      exec_descr->predecessor_chain = nullptr;
-      exec_descr->successor = nullptr;
-      exec_descr->next_instance = nullptr;
-
-      // Edit the taskdata for this specific instantiation.  At present the
-      // task/taskdata structures cannot be used simultaneously by different
-      // threads. We could duplicate the structures to allow simultaneous issue,
-      // but that's not done yet.  The exec_descr can already by thread-local,
-      // in principle, but for now it points to the taskgraph's single copy
-      // of each task/taskdata structure.
-      if (region->type == TASKGRAPH_REGION_NODE) {
-        kmp_task_t *task = exec_descr->region->task.node->task;
-        kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-        taskdata->exec_descr = exec_descr;
-      }
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+    break;
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT: {
+    kmp_taskgraph_exec_descr_t *incoming_succs_to_fill = *succs_to_fill_p;
+    kmp_taskgraph_exec_descr_t *exec_descr = &exec_descrs[next_idx++];
+    exec_descr->region = region;
+    exec_descr->region->exec_descr = exec_descr;
+    exec_descr->nblocks = region->task.node->u.resolved.count - 1;
+    exec_descr->npredecessors = __kmp_pred_list_length(incoming_succs_to_fill);
+    exec_descr->sibling = exec_descr;
+    exec_descr->predecessor_chain = nullptr;
+    exec_descr->successor = nullptr;
+    exec_descr->next_instance = nullptr;
+
+    // Edit the taskdata for this specific instantiation.  At present the
+    // task/taskdata structures cannot be used simultaneously by different
+    // threads. We could duplicate the structures to allow simultaneous issue,
+    // but that's not done yet.  The exec_descr can already by thread-local,
+    // in principle, but for now it points to the taskgraph's single copy
+    // of each task/taskdata structure.
+    if (region->type == TASKGRAPH_REGION_NODE) {
+      kmp_task_t *task = exec_descr->region->task.node->task;
+      kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+      taskdata->exec_descr = exec_descr;
+    }
 
-      for (kmp_taskgraph_exec_descr_t *pred = incoming_succs_to_fill; pred;
-           pred = pred->predecessor_chain) {
-        pred->successor = exec_descr;
-      }
+    for (kmp_taskgraph_exec_descr_t *pred = incoming_succs_to_fill; pred;
+         pred = pred->predecessor_chain) {
+      pred->successor = exec_descr;
+    }
 
-      *succs_to_fill_p = exec_descr;
+    *succs_to_fill_p = exec_descr;
 
-      return exec_descr;
-    }
-    case TASKGRAPH_REGION_SEQUENTIAL:
-      return __kmp_fill_sequential_descr(gtid, thread, taskgraph, region,
-                                         parent_taskdata, exec_descrs,
-                                         next_idx, succs_to_fill_p);
-    case TASKGRAPH_REGION_PARALLEL:
-    case TASKGRAPH_REGION_EXCLUSIVE:
-      return __kmp_fill_par_or_excl_descr(gtid, thread, taskgraph, region,
-                                          parent_taskdata, exec_descrs,
-                                          next_idx, succs_to_fill_p);
+    return exec_descr;
+  }
+  case TASKGRAPH_REGION_SEQUENTIAL:
+    return __kmp_fill_sequential_descr(gtid, thread, taskgraph, region,
+                                       parent_taskdata, exec_descrs, next_idx,
+                                       succs_to_fill_p);
+  case TASKGRAPH_REGION_PARALLEL:
+  case TASKGRAPH_REGION_EXCLUSIVE:
+    return __kmp_fill_par_or_excl_descr(gtid, thread, taskgraph, region,
+                                        parent_taskdata, exec_descrs, next_idx,
+                                        succs_to_fill_p);
   }
   return nullptr;
 }
 
 #ifdef DEBUG_TASKGRAPH
-static void
-__kmp_debug_taskgraph_exec_descr(kmp_taskgraph_exec_descr_t *descrs,
-                                 kmp_size_t count) {
+static void __kmp_debug_taskgraph_exec_descr(kmp_taskgraph_exec_descr_t *descrs,
+                                             kmp_size_t count) {
   fprintf(stderr, "digraph ExecDescr {\n");
   fprintf(stderr, "  end [shape=diamond]\n");
   for (kmp_size_t i = 0; i < count; i++) {
     kmp_taskgraph_exec_descr_t *descr = &descrs[i];
     fprintf(stderr, "  \"%p\" [label=< <B>", descr->region);
     switch (descr->region->type) {
-      case TASKGRAPH_REGION_PARALLEL:
-        fprintf(stderr, "par</B> %p<BR/>preds=%d", descr->region,
+    case TASKGRAPH_REGION_PARALLEL:
+      fprintf(stderr, "par</B> %p<BR/>preds=%d", descr->region,
+              descr->npredecessors.load());
+      break;
+    case TASKGRAPH_REGION_EXCLUSIVE:
+      fprintf(stderr, "excl</B> %p<BR/>preds=%d", descr->region,
+              descr->npredecessors.load());
+      break;
+    case TASKGRAPH_REGION_NODE:
+      if (descr->region->task.node->u.resolved.count > 1) {
+        fprintf(stderr, "task</B> %p<BR/>preds=%d instances=%d",
+                descr->region->task.node, descr->npredecessors.load(),
+                descr->region->task.node->u.resolved.count);
+      } else {
+        fprintf(stderr, "task</B> %p<BR/>preds=%d", descr->region->task.node,
                 descr->npredecessors.load());
-        break;
-      case TASKGRAPH_REGION_EXCLUSIVE:
-        fprintf(stderr, "excl</B> %p<BR/>preds=%d", descr->region,
+      }
+      break;
+    case TASKGRAPH_REGION_WAIT:
+      if (descr->region->task.node->u.resolved.count > 1) {
+        fprintf(stderr, "wait</B> %p<BR/>preds=%d instances=%d", descr->region,
+                descr->npredecessors.load(),
+                descr->region->task.node->u.resolved.count);
+      } else {
+        fprintf(stderr, "wait</B> %p<BR/>preds=%d", descr->region,
                 descr->npredecessors.load());
-        break;
-      case TASKGRAPH_REGION_NODE:
-        if (descr->region->task.node->u.resolved.count > 1) {
-          fprintf(stderr, "task</B> %p<BR/>preds=%d instances=%d",
-                  descr->region->task.node,
-                  descr->npredecessors.load(),
-                  descr->region->task.node->u.resolved.count);
-        } else {
-          fprintf(stderr, "task</B> %p<BR/>preds=%d", descr->region->task.node,
-                  descr->npredecessors.load());
-        }
-        break;
-      case TASKGRAPH_REGION_WAIT:
-        if (descr->region->task.node->u.resolved.count > 1) {
-          fprintf(stderr, "wait</B> %p<BR/>preds=%d instances=%d",
-                  descr->region, descr->npredecessors.load(),
-                  descr->region->task.node->u.resolved.count);
-        } else {
-          fprintf(stderr, "wait</B> %p<BR/>preds=%d", descr->region,
-                  descr->npredecessors.load());
-        }
-        break;
-      default:
-        fprintf(stderr, "???</B>");
+      }
+      break;
+    default:
+      fprintf(stderr, "???</B>");
     }
     fprintf(stderr, " >, shape=box]\n");
 
@@ -2312,9 +2298,10 @@ __kmp_debug_taskgraph_exec_descr(kmp_taskgraph_exec_descr_t *descrs,
          descr->region->type == TASKGRAPH_REGION_WAIT) &&
         descr->region->task.node->u.resolved.count > 1) {
       kmp_taskgraph_region_t *region = descr->region;
-      fprintf(stderr,
-              "  \"%p\" -> \"%p\" [style=dotted, color=blue, constraint=false]\n",
-              region, region->task.next_instance);
+      fprintf(
+          stderr,
+          "  \"%p\" -> \"%p\" [style=dotted, color=blue, constraint=false]\n",
+          region, region->task.next_instance);
     }
 
     if (descr->successor) {
@@ -2344,9 +2331,8 @@ __kmp_debug_taskgraph_exec_descr(kmp_taskgraph_exec_descr_t *descrs,
 }
 #endif
 
-static void
-__kmp_exec_descr_link_instances(kmp_taskgraph_exec_descr_t *descrs,
-                                kmp_size_t count) {
+static void __kmp_exec_descr_link_instances(kmp_taskgraph_exec_descr_t *descrs,
+                                            kmp_size_t count) {
   for (kmp_size_t i = 0; i < count; i++) {
     kmp_taskgraph_exec_descr_t *descr = &descrs[i];
     if (descr->region->type == TASKGRAPH_REGION_NODE ||
@@ -2357,9 +2343,10 @@ __kmp_exec_descr_link_instances(kmp_taskgraph_exec_descr_t *descrs,
 
 /// Reset, reparent and regroup the recorded task TASK and re-invoke it.
 
-static void
-__kmp_omp_tg_task(kmp_int32 gtid, kmp_task_t *task, kmp_taskgroup_t *taskgroup,
-                  kmp_taskdata_t *parent_taskdata, bool serialize_immediate) {
+static void __kmp_omp_tg_task(kmp_int32 gtid, kmp_task_t *task,
+                              kmp_taskgroup_t *taskgroup,
+                              kmp_taskdata_t *parent_taskdata,
+                              bool serialize_immediate) {
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
   taskdata->td_parent = parent_taskdata;
 
@@ -2368,8 +2355,8 @@ __kmp_omp_tg_task(kmp_int32 gtid, kmp_task_t *task, kmp_taskgroup_t *taskgroup,
   taskdata->td_flags.freed = 0;
   taskdata->td_flags.executing = 0;
   taskdata->td_flags.task_serial =
-      (parent_taskdata->td_flags.final ||
-        taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser);
+      (parent_taskdata->td_flags.final || taskdata->td_flags.team_serial ||
+       taskdata->td_flags.tasking_ser);
 
   KMP_ATOMIC_ST_RLX(&taskdata->td_untied_count, 0);
   KMP_ATOMIC_ST_RLX(&taskdata->td_incomplete_child_tasks, 0);
@@ -2389,79 +2376,76 @@ struct kmp_taskred_input;
 template <typename T>
 void *__kmp_task_reduction_init(int gtid, int num, T *data);
 
-static void
-__kmp_taskgraph_exec_descr_start(kmp_int32 gtid, kmp_info_t *thread,
-                                 kmp_taskgraph_exec_descr_t *descr,
-                                 kmp_taskgroup_t *taskgroup) {
+static void __kmp_taskgraph_exec_descr_start(kmp_int32 gtid, kmp_info_t *thread,
+                                             kmp_taskgraph_exec_descr_t *descr,
+                                             kmp_taskgroup_t *taskgroup) {
   kmp_int32 npredecessors = KMP_ATOMIC_DEC(&descr->npredecessors) - 1;
   if (npredecessors > 0)
     return;
 
   switch (descr->region->type) {
-    case TASKGRAPH_REGION_NODE:
-    case TASKGRAPH_REGION_WAIT: {
-      kmp_taskgraph_exec_descr_t *lowest_descr = nullptr, *iter = descr;
-      do {
-        if (!lowest_descr || lowest_descr > iter)
-          lowest_descr = iter;
-        iter = iter->next_instance;
-      } while (iter != descr);
-      kmp_int32 nblocks = KMP_ATOMIC_DEC(&lowest_descr->nblocks);
-      if (nblocks <= 0) {
-        if (descr->region->type == TASKGRAPH_REGION_NODE) {
-          kmp_task_t *task = descr->region->task.node->task;
-          kmp_taskdata_t *current_taskdata = thread->th.th_current_task;
-          __kmp_omp_tg_task(gtid, task, taskgroup, current_taskdata, false);
-        } else {
-          // There's no task for a 'taskwait', so start successors immediately.
-          kmp_taskgraph_exec_descr_t *walk = descr;
-          do {
-            if (walk->successor) {
-              __kmp_taskgraph_exec_descr_start(gtid, thread, walk->successor,
-                                               taskgroup);
-            }
-            walk = walk->next_instance;
-          } while (walk != descr);
-
-        }
-      }
-      break;
-    }
-    case TASKGRAPH_REGION_PARALLEL: {
-      if (descr->region->reduce_input) {
-        // If there are reductions associated with this parallel region, we
-        // start a new taskgroup here.
-        __kmpc_taskgroup(/*loc=*/nullptr, gtid);
-        // Update variable to the newly-created taskgroup.
-        taskgroup = thread->th.th_current_task->td_taskgroup;
-        __kmp_task_reduction_init(gtid,
-                                  descr->region->reduce_input->reduce_num_data,
-                                  (struct kmp_taskred_input *)
-                                    descr->region->reduce_input->reduce_data);
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT: {
+    kmp_taskgraph_exec_descr_t *lowest_descr = nullptr, *iter = descr;
+    do {
+      if (!lowest_descr || lowest_descr > iter)
+        lowest_descr = iter;
+      iter = iter->next_instance;
+    } while (iter != descr);
+    kmp_int32 nblocks = KMP_ATOMIC_DEC(&lowest_descr->nblocks);
+    if (nblocks <= 0) {
+      if (descr->region->type == TASKGRAPH_REGION_NODE) {
+        kmp_task_t *task = descr->region->task.node->task;
+        kmp_taskdata_t *current_taskdata = thread->th.th_current_task;
+        __kmp_omp_tg_task(gtid, task, taskgroup, current_taskdata, false);
+      } else {
+        // There's no task for a 'taskwait', so start successors immediately.
+        kmp_taskgraph_exec_descr_t *walk = descr;
+        do {
+          if (walk->successor) {
+            __kmp_taskgraph_exec_descr_start(gtid, thread, walk->successor,
+                                             taskgroup);
+          }
+          walk = walk->next_instance;
+        } while (walk != descr);
       }
-      kmp_taskgraph_exec_descr_t *head = descr->successor;
-      kmp_taskgraph_exec_descr_t *item = head;
-      do {
-        __kmp_taskgraph_exec_descr_start(gtid, thread, item, taskgroup);
-        item = item->sibling;
-      } while (item != head);
-      if (descr->region->reduce_input)
-        __kmpc_end_taskgroup(/*loc=*/nullptr, gtid);
-      break;
     }
-    case TASKGRAPH_REGION_EXCLUSIVE: {
-      kmp_taskgraph_exec_descr_t *head = descr->successor;
-      kmp_taskgraph_exec_descr_t *item = head;
-      do {
-        assert(item->region->type == TASKGRAPH_REGION_NODE);
-        kmp_task_t *task = item->region->task.node->task;
-        kmp_taskdata_t *current_taskdata = thread->th.th_current_task;
-        __kmp_omp_tg_task(gtid, task, taskgroup, current_taskdata, true);
-        item = item->sibling;
-      } while (item != head);
-      break;
+    break;
+  }
+  case TASKGRAPH_REGION_PARALLEL: {
+    if (descr->region->reduce_input) {
+      // If there are reductions associated with this parallel region, we
+      // start a new taskgroup here.
+      __kmpc_taskgroup(/*loc=*/nullptr, gtid);
+      // Update variable to the newly-created taskgroup.
+      taskgroup = thread->th.th_current_task->td_taskgroup;
+      __kmp_task_reduction_init(
+          gtid, descr->region->reduce_input->reduce_num_data,
+          (struct kmp_taskred_input *)descr->region->reduce_input->reduce_data);
     }
-    default: ;
+    kmp_taskgraph_exec_descr_t *head = descr->successor;
+    kmp_taskgraph_exec_descr_t *item = head;
+    do {
+      __kmp_taskgraph_exec_descr_start(gtid, thread, item, taskgroup);
+      item = item->sibling;
+    } while (item != head);
+    if (descr->region->reduce_input)
+      __kmpc_end_taskgroup(/*loc=*/nullptr, gtid);
+    break;
+  }
+  case TASKGRAPH_REGION_EXCLUSIVE: {
+    kmp_taskgraph_exec_descr_t *head = descr->successor;
+    kmp_taskgraph_exec_descr_t *item = head;
+    do {
+      assert(item->region->type == TASKGRAPH_REGION_NODE);
+      kmp_task_t *task = item->region->task.node->task;
+      kmp_taskdata_t *current_taskdata = thread->th.th_current_task;
+      __kmp_omp_tg_task(gtid, task, taskgroup, current_taskdata, true);
+      item = item->sibling;
+    } while (item != head);
+    break;
+  }
+  default:;
   }
 }
 
@@ -2469,60 +2453,58 @@ static bool
 __kmp_taskgraph_exec_descr_finish(kmp_int32 gtid, kmp_info_t *thread,
                                   kmp_taskgraph_exec_descr_t *descr) {
   switch (descr->region->type) {
-    case TASKGRAPH_REGION_NODE: {
-      kmp_task_t *task = descr->region->task.node->task;
-      kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-      taskdata->td_flags.started = 0;
-      taskdata->td_flags.executing = 0;
-      taskdata->td_flags.complete = 0;
-      taskdata->td_flags.freed = 0;
-      bool any_successors = false;
-      kmp_taskgraph_exec_descr_t *walk = descr;
-      do {
-        if (walk->successor) {
-          any_successors = true;
-          __kmp_taskgraph_exec_descr_start(gtid, thread, walk->successor,
-                                          taskdata->td_taskgroup);
-        }
-        walk = walk->next_instance;
-      } while (walk != descr);
-      return any_successors;
-    }
-    default:
-      fprintf(stderr, "unexpected exec descr type for finish? (%p)\n", descr);
-      exit(1);
+  case TASKGRAPH_REGION_NODE: {
+    kmp_task_t *task = descr->region->task.node->task;
+    kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+    taskdata->td_flags.started = 0;
+    taskdata->td_flags.executing = 0;
+    taskdata->td_flags.complete = 0;
+    taskdata->td_flags.freed = 0;
+    bool any_successors = false;
+    kmp_taskgraph_exec_descr_t *walk = descr;
+    do {
+      if (walk->successor) {
+        any_successors = true;
+        __kmp_taskgraph_exec_descr_start(gtid, thread, walk->successor,
+                                         taskdata->td_taskgroup);
+      }
+      walk = walk->next_instance;
+    } while (walk != descr);
+    return any_successors;
+  }
+  default:
+    fprintf(stderr, "unexpected exec descr type for finish? (%p)\n", descr);
+    exit(1);
   }
 
   return false;
 }
 
-static kmp_size_t
-__kmp_exec_descr_count(kmp_taskgraph_region_t *region) {
+static kmp_size_t __kmp_exec_descr_count(kmp_taskgraph_region_t *region) {
   kmp_size_t sum = 0;
 
   switch (region->type) {
-    case TASKGRAPH_REGION_ENTRY:
-    case TASKGRAPH_REGION_EXIT:
-      return 0;
-    case TASKGRAPH_REGION_NODE:
-    case TASKGRAPH_REGION_WAIT:
-      return 1;
-    case TASKGRAPH_REGION_PARALLEL:
-    case TASKGRAPH_REGION_EXCLUSIVE:
-      sum++;
-      KMP_FALLTHROUGH();
-    case TASKGRAPH_REGION_SEQUENTIAL:
-      for (kmp_int32 i = 0; i < region->inner.num_children; i++)
-        sum += __kmp_exec_descr_count(region->inner.children[i]);
-      break;
-    default:
-      fprintf(stderr, "unexpected region type\n");
-      exit(1);
+  case TASKGRAPH_REGION_ENTRY:
+  case TASKGRAPH_REGION_EXIT:
+    return 0;
+  case TASKGRAPH_REGION_NODE:
+  case TASKGRAPH_REGION_WAIT:
+    return 1;
+  case TASKGRAPH_REGION_PARALLEL:
+  case TASKGRAPH_REGION_EXCLUSIVE:
+    sum++;
+    KMP_FALLTHROUGH();
+  case TASKGRAPH_REGION_SEQUENTIAL:
+    for (kmp_int32 i = 0; i < region->inner.num_children; i++)
+      sum += __kmp_exec_descr_count(region->inner.children[i]);
+    break;
+  default:
+    fprintf(stderr, "unexpected region type\n");
+    exit(1);
   }
   return sum;
 }
 
-
 // Task Reduction implementation
 //
 // Note: initial implementation didn't take into account the possibility
@@ -2710,8 +2692,8 @@ void *__kmpc_taskred_init(int gtid, int num, void *data) {
   return __kmp_task_reduction_init(gtid, num, (kmp_taskred_input_t *)data);
 }
 
-static kmp_taskgraph_record_t *__kmp_taskgraph_or_parent_recording(
-    kmp_taskgroup_t *taskgroup) {
+static kmp_taskgraph_record_t *
+__kmp_taskgraph_or_parent_recording(kmp_taskgroup_t *taskgroup) {
   kmp_taskgraph_record_t *rec = nullptr;
 
   for (; taskgroup; taskgroup = taskgroup->parent) {
@@ -2732,13 +2714,15 @@ void *__kmpc_taskgraph_taskred_init(kmp_int32 gtid, kmp_int32 num, void *data) {
     kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&rec->status);
     if (status == KMP_TDG_RECORDING) {
       kmp_taskgraph_reduce_input_data_t *input_data =
-        (kmp_taskgraph_reduce_input_data_t *)
-          __kmp_fast_allocate(thread,
-                              sizeof(kmp_taskgraph_reduce_input_data_t));
+          (kmp_taskgraph_reduce_input_data_t *)__kmp_fast_allocate(
+              thread, sizeof(kmp_taskgraph_reduce_input_data_t));
       // The compiler might build the reduction input data on the stack, so
       // we must make a copy.
-      input_data->reduce_data = __kmp_fast_allocate(thread, sizeof(kmp_taskred_input_t) * num);
-      KMP_MEMCPY(input_data->reduce_data, data, sizeof(kmp_taskred_input_t) * num);;
+      input_data->reduce_data =
+          __kmp_fast_allocate(thread, sizeof(kmp_taskred_input_t) * num);
+      KMP_MEMCPY(input_data->reduce_data, data,
+                 sizeof(kmp_taskred_input_t) * num);
+      ;
       input_data->reduce_num_data = num;
       taskgroup->taskgraph.reduce_input = input_data;
     } else if (status == KMP_TDG_READY)
@@ -3175,29 +3159,26 @@ void __kmpc_end_taskgroup(ident_t *loc, int gtid) {
 #endif
 }
 
-void
-__kmp_replay_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
-                       kmp_taskgraph_record_t *taskgraph, kmp_uint32 graph_id,
-                       kmp_taskgroup_t *taskgroup) {
+void __kmp_replay_taskgraph(kmp_int32 gtid, kmp_taskdata_t *current_taskdata,
+                            kmp_taskgraph_record_t *taskgraph,
+                            kmp_uint32 graph_id, kmp_taskgroup_t *taskgroup) {
   kmp_info_t *thread = __kmp_threads[gtid];
 
   kmp_taskgraph_exec_descr_t *exec_descrs = taskgraph->exec_descrs;
 
   if (!exec_descrs) {
     kmp_int32 exec_descr_count = __kmp_exec_descr_count(taskgraph->root);
-    exec_descrs =
-      (kmp_taskgraph_exec_descr_t*)__kmp_thread_malloc(thread,
-          exec_descr_count * sizeof(kmp_taskgraph_exec_descr_t));
+    exec_descrs = (kmp_taskgraph_exec_descr_t *)__kmp_thread_malloc(
+        thread, exec_descr_count * sizeof(kmp_taskgraph_exec_descr_t));
     taskgraph->exec_descrs = exec_descrs;
     taskgraph->exec_descr_size = exec_descr_count;
   }
 
   kmp_taskgraph_exec_descr_t *succs_to_fill = nullptr;
   kmp_size_t next_idx = 0;
-  kmp_taskgraph_exec_descr_t *head =
-    __kmp_fill_exec_descr(gtid, thread, taskgraph, taskgraph->root,
-                          current_taskdata, exec_descrs, next_idx,
-                          &succs_to_fill);
+  kmp_taskgraph_exec_descr_t *head = __kmp_fill_exec_descr(
+      gtid, thread, taskgraph, taskgraph->root, current_taskdata, exec_descrs,
+      next_idx, &succs_to_fill);
   assert(next_idx == taskgraph->exec_descr_size);
 
   __kmp_exec_descr_link_instances(exec_descrs, taskgraph->exec_descr_size);
@@ -4984,9 +4965,9 @@ public:
   }
 };
 
-kmp_taskgraph_node_t* __kmp_taskgraph_node_alloc(kmp_taskgraph_record_t *rec,
-                                                 kmp_task_t *task,
-                                                 kmp_size_t *index_p = nullptr) {
+kmp_taskgraph_node_t *
+__kmp_taskgraph_node_alloc(kmp_taskgraph_record_t *rec, kmp_task_t *task,
+                           kmp_size_t *index_p = nullptr) {
   kmp_int32 gtid = rec->gtid;
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgraph_node_t *new_task = nullptr;
@@ -4995,14 +4976,14 @@ kmp_taskgraph_node_t* __kmp_taskgraph_node_alloc(kmp_taskgraph_record_t *rec,
 
   if (!rec->record_map) {
     rec->nodes_allocated = 4;
-    rec->record_map = (kmp_taskgraph_node_t *)__kmp_thread_malloc(thread,
-                        rec->nodes_allocated * sizeof(kmp_taskgraph_node_t));
+    rec->record_map = (kmp_taskgraph_node_t *)__kmp_thread_malloc(
+        thread, rec->nodes_allocated * sizeof(kmp_taskgraph_node_t));
   }
 
   if (rec->num_tasks >= rec->nodes_allocated) {
-    rec->record_map =
-      (kmp_taskgraph_node_t *)__kmp_thread_realloc(thread, rec->record_map,
-                      2 * rec->nodes_allocated * sizeof(kmp_taskgraph_node_t));
+    rec->record_map = (kmp_taskgraph_node_t *)__kmp_thread_realloc(
+        thread, rec->record_map,
+        2 * rec->nodes_allocated * sizeof(kmp_taskgraph_node_t));
     rec->nodes_allocated *= 2;
   }
 
@@ -5039,18 +5020,15 @@ kmp_taskgraph_node_t* __kmp_taskgraph_node_alloc(kmp_taskgraph_record_t *rec,
 // tc         Iterations count
 // task_dup   Tasks duplication routine
 // codeptr_ra Return address for OMPT events
-static void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
-                                  kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
-                                  kmp_int32 nogroup, kmp_uint64 ub_glob,
-                                  kmp_uint64 num_tasks, kmp_uint64 grainsize,
-                                  kmp_uint64 extras, kmp_int64 last_chunk,
-                                  kmp_uint64 tc,
+static void __kmp_taskloop_linear(
+    ident_t *loc, int gtid, kmp_task_t *task, kmp_uint64 *lb, kmp_uint64 *ub,
+    kmp_int64 st, kmp_int32 nogroup, kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+    kmp_uint64 grainsize, kmp_uint64 extras, kmp_int64 last_chunk,
+    kmp_uint64 tc,
 #if OMPT_SUPPORT
-                                  void *codeptr_ra,
+    void *codeptr_ra,
 #endif
-                                  void *task_dup,
-                                  kmp_taskgraph_record_t *taskgraph_rec =
-                                    nullptr) {
+    void *task_dup, kmp_taskgraph_record_t *taskgraph_rec = nullptr) {
   KMP_COUNT_BLOCK(OMP_TASKLOOP);
   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
@@ -5134,7 +5112,7 @@ static void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
       kmp_size_t rec_index = -1;
       // Record the task in the taskgraph.
       kmp_taskgraph_node_t *node =
-        __kmp_taskgraph_node_alloc(taskgraph_rec, next_task, &rec_index);
+          __kmp_taskgraph_node_alloc(taskgraph_rec, next_task, &rec_index);
       kmp_taskgroup_t *taskgroup = current_task->td_taskgroup;
       if (taskgroup->taskgraph.reduce_input) {
         node->reduce_input = taskgroup->taskgraph.reduce_input;
@@ -5151,8 +5129,8 @@ static void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
       if (nogroup)
         taskgraph_rec->record_map[rec_index].u.unresolved.cfg_successor = -1;
       else if (taskloop_prev_idx != -1)
-        taskgraph_rec->record_map[taskloop_prev_idx].u.unresolved.cfg_successor =
-          rec_index;
+        taskgraph_rec->record_map[taskloop_prev_idx]
+            .u.unresolved.cfg_successor = rec_index;
       if (taskloop_first_idx == -1)
         taskloop_first_idx = rec_index;
       taskloop_prev_idx = rec_index;
@@ -5177,14 +5155,14 @@ static void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
       // Create a node to act as an "end group" marker.
       kmp_size_t endgroup_idx = -1;
       kmp_taskgraph_node_t *endgrpnode =
-        __kmp_taskgraph_node_alloc(taskgraph_rec, nullptr, &endgroup_idx);
+          __kmp_taskgraph_node_alloc(taskgraph_rec, nullptr, &endgroup_idx);
       endgrpnode->taskloop_task = true;
       // Point all the cfg_successor indices to this node now.
       for (kmp_int32 looptask = taskloop_first_idx; looptask != -1;) {
         kmp_int32 next_task =
-          taskgraph_rec->record_map[looptask].u.unresolved.cfg_successor;
+            taskgraph_rec->record_map[looptask].u.unresolved.cfg_successor;
         taskgraph_rec->record_map[looptask].u.unresolved.cfg_successor =
-          endgroup_idx;
+            endgroup_idx;
         looptask = next_task;
       }
     }
@@ -5391,7 +5369,7 @@ void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
 
 #if OMP_TASKGRAPH_EXPERIMENTAL
   kmp_taskdata_t *new_task_data = KMP_TASK_TO_TASKDATA(new_task);
-  //new_task_data->tdg = taskdata->tdg;
+  // new_task_data->tdg = taskdata->tdg;
   new_task_data->owning_taskgraph = nullptr;
 #endif
 
@@ -5432,7 +5410,7 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
 #if OMPT_SUPPORT && OMPT_OPTIONAL
     OMPT_STORE_RETURN_ADDRESS(gtid);
 #endif
-      // This is unreachable, I think.
+    // This is unreachable, I think.
     if (!taskgraph_rec)
       __kmpc_taskgroup(loc, gtid);
   }
@@ -5548,8 +5526,9 @@ static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
                           OMPT_GET_RETURN_ADDRESS(0),
 #endif
                           task_dup, taskgraph_rec);
-  // check if clause value next
-  // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native)
+    // check if clause value next
+    // Also require GOMP_taskloop to reduce to linear
+    // (taskdata->td_flags.native)
   } else if (if_val == 0) { // if(0) specified, mark task as serial
     taskdata->td_flags.task_serial = 1;
     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
@@ -5702,11 +5681,12 @@ bool __kmpc_omp_has_task_team(kmp_int32 gtid) {
 
 #if OMP_TASKGRAPH_EXPERIMENTAL
 
-static kmp_taskgraph_record_t*
-__kmp_taskgraph_alloc(kmp_int32 gtid, kmp_int32 graph_id) {
+static kmp_taskgraph_record_t *__kmp_taskgraph_alloc(kmp_int32 gtid,
+                                                     kmp_int32 graph_id) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgraph_record_t *new_rec =
-    (kmp_taskgraph_record_t *)__kmp_fast_allocate(thread, sizeof(kmp_taskgraph_record_t));
+      (kmp_taskgraph_record_t *)__kmp_fast_allocate(
+          thread, sizeof(kmp_taskgraph_record_t));
   new_rec->status = KMP_TDG_RECORDING;
   new_rec->gtid = gtid;
   new_rec->graph_id = graph_id;
@@ -5726,8 +5706,10 @@ __kmp_taskgraph_alloc(kmp_int32 gtid, kmp_int32 graph_id) {
 // Clone a (new) task that has had its private variables and shared variables
 // initialised already.
 static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
-    kmp_taskgraph_record_t *taskgraph, kmp_task_t *orig,
-    size_t sizeof_kmp_task_t, size_t sizeof_shareds) {
+                                              kmp_taskgraph_record_t *taskgraph,
+                                              kmp_task_t *orig,
+                                              size_t sizeof_kmp_task_t,
+                                              size_t sizeof_shareds) {
   // FIXME: This should use a "taskdup" function like taskloops in cases where
   // private variables are not trivially copyable.  For now, do it by plain
   // bitwise copy.
@@ -5738,7 +5720,8 @@ static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(orig);
   size_t shareds_offset = sizeof(kmp_taskdata_t) + sizeof_kmp_task_t;
   shareds_offset = __kmp_round_up_to_val(shareds_offset, sizeof(kmp_uint64));
-  kmp_taskdata_t *copy_td = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset + sizeof_shareds);
+  kmp_taskdata_t *copy_td = (kmp_taskdata_t *)__kmp_fast_allocate(
+      thread, shareds_offset + sizeof_shareds);
   KMP_MEMCPY(copy_td, taskdata, shareds_offset + sizeof_shareds);
   // Tasks cloned for a taskgraph always have this field set.
   copy_td->owning_taskgraph = taskgraph;
@@ -5757,10 +5740,11 @@ static kmp_task_t *__kmp_taskgraph_clone_task(kmp_info_t *thread,
 // entry:       Pointer to the entry function
 // args:        Pointer to the function arguments
 void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
-                      std::atomic<void*> *tdg_handle, kmp_uint32 graph_id,
+                      std::atomic<void *> *tdg_handle, kmp_uint32 graph_id,
                       kmp_int32 graph_reset, kmp_int32 nogroup,
                       void (*entry)(void *), void *args) {
-  kmp_taskgraph_record_t *record = (kmp_taskgraph_record_t*)KMP_ATOMIC_LD_ACQ(tdg_handle);
+  kmp_taskgraph_record_t *record =
+      (kmp_taskgraph_record_t *)KMP_ATOMIC_LD_ACQ(tdg_handle);
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgroup_t *taskgroup;
 
@@ -5778,11 +5762,11 @@ void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
 
   if (!record) {
     record = __kmp_taskgraph_alloc(gtid, graph_id);
-    // Another thread may have allocated the taskgraph already.  Check that here.
+    // Another thread may have allocated the taskgraph already.  Check that
+    // here.
     kmp_taskgraph_record_t *other =
-      (kmp_taskgraph_record_t *)KMP_COMPARE_AND_STORE_RET64(tdg_handle,
-                                                            nullptr,
-                                                            record);
+        (kmp_taskgraph_record_t *)KMP_COMPARE_AND_STORE_RET64(tdg_handle,
+                                                              nullptr, record);
     if (other != nullptr) {
       __kmp_fast_free(thread, record);
       record = other;
@@ -5807,7 +5791,7 @@ void __kmpc_taskgraph(ident_t *loc_ref, kmp_int32 gtid,
   else if (status == KMP_TDG_READY) {
     kmp_taskdata *current_taskdata = thread->th.th_current_task;
     KG_TRACE(10, ("Replay taskgraph %p from task %p\n", record,
-             KMP_TASKDATA_TO_TASK(current_taskdata)));
+                  KMP_TASKDATA_TO_TASK(current_taskdata)));
     __kmp_acquire_lock(&record->map_lock, gtid);
     __kmp_replay_taskgraph(gtid, current_taskdata, record, graph_id, taskgroup);
     __kmpc_end_taskgroup(loc_ref, gtid);
@@ -5837,9 +5821,8 @@ kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
   if (rec) {
     kmp_taskgraph_status_t status = KMP_ATOMIC_LD_ACQ(&rec->status);
     if (status == KMP_TDG_RECORDING) {
-      kmp_task_t *cloned_task =
-        __kmp_taskgraph_clone_task(thread, rec, new_task, sizeof_kmp_task_t,
-                                   sizeof_shareds);
+      kmp_task_t *cloned_task = __kmp_taskgraph_clone_task(
+          thread, rec, new_task, sizeof_kmp_task_t, sizeof_shareds);
       kmp_taskgraph_node_t *node = __kmp_taskgraph_node_alloc(rec, cloned_task);
       if (taskgroup->taskgraph.reduce_input) {
         node->reduce_input = taskgroup->taskgraph.reduce_input;
@@ -5848,25 +5831,28 @@ kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
 #if defined(DEBUG_TASKGRAPH)
       fprintf(stderr, "__kmpc_taskgraph_task: record task here!\n");
       fprintf(stderr, "private size: %d, shared size: %d\n",
-              (int)(sizeof_kmp_task_t - sizeof(kmp_task_t)), (int)sizeof_shareds);
-      fprintf(stderr, "ndeps: %d\n", (int) ndeps);
+              (int)(sizeof_kmp_task_t - sizeof(kmp_task_t)),
+              (int)sizeof_shareds);
+      fprintf(stderr, "ndeps: %d\n", (int)ndeps);
       fprintf(stderr, "gtid: %d rec->gtid: %d\n", gtid, rec->gtid);
-      fprintf(stderr, "taskgroup: %p\n", thread->th.th_current_task->td_taskgroup);
+      fprintf(stderr, "taskgroup: %p\n",
+              thread->th.th_current_task->td_taskgroup);
       kmp_taskdata_t *parent = thread->th.th_current_task->td_parent;
       while (parent) {
-        fprintf(stderr, "  parent: %p (taskgroup %p)\n", parent, parent->td_taskgroup);
+        fprintf(stderr, "  parent: %p (taskgroup %p)\n", parent,
+                parent->td_taskgroup);
         parent = parent->td_parent;
       }
 #endif
       node->u.unresolved.ndeps = ndeps;
-      node->u.unresolved.dep_list =
-        (kmp_depend_info_t *)__kmp_thread_malloc(thread,
-                                                 ndeps * sizeof(kmp_depend_info_t));
+      node->u.unresolved.dep_list = (kmp_depend_info_t *)__kmp_thread_malloc(
+          thread, ndeps * sizeof(kmp_depend_info_t));
       KMP_MEMCPY(node->u.unresolved.dep_list, dep_list,
                  ndeps * sizeof(kmp_depend_info_t));
     } else if (status == KMP_TDG_READY) {
 #ifdef DEBUG_TASKGRAPH
-      fprintf(stderr, "non-taskgraph task entry point for task in finalized taskgraph");
+      fprintf(stderr,
+              "non-taskgraph task entry point for task in finalized taskgraph");
 #endif
       return 0;
     }
@@ -5881,15 +5867,15 @@ kmp_uint32 __kmpc_taskgraph_task(ident_t *loc_ref, kmp_int32 gtid,
   if (ndeps == 0)
     res = __kmpc_omp_task(loc_ref, gtid, new_task);
   else
-    res = __kmpc_omp_task_with_deps(loc_ref, gtid, new_task, ndeps, dep_list,
-                                    0, nullptr);
+    res = __kmpc_omp_task_with_deps(loc_ref, gtid, new_task, ndeps, dep_list, 0,
+                                    nullptr);
 
   return res;
 }
 
-void
-__kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
-                          kmp_depend_info_t *dep_list, kmp_int32 has_no_wait) {
+void __kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid,
+                               kmp_int32 ndeps, kmp_depend_info_t *dep_list,
+                               kmp_int32 has_no_wait) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgroup_t *taskgroup = thread->th.th_current_task->td_taskgroup;
   kmp_taskgraph_record_t *rec = __kmp_taskgraph_or_parent_recording(taskgroup);
@@ -5900,17 +5886,20 @@ __kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
       kmp_taskgraph_node_t *node = __kmp_taskgraph_node_alloc(rec, nullptr);
 #ifdef DEBUG_TASKGRAPH
       fprintf(stderr, "__kmpc_taskgraph_taskwait: record taskwait here!\n");
-      fprintf(stderr, "ndeps: %d\n", (int) ndeps);
+      fprintf(stderr, "ndeps: %d\n", (int)ndeps);
 #endif
       node->u.unresolved.ndeps = ndeps;
-      node->u.unresolved.dep_list = (kmp_depend_info_t *)__kmp_thread_malloc(thread, ndeps * sizeof(kmp_depend_info_t));
-      KMP_MEMCPY(node->u.unresolved.dep_list, dep_list, ndeps * sizeof(kmp_depend_info_t));
+      node->u.unresolved.dep_list = (kmp_depend_info_t *)__kmp_thread_malloc(
+          thread, ndeps * sizeof(kmp_depend_info_t));
+      KMP_MEMCPY(node->u.unresolved.dep_list, dep_list,
+                 ndeps * sizeof(kmp_depend_info_t));
       // TODO: Record has_no_wait somewhere?
-      //if (has_no_wait)
+      // if (has_no_wait)
       //  return;
     } else if (status == KMP_TDG_READY) {
 #ifdef DEBUG_TASKGRAPH
-      fprintf(stderr, "non-taskgraph taskwait entry point for taskwait in finalized taskgraph\n");
+      fprintf(stderr, "non-taskgraph taskwait entry point for taskwait in "
+                      "finalized taskgraph\n");
 #endif
       return;
     }
@@ -5920,14 +5909,14 @@ __kmpc_taskgraph_taskwait(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps,
                               has_no_wait);
 }
 
-kmp_uint32
-__kmpc_taskgraph_taskloop(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task,
-                          kmp_int32 flags, size_t sizeof_kmp_task_t,
-                          void *shareds, size_t sizeof_shareds,
-                          kmp_int32 if_val, kmp_uint64 *lb, kmp_uint64 *ub,
-                          kmp_int64 st, kmp_int32 nogroup, kmp_int32 sched,
-                          kmp_uint64 grainsize, kmp_int32 modifier,
-                          void *task_dup) {
+kmp_uint32 __kmpc_taskgraph_taskloop(ident_t *loc_ref, kmp_int32 gtid,
+                                     kmp_task_t *new_task, kmp_int32 flags,
+                                     size_t sizeof_kmp_task_t, void *shareds,
+                                     size_t sizeof_shareds, kmp_int32 if_val,
+                                     kmp_uint64 *lb, kmp_uint64 *ub,
+                                     kmp_int64 st, kmp_int32 nogroup,
+                                     kmp_int32 sched, kmp_uint64 grainsize,
+                                     kmp_int32 modifier, void *task_dup) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskgroup_t *taskgroup = thread->th.th_current_task->td_taskgroup;
   kmp_taskgraph_record_t *rec = __kmp_taskgraph_or_parent_recording(taskgroup);
@@ -5939,7 +5928,8 @@ __kmpc_taskgraph_taskloop(ident_t *loc_ref, kmp_int32 gtid, kmp_task_t *new_task
                      sched, grainsize, modifier, task_dup, rec);
     else if (status == KMP_TDG_READY) {
 #ifdef DEBUG_TASKGRAPH
-      fprintf(stderr, "non-taskgraph taskloop entry point for taskloop in finalized taskgraph\n");
+      fprintf(stderr, "non-taskgraph taskloop entry point for taskloop in "
+                      "finalized taskgraph\n");
 #endif
       return 0;
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp
index d6abdb1e1..cfcb84e88 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_1.cpp
@@ -1,28 +1,33 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck
+// %s
 
-int main()
-{
+int main() {
   int deps[3];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[2])
-          { }
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(inout: deps[0])
-          { }
-          #pragma omp task depend(inout: deps[1])
-          { }
-          #pragma omp task depend(inout: deps[2])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1], deps[2])
-          { }
+#pragma omp task depend(out : deps[2])
+          {
+          }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(inout : deps[0])
+          {
+          }
+#pragma omp task depend(inout : deps[1])
+          {
+          }
+#pragma omp task depend(inout : deps[2])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1], deps[2])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp
index f3dd856f8..b37f739fa 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_10.cpp
@@ -1,29 +1,34 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[5];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1], deps[4])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1])
-          { }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1], deps[4])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1])
+          {
+          }
 
-          #pragma omp task depend(out: deps[2], deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[3], deps[4])
-          { }
-          #pragma omp task depend(in: deps[2], deps[3])
-          { }
+#pragma omp task depend(out : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[3], deps[4])
+          {
+          }
+#pragma omp task depend(in : deps[2], deps[3])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp
index 4f2babaef..634b4896c 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_11.cpp
@@ -1,33 +1,40 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1])
-          { }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1])
+          {
+          }
 
-          #pragma omp task depend(out: deps[2], deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(in: deps[2], deps[3])
-          { }
+#pragma omp task depend(out : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[2], deps[3])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp
index d3615187b..14973399e 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_12.cpp
@@ -1,32 +1,39 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[2];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(mutexinoutset: deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[1])
-          { }
+#pragma omp task depend(mutexinoutset : deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[1])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp
index de2b5e138..b04b97a81 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_13.cpp
@@ -1,28 +1,33 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(mutexinoutset: deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
-          { }
+#pragma omp task depend(mutexinoutset : deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[3])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp
index 684a196a8..d907c2827 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_14.cpp
@@ -1,30 +1,36 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(mutexinoutset: deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1], deps[2])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1], deps[2], deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[1], deps[2], deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3])
-          { }
+#pragma omp task depend(mutexinoutset : deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1], deps[2])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1], deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[1], deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp
index d35660ddf..9bfbb15d1 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_15.cpp
@@ -1,48 +1,63 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[1], deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[1], deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3], deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3], deps[1], deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3], deps[2])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3], deps[2], deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3], deps[2], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[3], deps[2], deps[1], deps[0])
-          { }
+#pragma omp task
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[1], deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[1], deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3], deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3], deps[1], deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3], deps[2])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3], deps[2], deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3], deps[2], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[3], deps[2], deps[1], deps[0])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp
index 45aa3c587..675472b73 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_16.cpp
@@ -1,32 +1,39 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[8];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(inout: deps[0])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[4], deps[7])
-          { }
-          #pragma omp task depend(inout: deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[4], deps[7])
-          { }
-          #pragma omp task depend(inout: deps[2])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[5], deps[6])
-          { }
-          #pragma omp task depend(inout: deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[5], deps[6])
-          { }
+#pragma omp task depend(inout : deps[0])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[4], deps[7])
+          {
+          }
+#pragma omp task depend(inout : deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[4], deps[7])
+          {
+          }
+#pragma omp task depend(inout : deps[2])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[5], deps[6])
+          {
+          }
+#pragma omp task depend(inout : deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[5], deps[6])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp
index 2c59595f5..3cc229cf0 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_17.cpp
@@ -2,42 +2,40 @@
 
 #include <cstdio>
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
+#pragma omp task depend(out : deps[0], deps[1])
           {
             fprintf(stderr, "task 0\n");
           }
-          #pragma omp task depend(out: deps[2], deps[3])
+#pragma omp task depend(out : deps[2], deps[3])
           {
             fprintf(stderr, "task 1\n");
           }
-          #pragma omp task depend(inout: deps[0])
+#pragma omp task depend(inout : deps[0])
           {
             fprintf(stderr, "task 2\n");
           }
-          #pragma omp task depend(inout: deps[1])
+#pragma omp task depend(inout : deps[1])
           {
             fprintf(stderr, "task 3\n");
           }
-          #pragma omp task depend(inout: deps[2])
+#pragma omp task depend(inout : deps[2])
           {
             fprintf(stderr, "task 4\n");
           }
-          #pragma omp task depend(inout: deps[3])
+#pragma omp task depend(inout : deps[3])
           {
             fprintf(stderr, "task 5\n");
           }
-          #pragma omp task depend(in: deps[0], deps[1], deps[2], deps[3])
+#pragma omp task depend(in : deps[0], deps[1], deps[2], deps[3])
           {
             fprintf(stderr, "task 6\n");
           }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp
index 954cfcbad..8ebd25dd4 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_18.cpp
@@ -1,24 +1,26 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[2];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp taskloop num_tasks(strict: 2)
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp taskloop num_tasks(strict : 2)
+          {
+            for (int j = 0; j < 20; j++) {
+            }
+          }
+#pragma omp task depend(in : deps[0], deps[1])
           {
-            for (int j = 0; j < 20; j++) { }
           }
-          #pragma omp task depend(in: deps[0], deps[1])
-          { }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp
index 24d5fdfcb..83ed4551e 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_19.cpp
@@ -2,31 +2,29 @@
 
 #include <cstdio>
 
-int main()
-{
+int main() {
   int deps[3];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 10; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 10; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0])
+#pragma omp task depend(out : deps[0])
           {
             fprintf(stderr, "task 0\n");
           }
-          #pragma omp task depend(out: deps[1])
+#pragma omp task depend(out : deps[1])
           {
             fprintf(stderr, "task 1\n");
           }
-          #pragma omp task depend(out: deps[2])
+#pragma omp task depend(out : deps[2])
           {
             fprintf(stderr, "task 2\n");
           }
-          #pragma omp taskwait depend(in: deps[0], deps[1], deps[2])
-          #pragma omp task depend(in: deps[0], deps[1], deps[2])
+#pragma omp taskwait depend(in : deps[0], deps[1], deps[2])
+#pragma omp task depend(in : deps[0], deps[1], deps[2])
           {
             fprintf(stderr, "task 3\n");
           }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp
index 89dd9137e..00e63d54e 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_2.cpp
@@ -1,30 +1,36 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | \
+// RUN: FileCheck %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(out: deps[2], deps[3])
-          { }
-          #pragma omp task depend(inout: deps[0])
-          { }
-          #pragma omp task depend(inout: deps[1])
-          { }
-          #pragma omp task depend(inout: deps[2])
-          { }
-          #pragma omp task depend(inout: deps[3])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1], deps[2], deps[3])
-          { }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(out : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(inout : deps[0])
+          {
+          }
+#pragma omp task depend(inout : deps[1])
+          {
+          }
+#pragma omp task depend(inout : deps[2])
+          {
+          }
+#pragma omp task depend(inout : deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1], deps[2], deps[3])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp
index ab3b42995..4d5bc191f 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_20.cpp
@@ -2,31 +2,29 @@
 
 #include <cstdio>
 
-int main()
-{
+int main() {
   int deps[3];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 10; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 10; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0])
+#pragma omp task depend(out : deps[0])
           {
             fprintf(stderr, "task 0\n");
           }
-          #pragma omp task depend(out: deps[1])
+#pragma omp task depend(out : deps[1])
           {
             fprintf(stderr, "task 1\n");
           }
-          #pragma omp task depend(out: deps[2])
+#pragma omp task depend(out : deps[2])
           {
             fprintf(stderr, "task 2\n");
           }
-          #pragma omp taskwait depend(inoutset: deps[0], deps[1])
-          #pragma omp task depend(in: deps[0], deps[1], deps[2])
+#pragma omp taskwait depend(inoutset : deps[0], deps[1])
+#pragma omp task depend(in : deps[0], deps[1], deps[2])
           {
             fprintf(stderr, "task 3\n");
           }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp
index ad36c8c5a..c6be2b0c0 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_21.cpp
@@ -2,8 +2,7 @@
 
 #include <cstdio>
 
-int main()
-{
+int main() {
   int arr[100];
 
   int res = 0;
@@ -13,16 +12,15 @@ int main()
   }
   printf("base result: %d\n", res);
 
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 10; i++)
-      {
+      for (int i = 0; i < 10; i++) {
         int res = 0;
-        #pragma omp taskgraph
+#pragma omp taskgraph
         {
-          #pragma omp taskloop reduction(+: res) num_tasks(10)
+#pragma omp taskloop reduction(+ : res) num_tasks(10)
           {
             for (int j = 0; j < 100; j++) {
               res += arr[j];
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp
index 254de8e35..38f5c6501 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_22.cpp
@@ -1,4 +1,5 @@
-// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && \
+// RUN: %libomp-run 2>&1 | FileCheck %s
 
 #include <cstdio>
 
@@ -9,8 +10,7 @@ void foo() {
   }
 }
 
-int main()
-{
+int main() {
   int arr[100];
 
   int res = 0;
@@ -20,16 +20,15 @@ int main()
   }
   fprintf(stderr, "base result: %d\n", res);
 
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 10; i++)
-      {
+      for (int i = 0; i < 10; i++) {
         int res = 0;
-        #pragma omp taskgraph
+#pragma omp taskgraph
         {
-          #pragma omp taskloop reduction(+: res) num_tasks(10)
+#pragma omp taskloop reduction(+ : res) num_tasks(10)
           {
             for (int j = 0; j < 100; j++) {
               res += arr[j];
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp
index eb6930965..4750870c2 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_23.cpp
@@ -1,4 +1,5 @@
-// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && \
+// RUN: %libomp-run 2>&1 | FileCheck %s
 
 #include <cstdio>
 
@@ -11,8 +12,7 @@ void foo() {
   }
 }
 
-int main()
-{
+int main() {
   int arr[100];
 
   int res = 0;
@@ -22,16 +22,15 @@ int main()
   }
   fprintf(stderr, "base result: %d\n", res);
 
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 10; i++)
-      {
+      for (int i = 0; i < 10; i++) {
         int res = 0;
-        #pragma omp taskgraph
+#pragma omp taskgraph
         {
-          #pragma omp taskloop reduction(+: res) num_tasks(10)
+#pragma omp taskloop reduction(+ : res) num_tasks(10)
           {
             for (int j = 0; j < 100; j++) {
               res += arr[j];
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp
index c974a0852..e03269876 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_24.cpp
@@ -1,15 +1,15 @@
-// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && \
+// RUN: env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
 
 #include <cassert>
 
 int global_dep;
 
 void foo() {
-#pragma omp taskwait replayable(1) depend(in: global_dep)
+#pragma omp taskwait replayable(1) depend(in : global_dep)
 }
 
-int main()
-{
+int main() {
   int arr[100];
 
   int res = 0;
@@ -20,23 +20,23 @@ int main()
 
   assert(res == 4950);
 
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 10; i++)
-      {
+      for (int i = 0; i < 10; i++) {
         int res = 0;
-        #pragma omp taskgraph
+#pragma omp taskgraph
         {
-          #pragma omp taskloop reduction(+: res) num_tasks(10)
+#pragma omp taskloop reduction(+ : res) num_tasks(10)
           {
             for (int j = 0; j < 100; j++) {
               res += arr[j];
             }
           }
-          #pragma omp task depend(out: global_dep)
-          { }
+#pragma omp task depend(out : global_dep)
+          {
+          }
           foo();
         }
         assert(res == 4950);
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp
index 0f2c3fbf9..f9d91f4ac 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_25.cpp
@@ -1,4 +1,5 @@
-// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && \
+// RUN: %libomp-run 2>&1 | FileCheck %s
 
 #include <cstdio>
 
@@ -6,14 +7,13 @@ int global_dep;
 
 void foo() {
   fprintf(stderr, "called function foo\n");
-#pragma omp task replayable(1) depend(in: global_dep)
+#pragma omp task replayable(1) depend(in : global_dep)
   {
     fprintf(stderr, "out-of-line task created from within taskloop\n");
   }
 }
 
-int main()
-{
+int main() {
   int arr[100];
 
   int res = 0;
@@ -23,16 +23,15 @@ int main()
   }
   fprintf(stderr, "base result: %d\n", res);
 
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 4; i++)
-      {
+      for (int i = 0; i < 4; i++) {
         int res = 0;
-        #pragma omp taskgraph
+#pragma omp taskgraph
         {
-          #pragma omp taskloop reduction(+: res) num_tasks(4)
+#pragma omp taskloop reduction(+ : res) num_tasks(4)
           {
             for (int j = 0; j < 4; j++) {
               res += arr[j];
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp
index 86c69e813..c694545b8 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_26.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && \
+// RUN: %libomp-run 2>&1 | FileCheck %s
 
 #include <cstdio>
 
-int main()
-{
+int main() {
   int arr[100];
   int arr2[100];
 
@@ -16,22 +16,21 @@ int main()
   }
   fprintf(stderr, "base results: %d, %d\n", res, res2);
 
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 10; i++)
-      {
+      for (int i = 0; i < 10; i++) {
         int res = 0, res2 = 0;
-        #pragma omp taskgraph
+#pragma omp taskgraph
         {
-          #pragma omp taskloop reduction(+: res) num_tasks(10)
+#pragma omp taskloop reduction(+ : res) num_tasks(10)
           {
             for (int j = 0; j < 10; j++) {
               res += arr[j];
             }
           }
-          #pragma omp taskloop reduction(+: res2) num_tasks(10)
+#pragma omp taskloop reduction(+ : res2) num_tasks(10)
           {
             for (int j = 0; j < 10; j++) {
               res2 += arr2[j];
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp
index 20b81b143..da9206383 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_27.cpp
@@ -1,9 +1,9 @@
-// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && %libomp-run 2>&1 | FileCheck %s
+// RUN: %clangXX %flags %openmp_flags -fopenmp-version=60 %s -o %t && \
+// RUN: %libomp-run 2>&1 | FileCheck %s
 
 #include <cstdio>
 
-int main()
-{
+int main() {
   int arr[100];
   int arr2[100];
 
@@ -18,22 +18,21 @@ int main()
   }
   fprintf(stderr, "base results: %d, %d\n", res, res2);
 
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 10; i++)
-      {
+      for (int i = 0; i < 10; i++) {
         int res = 0, res2 = 0;
-        #pragma omp taskgraph
+#pragma omp taskgraph
         {
-          #pragma omp taskloop reduction(+: res) num_tasks(10)
+#pragma omp taskloop reduction(+ : res) num_tasks(10)
           {
             for (int j = 0; j < 10; j++) {
               res += arr[j];
             }
           }
-          #pragma omp taskloop reduction(+: res2) num_tasks(10)
+#pragma omp taskloop reduction(+ : res2) num_tasks(10)
           {
             for (int j = 0; j < 10; j++) {
               res2 += res * arr2[j];
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp
index 368677269..7fadf7426 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_3.cpp
@@ -1,34 +1,42 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck
+// %s
 
-int main()
-{
+int main() {
   int deps[6];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(out: deps[2], deps[3])
-          { }
-          #pragma omp task depend(inout: deps[0])
-          { }
-          #pragma omp task depend(inout: deps[1])
-          { }
-          #pragma omp task depend(inout: deps[2])
-          { }
-          #pragma omp task depend(inout: deps[3])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1], deps[2], deps[3])
-          { }
-          #pragma omp task depend(in: deps[1], deps[2]) depend(out: deps[5])
-          { }
-          #pragma omp task depend(in: deps[5])
-          { }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(out : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(inout : deps[0])
+          {
+          }
+#pragma omp task depend(inout : deps[1])
+          {
+          }
+#pragma omp task depend(inout : deps[2])
+          {
+          }
+#pragma omp task depend(inout : deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1], deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[1], deps[2]) depend(out : deps[5])
+          {
+          }
+#pragma omp task depend(in : deps[5])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp
index a70fed484..6f12f7625 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_4.cpp
@@ -1,34 +1,42 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck
+// %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(out: deps[2], deps[3])
-          { }
-          #pragma omp task depend(inout: deps[0])
-          { }
-          #pragma omp task depend(inout: deps[1])
-          { }
-          #pragma omp task depend(inout: deps[2])
-          { }
-          #pragma omp task depend(inout: deps[3])
-          { }
-          #pragma omp task depend(in: deps[0], deps[2], deps[3])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1])
-          { }
-          #pragma omp task depend(in: deps[3])
-          { }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(out : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(inout : deps[0])
+          {
+          }
+#pragma omp task depend(inout : deps[1])
+          {
+          }
+#pragma omp task depend(inout : deps[2])
+          {
+          }
+#pragma omp task depend(inout : deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(in : deps[3])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp
index 636208245..ee2bccebe 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_5.cpp
@@ -1,32 +1,39 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck
+// %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[2])
-          { }
-          #pragma omp task depend(out: deps[1], deps[3])
-          { }
-          #pragma omp task depend(inoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(inoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(inoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(inoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1])
-          { }
-          #pragma omp task depend(in: deps[2], deps[3])
-          { }
+#pragma omp task depend(out : deps[0], deps[2])
+          {
+          }
+#pragma omp task depend(out : deps[1], deps[3])
+          {
+          }
+#pragma omp task depend(inoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(inoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(inoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(inoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(in : deps[2], deps[3])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp
index 66e872f83..4ccea3584 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_6.cpp
@@ -1,32 +1,39 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck
+// %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[1], deps[3])
-          { }
-          #pragma omp task depend(out: deps[0], deps[2])
-          { }
-          #pragma omp task depend(inoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(inoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(inoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(inoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(in: deps[0], deps[2])
-          { }
-          #pragma omp task depend(in: deps[0], deps[2])
-          { }
+#pragma omp task depend(out : deps[1], deps[3])
+          {
+          }
+#pragma omp task depend(out : deps[0], deps[2])
+          {
+          }
+#pragma omp task depend(inoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(inoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(inoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(inoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[2])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[2])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp
index c01d4a080..384206793 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_7.cpp
@@ -1,32 +1,39 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck
+// %s
 
-int main()
-{
+int main() {
   int deps[4];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(out: deps[2], deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[2], deps[3])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1])
-          { }
-          #pragma omp task depend(in: deps[2], deps[3])
-          { }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(out : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[2], deps[3])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(in : deps[2], deps[3])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp
index 179e8bb08..0dc4731cc 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_8.cpp
@@ -1,22 +1,24 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck
+// %s
 
-int main()
-{
+int main() {
   int deps[2];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1])
-          { }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1])
+          {
+          }
         }
       }
     }
diff --git a/openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp b/openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp
index 7e9af09db..fcd8019e0 100644
--- a/openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp
+++ b/openmp/runtime/test/taskgraph/taskgraph_deps_9.cpp
@@ -1,26 +1,30 @@
-// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck %s
+// RUN: %libomp-cxx-compile && env KMP_G_DEBUG=10 %libomp-run 2>&1 | FileCheck
+// %s
 
-int main()
-{
+int main() {
   int deps[3];
-  #pragma omp parallel
+#pragma omp parallel
   {
-    #pragma omp single
+#pragma omp single
     {
-      for (int i = 0; i < 2; i++)
-      {
-        #pragma omp taskgraph
+      for (int i = 0; i < 2; i++) {
+#pragma omp taskgraph
         {
-          #pragma omp task depend(out: deps[0], deps[1])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(in: deps[1]) depend(out: deps[2])
-          { }
-          #pragma omp task depend(mutexinoutset: deps[0], deps[1])
-          { }
-          #pragma omp task depend(in: deps[0], deps[1], deps[2])
-          { }
+#pragma omp task depend(out : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(in : deps[1]) depend(out : deps[2])
+          {
+          }
+#pragma omp task depend(mutexinoutset : deps[0], deps[1])
+          {
+          }
+#pragma omp task depend(in : deps[0], deps[1], deps[2])
+          {
+          }
         }
       }
     }

``````````

</details>


https://github.com/llvm/llvm-project/pull/188765


More information about the cfe-commits mailing list