[Openmp-commits] [openmp] OMP runtime support for efficient partitioning of collapsed triangular loops (PR #83939)

via Openmp-commits openmp-commits at lists.llvm.org
Mon Mar 4 17:37:41 PST 2024


https://github.com/vadikp-intel created https://github.com/llvm/llvm-project/pull/83939

This PR adds OMP runtime support for more efficient partitioning of certain types of collapsed loops that can be used by compilers that support loop collapsing (e.g. MSVC) to achieve more optimal thread load balancing.

In particular, this PR addresses double nested upper and lower isosceles triangular loops of the following types 

1. lower triangular 'less_than'
   for (int i=0; i<N; i++)
     for (int j=0; j<i; j++)
2. lower triangular 'less_than_equal'
   for (int i=0; i<N; i++)
     for (int j=0; j<=i; j++)
3. upper triangular
   for (int i=0; i<N; i++)
     for (int j=i; j<N; j++)

Includes tests for the three supported loop types.


>From 59f908fa487ca9dcd94e9baeabc710ff072be862 Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Mon, 4 Mar 2024 15:59:48 -0800
Subject: [PATCH 1/2] OpenMP runtime support for efficient partitioning of
 collapsed triangular loops

---
 openmp/runtime/src/kmp_collapse.cpp           | 262 +++++++++++++++++-
 openmp/runtime/src/kmp_collapse.h             |   7 +
 .../omp_for_collapse_LowerTriangularLess.c    | 128 +++++++++
 ...mp_for_collapse_LowerTriangularLessEqual.c | 128 +++++++++
 .../for/omp_for_collapse_UpperTriangular.c    | 128 +++++++++
 5 files changed, 652 insertions(+), 1 deletion(-)
 create mode 100644 openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c
 create mode 100644 openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c
 create mode 100644 openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c

diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp
index 2c410ca9b6030e..2148dd249f6814 100644
--- a/openmp/runtime/src/kmp_collapse.cpp
+++ b/openmp/runtime/src/kmp_collapse.cpp
@@ -13,7 +13,7 @@
 #include "kmp.h"
 #include "kmp_error.h"
 #include "kmp_i18n.h"
-#include "kmp_itt.h"
+#include "kmp_itt.h" 
 #include "kmp_stats.h"
 #include "kmp_str.h"
 #include "kmp_collapse.h"
@@ -1272,6 +1272,254 @@ void kmp_calc_original_ivs_for_end(
   }
 }
 
+/**************************************************************************
+ * Identify nested loop structure - loops come in the canonical form
+ * Lower triangle matrix: i = 0; i <= N; i++        {0,0}:{N,0}
+ *                        j = 0; j <= 0/-1+1*i; j++ {0,0}:{0/-1,1}
+ * Upper Triangle matrix
+ *                        i = 0;     i <= N; i++    {0,0}:{N,0}
+ *                        j = 0+1*i; j <= N; j++    {0,1}:{N,0}
+ * ************************************************************************/
+nested_loop_type_t
+kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
+                                   /*in*/ kmp_index_t n) {
+  // only 2-level nested loops are supported
+  if (n != 2) {
+    return nested_loop_type_unkown;
+  }
+  // loops must be canonical
+  KMP_ASSERT((original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
+        (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
+  // check outer loop bounds: for triangular need to be {0,0}:{N,0}
+  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, original_bounds_nest[0].lb0_u64);
+  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, original_bounds_nest[0].lb1_u64);
+  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, original_bounds_nest[0].ub1_u64);
+  if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
+    return nested_loop_type_unkown;
+  }
+  // check inner bounds to determine triangle type
+  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].lb0_u64);
+  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
+  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].lb1_u64);
+  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub1_u64);
+  // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
+  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 && (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
+     return nested_loop_type_lower_triangular_matrix;
+   }
+  // upper triangle loop inner bounds need to be {0,1}:{N,0}
+  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 && inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
+     return nested_loop_type_upper_triangular_matrix;
+  }
+  return nested_loop_type_unkown;
+}
+
+/**************************************************************************
+ * SQRT Approximation: https://math.mit.edu/~stevenj/18.335/newton-sqrt.pdf
+ * Start point is x so the result is always > sqrt(x)
+ * The method has uniform convergence, PRECISION is set to 0.1
+ * ************************************************************************/
+#define level_of_precision 0.1
+double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
+  double sqrt_old = 0.;
+  double sqrt_new = (double)x;
+  do {
+    sqrt_old = sqrt_new;
+    sqrt_new = (sqrt_old + x / sqrt_old) / 2;
+  } while ((sqrt_old - sqrt_new) > level_of_precision);
+  return sqrt_new;
+}
+
+/**************************************************************************
+*  Handle lower triangle matrix in the canonical form
+*  i = 0; i <= N; i++          {0,0}:{N,0}
+*  j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
+* ************************************************************************/
+void kmp_handle_lower_triangle_matrix( /*in*/ kmp_uint32 nth,
+                                       /*in*/ kmp_uint32 tid,
+                                       /*in */ kmp_index_t n,
+                                       /*in/out*/ bounds_info_t *original_bounds_nest,
+                                       /*out*/ bounds_info_t *chunk_bounds_nest) {
+  
+  // transfer loop types from the original loop to the chunks
+  for (kmp_index_t i = 0; i < n; ++i) {
+    chunk_bounds_nest[i] = original_bounds_nest[i];
+  }
+  // cleanup iv variables
+  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].lb0_u64);
+  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                    original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic progression
+  // from the outer lower to outer upper bound (inclusive since the loop is canonical)
+  // note that less_than inner loops (inner_ub0 = -1) effectively make the progression
+  // 1-based making N = (outer_ub0 - inner_lb0 + 1) -> N - 1
+  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
+  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal number of iterations: total number of iterations 
+  // divided by the number of threads plus, if there's a remainder, 
+  // the first threads with the number up to the remainder get an additional 
+  // iteration each to cover it
+  kmp_uint64 iter_current = iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with the tid below the remainder will have (iter_total/nth+1) elements,
+  // and so will all threads before them so the cumulative number of iterations 
+  // executed by the all previous will be the current thread's number of iterations 
+  // multiplied by the number of previous threads which is equal to the current thread's tid; 
+  // threads with the number equal or above the remainder will have (iter_total/nth) elements
+  // so the cumulative number of iterations previously executed is its number of iterations 
+  // multipled by the number of previous threads which is again equal to the current thread's 
+  // tid PLUS all the remainder iterations that will have been executed by the previous threads
+  kmp_uint64 iter_before_current = tid * iter_current + ((tid < iter_total % nth) ? 0: (iter_total % nth));
+  // cumulative number of iterations executed with the current thread is 
+  // the cumulative number executed before it plus its own
+  kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo) which is the max outer iv value
+  // that gives the number of iterations that is equal or just below the total number 
+  // of iterations executed by the previous threads, 
+  // for less_than (1-based) inner loops (inner_ub0 == -1) it will be
+  // i.e. lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
+  // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
+  // i.e. lbo*(lbo+1)/2<=iter_before_current => lbo^2+lbo-2*iter_before_current<=0
+  // both cases can be handled similarily using a parameter to control the equation sign 
+  kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
+  kmp_uint64 lower_bound_outer = (kmp_uint64)(sqrt_newton_approx(inner_adjustment*inner_adjustment + 8 * iter_before_current) + inner_adjustment) / 2 - inner_adjustment;
+  // calculate the inner loop lower bound which is the remaining number of iterations required to hit 
+  // the total number of iterations executed by the previous threads giving the starting point of this thread
+  kmp_uint64 lower_bound_inner = iter_before_current - ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the inner bound
+  // except using the total number of iterations executed with the current thread 
+  kmp_uint64 upper_bound_outer = (kmp_uint64)(sqrt_newton_approx(inner_adjustment*inner_adjustment + 8 * iter_with_current) + inner_adjustment) / 2 - inner_adjustment;
+  // calculate the inner loop upper bound which is the remaining number of iterations required to hit
+  // the total number of iterations executed after the current thread giving the starting point of the next thread
+  kmp_uint64 upper_bound_inner = iter_with_current - ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration of the current thread
+  // the first iteration of the next thread
+  if (upper_bound_inner == 0) {
+    // {n,0} => {n-1,n-1} 
+    upper_bound_outer -= 1;
+    upper_bound_inner = upper_bound_outer;
+  } else {
+    // {n,m} => {n,m-1} (m!=0)
+    upper_bound_inner -= 1;
+  }
+  
+  // assign the values, zeroing out lb1 and ub1 values since the iteration space is now one-dimensional
+  chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
+  chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
+  chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
+  chunk_bounds_nest[1].ub0_u64 = upper_bound_inner;
+  chunk_bounds_nest[0].lb1_u64 = 0;
+  chunk_bounds_nest[0].ub1_u64 = 0;
+  chunk_bounds_nest[1].lb1_u64 = 0;
+  chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+
+}
+
+/**************************************************************************
+*  Handle upper triangle matrix in the canonical form
+*  i = 0; i <= N; i++     {0,0}:{N,0}
+*  j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+* ************************************************************************/
+void kmp_handle_upper_triangle_matrix( /*in*/ kmp_uint32 nth,
+                                       /*in*/ kmp_uint32 tid,
+                                       /*in */ kmp_index_t n,
+                                       /*in/out*/ bounds_info_t *original_bounds_nest,
+                                       /*out*/ bounds_info_t *chunk_bounds_nest) {
+  
+  // transfer loop types from the original loop to the chunks
+  for (kmp_index_t i = 0; i < n; ++i) {
+    chunk_bounds_nest[i] = original_bounds_nest[i];
+  }
+  // cleanup iv variables
+  kmp_uint64 outer_ub0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb0 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                    original_bounds_nest[0].lb0_u64);
+  kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                    original_bounds_nest[1].ub0_u64);
+  // calculate the chunk's lower and upper bounds
+  // the total number of iterations in the loop is the sum of the arithmetic progression
+  // from the outer lower to outer upper bound (inclusive since the loop is canonical)
+  // note that less_than inner loops (inner_ub0 = -1) effectively make the progression
+  // 1-based making N = (outer_ub0 - inner_lb0 + 1) -> N - 1
+  kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
+  kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
+  // the current thread's number of iterations:
+  // each thread gets an equal number of iterations: total number of iterations 
+  // divided by the number of threads plus, if there's a remainder, 
+  // the first threads with the number up to the remainder get an additional 
+  // iteration each to cover it
+  kmp_uint64 iter_current = iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  // cumulative number of iterations executed by all the previous threads:
+  // threads with the tid below the remainder will have (iter_total/nth+1) elements,
+  // and so will all threads before them so the cumulative number of iterations 
+  // executed by the all previous will be the current thread's number of iterations 
+  // multiplied by the number of previous threads which is equal to the current thread's tid; 
+  // threads with the number equal or above the remainder will have (iter_total/nth) elements
+  // so the cumulative number of iterations previously executed is its number of iterations 
+  // multipled by the number of previous threads which is again equal to the current thread's 
+  // tid PLUS all the remainder iterations that will have been executed by the previous threads
+  kmp_uint64 iter_before_current = tid * iter_current + ((tid < iter_total % nth) ? 0: (iter_total % nth));
+  // cumulative number of iterations executed with the current thread is 
+  // the cumulative number executed before it plus its own
+  kmp_uint64 iter_with_current = iter_before_current + iter_current;
+  // calculate the outer loop lower bound (lbo) which is the max outer iv value
+  // that gives the number of iterations that is equal or just below the total number 
+  // of iterations executed by the previous threads, 
+  // for less_than (1-based) inner loops (inner_ub0 == -1) it will be
+  // i.e. lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
+  // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
+  // i.e. lbo*(lbo+1)/2<=iter_before_current => lbo^2+lbo-2*iter_before_current<=0
+  // both cases can be handled similarily using a parameter to control the equatio sign 
+  kmp_uint64 lower_bound_outer = (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;;
+  // calculate the inner loop lower bound which is the remaining number of iterations required to hit 
+  // the total number of iterations executed by the previous threads giving the starting point of this thread
+  kmp_uint64 lower_bound_inner = iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the inner bound
+  // except using the total number of iterations executed with the current thread 
+  kmp_uint64 upper_bound_outer = (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
+  // calculate the inner loop upper bound which is the remaining number of iterations required to hit
+  // the total number of iterations executed after the current thread giving the starting point of the next thread
+  kmp_uint64 upper_bound_inner = iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration of
+  // the current thread the first iteration of the next thread
+  if (upper_bound_inner == 0) {
+    // {n,0} => {n-1,n-1}
+    upper_bound_outer -= 1;
+    upper_bound_inner = upper_bound_outer;
+  } else {
+    // {n,m} => {n,m-1} (m!=0)
+    upper_bound_inner -= 1;
+  }
+  
+  // assign the values, zeroing out lb1 and ub1 values since the iteration space is now one-dimensional
+  chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
+  chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
+  chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
+  chunk_bounds_nest[1].ub0_u64 = (outer_iters - 1) - lower_bound_inner;
+  chunk_bounds_nest[0].lb1_u64 = 0;
+  chunk_bounds_nest[0].ub1_u64 = 0;
+  chunk_bounds_nest[1].lb1_u64 = 0;
+  chunk_bounds_nest[1].ub1_u64 = 0;
+
+#if 0
+  printf("tid/nth = %d/%d : From [%llu, %llu] To [%llu, %llu] : Chunks %llu/%llu\n",
+         tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
+         chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
+#endif
+
+}
 //----------Init API for non-rectangular loops--------------------------------
 
 // Init API for collapsed loops (static, no chunks defined).
@@ -1334,6 +1582,18 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
 
   KMP_DEBUG_ASSERT(tid < nth);
 
+  // Handle special cases
+  nested_loop_type_t loop_type = kmp_identify_nested_loop_structure(original_bounds_nest, n);
+  if (loop_type == nested_loop_type_lower_triangular_matrix) {
+    kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
+                                     chunk_bounds_nest);
+    return TRUE;
+  } else if (loop_type == nested_loop_type_upper_triangular_matrix) {
+    kmp_handle_upper_triangle_matrix(nth, tid, n, original_bounds_nest,
+                                     chunk_bounds_nest);
+    return TRUE;
+  }
+
   CollapseAllocator<kmp_uint64> original_ivs_start(n);
 
   if (!kmp_calc_original_ivs_for_start(original_bounds_nest, n,
diff --git a/openmp/runtime/src/kmp_collapse.h b/openmp/runtime/src/kmp_collapse.h
index e4870185645de0..2a4e14fc210074 100644
--- a/openmp/runtime/src/kmp_collapse.h
+++ b/openmp/runtime/src/kmp_collapse.h
@@ -45,6 +45,13 @@ enum loop_type_t : kmp_int32 {
   loop_type_int64 = 7
 };
 
+// Defining loop types to handle special cases
+enum nested_loop_type_t : kmp_int32 {
+  nested_loop_type_unkown = 0,
+  nested_loop_type_lower_triangular_matrix = 1,
+  nested_loop_type_upper_triangular_matrix = 2
+};
+
 /*!
  @ingroup WORK_SHARING
  * Describes the structure for rectangular nested loops.
diff --git a/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c b/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c
new file mode 100644
index 00000000000000..e2f3a9c69cc72b
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c
@@ -0,0 +1,128 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "omp.h"
+
+#ifndef MAX_BOUND
+#define MAX_BOUND 64
+#endif
+#ifndef _MSC_VER
+#define NO_EFFICIENCY_CHECK
+#endif
+
+/* To ensure Correctness, only valid iterations are executed and are executed
+   only once. Stores the number of times an iteration is executed. */
+unsigned* execution_count = NULL;
+/* Stores the number of iterations executed by each thread. */
+unsigned* iterations_per_thread = NULL;
+
+unsigned* Alloc(unsigned bound1, unsigned bound2)
+{
+  return (unsigned*)(malloc(bound1 * bound2 * sizeof(unsigned)));
+}
+
+void ZeroOut (unsigned* p, unsigned bound1, unsigned bound2) {
+  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
+}
+
+void Free(unsigned *p) {
+  free((void*)p);
+}
+
+unsigned* Index(unsigned* p, unsigned i, unsigned j, unsigned bound2) {
+    return &p[i * bound2 + j];
+}
+
+int test(unsigned upper_bound) {
+
+    unsigned total_iterations = upper_bound * (upper_bound - 1) / 2;
+    unsigned num_threads = omp_get_max_threads();
+    unsigned lower_per_chunk = total_iterations / num_threads;
+    unsigned upper_per_chunk =
+        lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+    int i, j;
+
+    omp_set_num_threads(num_threads);
+
+    ZeroOut (execution_count,upper_bound, upper_bound);
+    ZeroOut (iterations_per_thread, num_threads, 1);
+
+#ifdef VERBOSE
+    fprintf(stderr,
+        "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] chunks "
+        "loop type lower triangle <,< - ",
+        num_threads, upper_bound, total_iterations,
+            lower_per_chunk, upper_per_chunk);
+#endif
+
+#pragma omp parallel shared(iterations_per_thread, execution_count)
+    { /* begin of parallel */
+      /* Lower triangular execution_count matrix */
+#pragma omp for schedule (static) collapse(2)
+        for (i = 0; i < upper_bound; i++) {
+            for (j = 0; j < i; j++) {
+                (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+                (*Index(execution_count,i, j, upper_bound))++;
+            }
+        }/* end of for*/
+    }/* end of parallel */
+
+    /* check the execution_count array */
+    for (i = 0; i < upper_bound; i++) {
+        for (j = 0; j < i; j++) {
+            unsigned value = *Index(execution_count, i, j, upper_bound);
+            /* iteration with j<=i are valid, but should have been executed only once */
+            if (value != 1) {
+                fprintf(
+                    stderr,
+                    "ERROR: valid iteration [%i,%i] executed %i times.\n", i,
+                    j, value);
+                return 0;
+            }
+        }
+        for (j = i; j < upper_bound; j++) {
+            unsigned value = *Index(execution_count, i, j, upper_bound);
+            /* iteration with j>=i are invalid and should not have been executed
+             */
+            if (value > 0) {
+                fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n", i,
+                    j, value);
+                return 0;
+            }
+        }
+    }
+
+#ifndef NO_EFFICIENCY_CHECK
+    /* Ensure the number of iterations executed by each thread is within bounds */
+    for (i = 0; i < num_threads; i++) {
+        unsigned value = *Index(iterations_per_thread, i, 0,
+                               1);
+      if (value < lower_per_chunk || value > upper_per_chunk) {
+        fprintf(stderr,
+                "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+                "iterations; must be between %d and %d\n",
+                i, num_threads, value, lower_per_chunk, upper_per_chunk);
+            return 0;
+        }
+    }
+#endif
+#ifdef VERBOSE
+    fprintf(stderr, "PASSED\r\n");
+#endif
+    return 1;
+}
+
+int main() {
+
+    execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+    iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+
+    for (unsigned j = 0; j < MAX_BOUND; j++) {
+        if (!test(j))
+            return 1;
+    }
+    Free(execution_count);
+    Free(iterations_per_thread);
+    return 0;
+}
diff --git a/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c b/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c
new file mode 100644
index 00000000000000..22c688943f4343
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c
@@ -0,0 +1,128 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "omp.h"
+
+#ifndef MAX_BOUND
+#define MAX_BOUND 64
+#endif
+#ifndef _MSC_VER
+#define NO_EFFICIENCY_CHECK
+#endif
+
+/* To ensure Correctness, only valid iterations are executed and are executed
+   only once. Stores the number of times an iteration is executed. */
+unsigned* execution_count = NULL;
+/* Stores the number of iterations executed by each thread. */
+unsigned* iterations_per_thread = NULL;
+
+unsigned* Alloc(unsigned bound1, unsigned bound2)
+{
+  return (unsigned*)(malloc(bound1 * bound2 * sizeof(unsigned)));
+}
+
+void ZeroOut (unsigned* p, unsigned bound1, unsigned bound2) {
+  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
+}
+
+void Free(unsigned *p) {
+  free((void*)p);
+}
+
+unsigned* Index(unsigned* p, unsigned i, unsigned j, unsigned bound2) {
+    return &p[i * bound2 + j];
+}
+
+int test(int upper_bound) {
+
+    unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
+    unsigned num_threads = omp_get_max_threads();
+    unsigned lower_per_chunk = total_iterations / num_threads;
+    unsigned upper_per_chunk =
+        lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+    int i, j;
+
+    omp_set_num_threads(num_threads);
+
+    ZeroOut (execution_count,upper_bound, upper_bound);
+    ZeroOut (iterations_per_thread, num_threads, 1);
+
+#ifdef VERBOSE
+    fprintf(stderr,
+        "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] chunks "
+        "loop type lower triangle <,<= - ",
+        num_threads, upper_bound, total_iterations,
+        lower_per_chunk, upper_per_chunk);
+#endif
+
+#pragma omp parallel shared(iterations_per_thread, execution_count)
+    { /* begin of parallel */
+      /* Lower triangular execution_count matrix */
+#pragma omp for schedule (static) collapse(2)
+        for (i = 0; i < upper_bound; i++) {
+            for (j = 0; j <= i; j++) {
+                (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+                (*Index(execution_count,i, j, upper_bound))++;
+            }
+        }/* end of for*/
+    }/* end of parallel */
+
+    /* check the execution_count array */
+    for (i = 0; i < upper_bound; i++) {
+        for (j = 0; j <= i; j++) {
+            unsigned value = *Index(execution_count, i, j, upper_bound);
+            /* iteration with j<=i are valid, but should have been executed only once */
+            if (value != 1) {
+                fprintf(
+                    stderr,
+                    "ERROR: valid iteration [%i,%i] executed %i times.\n", i,
+                    j, value);
+                return 0;
+            }
+        }
+        for (j = i + 1; j < upper_bound; j++) {
+            unsigned value = *Index(execution_count, i, j, upper_bound);
+            /* iteration with j>=i are invalid and should not have been executed
+             */
+            if (value > 0) {
+                fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n", i,
+                    j, value);
+                return 0;
+            }
+        }
+    }
+
+#ifndef NO_EFFICIENCY_CHECK
+    /* Ensure the number of iterations executed by each thread is within bounds */
+    for (i = 0; i < num_threads; i++) {
+        unsigned value = *Index(iterations_per_thread, i, 0,
+                               1);
+      if (value < lower_per_chunk || value > upper_per_chunk) {
+        fprintf(stderr,
+                "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+                "iterations; must be between %d and %d\n",
+                i, num_threads, value, lower_per_chunk, upper_per_chunk);
+            return 0;
+        }
+    }
+#endif
+#ifdef VERBOSE
+    fprintf(stderr, "PASSED\r\n");
+#endif
+    return 1;
+}
+
+int main() {
+
+    execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+    iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+
+    for (unsigned j = 0; j < MAX_BOUND; j++) {
+        if (!test(j))
+            return 1;
+    }
+    Free(execution_count);
+    Free(iterations_per_thread);
+    return 0;
+}
diff --git a/openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c b/openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c
new file mode 100644
index 00000000000000..e438c61373ba4e
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c
@@ -0,0 +1,128 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "omp.h"
+
+#ifndef MAX_BOUND
+#define MAX_BOUND 64
+#endif
+#ifndef _MSC_VER
+#define NO_EFFICIENCY_CHECK
+#endif
+
+/* To ensure Correctness, only valid iterations are executed and are executed
+   only once. Stores the number of times an iteration is executed. */
+unsigned* execution_count = NULL;
+/* Stores the number of iterations executed by each thread. */
+unsigned* iterations_per_thread = NULL;
+
+unsigned* Alloc(unsigned bound1, unsigned bound2)
+{
+  return (unsigned*)(malloc(bound1 * bound2 * sizeof(unsigned)));
+}
+
+void ZeroOut (unsigned* p, unsigned bound1, unsigned bound2) {
+  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
+}
+
+void Free(unsigned *p) {
+  free((void*)p);
+}
+
+unsigned* Index(unsigned* p, unsigned i, unsigned j, unsigned bound2) {
+    return &p[i * bound2 + j];
+}
+
+int test(unsigned upper_bound) {
+
+    unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
+    unsigned num_threads = omp_get_max_threads();
+    unsigned lower_per_chunk = total_iterations / num_threads;
+    unsigned upper_per_chunk =
+        lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+    int i, j;
+
+    omp_set_num_threads(num_threads);
+
+    ZeroOut (execution_count,upper_bound, upper_bound);
+    ZeroOut (iterations_per_thread, num_threads, 1);
+
+#ifdef VERBOSE
+    fprintf(stderr,
+        "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] chunks "
+        "loop type upper triangle <,< - ",
+        num_threads, upper_bound, total_iterations,
+            lower_per_chunk, upper_per_chunk);
+#endif
+
+#pragma omp parallel shared(iterations_per_thread, execution_count)
+    { /* begin of parallel */
+      /* Lower triangular execution_count matrix */
+#pragma omp for schedule (static) collapse(2)
+        for (i = 0; i < upper_bound; i++) {
+            for (j = i; j < upper_bound; j++) {
+                (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+                (*Index(execution_count,i, j, upper_bound))++;
+            }
+        }/* end of for*/
+    }/* end of parallel */
+
+    /* check the execution_count array */
+    for (i = 0; i < upper_bound; i++) {
+        for (j = i; j < upper_bound; j++) {
+            unsigned value = *Index(execution_count, i, j, upper_bound);
+            /* iteration with j<=i are valid, but should have been executed only once */
+            if (value != 1) {
+                fprintf(
+                    stderr,
+                    "ERROR: valid iteration [%i,%i] executed %i times.\n", i,
+                    j, value);
+                return 0;
+            }
+        }
+        for (j = 0; j < i; j++) {
+            unsigned value = *Index(execution_count, i, j, upper_bound);
+            /* iteration with j>=i are invalid and should not have been executed
+             */
+            if (value > 0) {
+                fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n", i,
+                    j, value);
+                return 0;
+            }
+        }
+    }
+
+#ifndef NO_EFFICIENCY_CHECK
+    /* Ensure the number of iterations executed by each thread is within bounds */
+    for (i = 0; i < num_threads; i++) {
+        unsigned value = *Index(iterations_per_thread, i, 0,
+                               1);
+      if (value < lower_per_chunk || value > upper_per_chunk) {
+        fprintf(stderr,
+                "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+                "iterations; must be between %d and %d\n",
+                i, num_threads, value, lower_per_chunk, upper_per_chunk);
+            return 0;
+        }
+    }
+#endif
+#ifdef VERBOSE
+    fprintf(stderr,"PASSED\r\n");
+#endif
+    return 1;
+}
+
+int main() {
+
+    execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+    iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+
+    for (unsigned j = 0; j < MAX_BOUND; j++) {
+        if (!test(j))
+            return 1;
+    }
+    Free(execution_count);
+    Free(iterations_per_thread);
+    return 0;
+}

>From 6d77b654504e56932738d2b21dd5671fd41c8e83 Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Mon, 4 Mar 2024 16:07:41 -0800
Subject: [PATCH 2/2] clang format

---
 openmp/runtime/src/kmp_collapse.cpp           | 279 +++++++++++-------
 openmp/runtime/src/kmp_collapse.h             |   4 +-
 .../omp_for_collapse_LowerTriangularLess.c    | 156 +++++-----
 ...mp_for_collapse_LowerTriangularLessEqual.c | 156 +++++-----
 .../for/omp_for_collapse_UpperTriangular.c    | 156 +++++-----
 5 files changed, 395 insertions(+), 356 deletions(-)

diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp
index 2148dd249f6814..569d2c1508319d 100644
--- a/openmp/runtime/src/kmp_collapse.cpp
+++ b/openmp/runtime/src/kmp_collapse.cpp
@@ -13,7 +13,7 @@
 #include "kmp.h"
 #include "kmp_error.h"
 #include "kmp_i18n.h"
-#include "kmp_itt.h" 
+#include "kmp_itt.h"
 #include "kmp_stats.h"
 #include "kmp_str.h"
 #include "kmp_collapse.h"
@@ -1288,28 +1288,39 @@ kmp_identify_nested_loop_structure(/*in*/ bounds_info_t *original_bounds_nest,
     return nested_loop_type_unkown;
   }
   // loops must be canonical
-  KMP_ASSERT((original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
-        (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
+  KMP_ASSERT(
+      (original_bounds_nest[0].comparison == comparison_t::comp_less_or_eq) &&
+      (original_bounds_nest[1].comparison == comparison_t::comp_less_or_eq));
   // check outer loop bounds: for triangular need to be {0,0}:{N,0}
-  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, original_bounds_nest[0].lb0_u64);
-  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, original_bounds_nest[0].ub0_u64);
-  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, original_bounds_nest[0].lb1_u64);
-  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type, original_bounds_nest[0].ub1_u64);
+  kmp_uint64 outer_lb0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].lb0_u64);
+  kmp_uint64 outer_ub0_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].ub0_u64);
+  kmp_uint64 outer_lb1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].lb1_u64);
+  kmp_uint64 outer_ub1_u64 = kmp_fix_iv(original_bounds_nest[0].loop_iv_type,
+                                        original_bounds_nest[0].ub1_u64);
   if (outer_lb0_u64 != 0 || outer_lb1_u64 != 0 || outer_ub1_u64 != 0) {
     return nested_loop_type_unkown;
   }
   // check inner bounds to determine triangle type
-  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].lb0_u64);
-  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub0_u64);
-  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].lb1_u64);
-  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type, original_bounds_nest[1].ub1_u64);
+  kmp_uint64 inner_lb0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].lb0_u64);
+  kmp_uint64 inner_ub0_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].ub0_u64);
+  kmp_uint64 inner_lb1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].lb1_u64);
+  kmp_uint64 inner_ub1_u64 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
+                                        original_bounds_nest[1].ub1_u64);
   // lower triangle loop inner bounds need to be {0,0}:{0/-1,1}
-  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 && (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
-     return nested_loop_type_lower_triangular_matrix;
-   }
+  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 0 &&
+      (inner_ub0_u64 == 0 || inner_ub0_u64 == -1) && inner_ub1_u64 == 1) {
+    return nested_loop_type_lower_triangular_matrix;
+  }
   // upper triangle loop inner bounds need to be {0,1}:{N,0}
-  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 && inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
-     return nested_loop_type_upper_triangular_matrix;
+  if (inner_lb0_u64 == 0 && inner_lb1_u64 == 1 &&
+      inner_ub0_u64 == outer_ub0_u64 && inner_ub1_u64 == 0) {
+    return nested_loop_type_upper_triangular_matrix;
   }
   return nested_loop_type_unkown;
 }
@@ -1331,16 +1342,17 @@ double sqrt_newton_approx(/*in*/ kmp_uint64 x) {
 }
 
 /**************************************************************************
-*  Handle lower triangle matrix in the canonical form
-*  i = 0; i <= N; i++          {0,0}:{N,0}
-*  j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
-* ************************************************************************/
-void kmp_handle_lower_triangle_matrix( /*in*/ kmp_uint32 nth,
-                                       /*in*/ kmp_uint32 tid,
-                                       /*in */ kmp_index_t n,
-                                       /*in/out*/ bounds_info_t *original_bounds_nest,
-                                       /*out*/ bounds_info_t *chunk_bounds_nest) {
-  
+ *  Handle lower triangle matrix in the canonical form
+ *  i = 0; i <= N; i++          {0,0}:{N,0}
+ *  j = 0; j <= 0/-1 + 1*i; j++ {0,0}:{0/-1,1}
+ * ************************************************************************/
+void kmp_handle_lower_triangle_matrix(
+    /*in*/ kmp_uint32 nth,
+    /*in*/ kmp_uint32 tid,
+    /*in */ kmp_index_t n,
+    /*in/out*/ bounds_info_t *original_bounds_nest,
+    /*out*/ bounds_info_t *chunk_bounds_nest) {
+
   // transfer loop types from the original loop to the chunks
   for (kmp_index_t i = 0; i < n; ++i) {
     chunk_bounds_nest[i] = original_bounds_nest[i];
@@ -1353,62 +1365,86 @@ void kmp_handle_lower_triangle_matrix( /*in*/ kmp_uint32 nth,
   kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                     original_bounds_nest[1].ub0_u64);
   // calculate the chunk's lower and upper bounds
-  // the total number of iterations in the loop is the sum of the arithmetic progression
-  // from the outer lower to outer upper bound (inclusive since the loop is canonical)
-  // note that less_than inner loops (inner_ub0 = -1) effectively make the progression
-  // 1-based making N = (outer_ub0 - inner_lb0 + 1) -> N - 1
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to outer upper bound (inclusive since the
+  // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
+  // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
+  // + 1) -> N - 1
   kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1) + inner_ub0;
   kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
   // the current thread's number of iterations:
-  // each thread gets an equal number of iterations: total number of iterations 
-  // divided by the number of threads plus, if there's a remainder, 
-  // the first threads with the number up to the remainder get an additional 
+  // each thread gets an equal number of iterations: total number of iterations
+  // divided by the number of threads plus, if there's a remainder,
+  // the first threads with the number up to the remainder get an additional
   // iteration each to cover it
-  kmp_uint64 iter_current = iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  kmp_uint64 iter_current =
+      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
   // cumulative number of iterations executed by all the previous threads:
-  // threads with the tid below the remainder will have (iter_total/nth+1) elements,
-  // and so will all threads before them so the cumulative number of iterations 
-  // executed by the all previous will be the current thread's number of iterations 
-  // multiplied by the number of previous threads which is equal to the current thread's tid; 
-  // threads with the number equal or above the remainder will have (iter_total/nth) elements
-  // so the cumulative number of iterations previously executed is its number of iterations 
-  // multipled by the number of previous threads which is again equal to the current thread's 
-  // tid PLUS all the remainder iterations that will have been executed by the previous threads
-  kmp_uint64 iter_before_current = tid * iter_current + ((tid < iter_total % nth) ? 0: (iter_total % nth));
-  // cumulative number of iterations executed with the current thread is 
+  // threads with the tid below the remainder will have (iter_total/nth+1)
+  // elements, and so will all threads before them so the cumulative number of
+  // iterations executed by the all previous will be the current thread's number
+  // of iterations multiplied by the number of previous threads which is equal
+  // to the current thread's tid; threads with the number equal or above the
+  // remainder will have (iter_total/nth) elements so the cumulative number of
+  // iterations previously executed is its number of iterations multiplied by
+  // number of previous threads which is again equal to the current thread's tid
+  // PLUS all the remainder iterations that will have been executed by the
+  // previous threads
+  kmp_uint64 iter_before_current =
+      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+  // cumulative number of iterations executed with the current thread is
   // the cumulative number executed before it plus its own
   kmp_uint64 iter_with_current = iter_before_current + iter_current;
   // calculate the outer loop lower bound (lbo) which is the max outer iv value
-  // that gives the number of iterations that is equal or just below the total number 
-  // of iterations executed by the previous threads, 
-  // for less_than (1-based) inner loops (inner_ub0 == -1) it will be
-  // i.e. lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
+  // that gives the number of iterations that is equal or just below the total
+  // number of iterations executed by the previous threads, for less_than
+  // (1-based) inner loops (inner_ub0 == -1) it will be i.e.
+  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
   // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
-  // i.e. lbo*(lbo+1)/2<=iter_before_current => lbo^2+lbo-2*iter_before_current<=0
-  // both cases can be handled similarily using a parameter to control the equation sign 
+  // i.e. lbo*(lbo+1)/2<=iter_before_current =>
+  // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarly
+  // using a parameter to control the equation sign
   kmp_int64 inner_adjustment = 1 + 2 * inner_ub0;
-  kmp_uint64 lower_bound_outer = (kmp_uint64)(sqrt_newton_approx(inner_adjustment*inner_adjustment + 8 * iter_before_current) + inner_adjustment) / 2 - inner_adjustment;
-  // calculate the inner loop lower bound which is the remaining number of iterations required to hit 
-  // the total number of iterations executed by the previous threads giving the starting point of this thread
-  kmp_uint64 lower_bound_inner = iter_before_current - ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
-  // calculate the outer loop upper bound using the same approach as for the inner bound
-  // except using the total number of iterations executed with the current thread 
-  kmp_uint64 upper_bound_outer = (kmp_uint64)(sqrt_newton_approx(inner_adjustment*inner_adjustment + 8 * iter_with_current) + inner_adjustment) / 2 - inner_adjustment;
-  // calculate the inner loop upper bound which is the remaining number of iterations required to hit
-  // the total number of iterations executed after the current thread giving the starting point of the next thread
-  kmp_uint64 upper_bound_inner = iter_with_current - ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
-  // adjust the upper bounds down by 1 element to point at the last iteration of the current thread
-  // the first iteration of the next thread
+  kmp_uint64 lower_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+                                      8 * iter_before_current) +
+                   inner_adjustment) /
+          2 -
+      inner_adjustment;
+  // calculate the inner loop lower bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed by the
+  // previous threads giving the starting point of this thread
+  kmp_uint64 lower_bound_inner =
+      iter_before_current -
+      ((lower_bound_outer + inner_adjustment) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // inner bound except using the total number of iterations executed with the
+  // current thread
+  kmp_uint64 upper_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(inner_adjustment * inner_adjustment +
+                                      8 * iter_with_current) +
+                   inner_adjustment) /
+          2 -
+      inner_adjustment;
+  // calculate the inner loop upper bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed after
+  // the current thread giving the starting point of the next thread
+  kmp_uint64 upper_bound_inner =
+      iter_with_current -
+      ((upper_bound_outer + inner_adjustment) * upper_bound_outer) / 2;
+  // adjust the upper bounds down by 1 element to point at the last iteration of
+  // the current thread the first iteration of the next thread
   if (upper_bound_inner == 0) {
-    // {n,0} => {n-1,n-1} 
+    // {n,0} => {n-1,n-1}
     upper_bound_outer -= 1;
     upper_bound_inner = upper_bound_outer;
   } else {
     // {n,m} => {n,m-1} (m!=0)
     upper_bound_inner -= 1;
   }
-  
-  // assign the values, zeroing out lb1 and ub1 values since the iteration space is now one-dimensional
+
+  // assign the values, zeroing out lb1 and ub1 values since the iteration space
+  // is now one-dimensional
   chunk_bounds_nest[0].lb0_u64 = lower_bound_outer;
   chunk_bounds_nest[1].lb0_u64 = lower_bound_inner;
   chunk_bounds_nest[0].ub0_u64 = upper_bound_outer;
@@ -1423,20 +1459,20 @@ void kmp_handle_lower_triangle_matrix( /*in*/ kmp_uint32 nth,
          tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
          chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
 #endif
-
 }
 
 /**************************************************************************
-*  Handle upper triangle matrix in the canonical form
-*  i = 0; i <= N; i++     {0,0}:{N,0}
-*  j = 0+1*i; j <= N; j++ {0,1}:{N,0}
-* ************************************************************************/
-void kmp_handle_upper_triangle_matrix( /*in*/ kmp_uint32 nth,
-                                       /*in*/ kmp_uint32 tid,
-                                       /*in */ kmp_index_t n,
-                                       /*in/out*/ bounds_info_t *original_bounds_nest,
-                                       /*out*/ bounds_info_t *chunk_bounds_nest) {
-  
+ *  Handle upper triangle matrix in the canonical form
+ *  i = 0; i <= N; i++     {0,0}:{N,0}
+ *  j = 0+1*i; j <= N; j++ {0,1}:{N,0}
+ * ************************************************************************/
+void kmp_handle_upper_triangle_matrix(
+    /*in*/ kmp_uint32 nth,
+    /*in*/ kmp_uint32 tid,
+    /*in */ kmp_index_t n,
+    /*in/out*/ bounds_info_t *original_bounds_nest,
+    /*out*/ bounds_info_t *chunk_bounds_nest) {
+
   // transfer loop types from the original loop to the chunks
   for (kmp_index_t i = 0; i < n; ++i) {
     chunk_bounds_nest[i] = original_bounds_nest[i];
@@ -1449,49 +1485,63 @@ void kmp_handle_upper_triangle_matrix( /*in*/ kmp_uint32 nth,
   kmp_uint64 inner_ub0 = kmp_fix_iv(original_bounds_nest[1].loop_iv_type,
                                     original_bounds_nest[1].ub0_u64);
   // calculate the chunk's lower and upper bounds
-  // the total number of iterations in the loop is the sum of the arithmetic progression
-  // from the outer lower to outer upper bound (inclusive since the loop is canonical)
-  // note that less_than inner loops (inner_ub0 = -1) effectively make the progression
-  // 1-based making N = (outer_ub0 - inner_lb0 + 1) -> N - 1
+  // the total number of iterations in the loop is the sum of the arithmetic
+  // progression from the outer lower to outer upper bound (inclusive since the
+  // loop is canonical) note that less_than inner loops (inner_ub0 = -1)
+  // effectively make the progression 1-based making N = (outer_ub0 - inner_lb0
+  // + 1) -> N - 1
   kmp_uint64 outer_iters = (outer_ub0 - outer_lb0 + 1);
   kmp_uint64 iter_total = outer_iters * (outer_iters + 1) / 2;
   // the current thread's number of iterations:
-  // each thread gets an equal number of iterations: total number of iterations 
-  // divided by the number of threads plus, if there's a remainder, 
-  // the first threads with the number up to the remainder get an additional 
+  // each thread gets an equal number of iterations: total number of iterations
+  // divided by the number of threads plus, if there's a remainder,
+  // the first threads with the number up to the remainder get an additional
   // iteration each to cover it
-  kmp_uint64 iter_current = iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
+  kmp_uint64 iter_current =
+      iter_total / nth + ((tid < (iter_total % nth)) ? 1 : 0);
   // cumulative number of iterations executed by all the previous threads:
-  // threads with the tid below the remainder will have (iter_total/nth+1) elements,
-  // and so will all threads before them so the cumulative number of iterations 
-  // executed by the all previous will be the current thread's number of iterations 
-  // multiplied by the number of previous threads which is equal to the current thread's tid; 
-  // threads with the number equal or above the remainder will have (iter_total/nth) elements
-  // so the cumulative number of iterations previously executed is its number of iterations 
-  // multipled by the number of previous threads which is again equal to the current thread's 
-  // tid PLUS all the remainder iterations that will have been executed by the previous threads
-  kmp_uint64 iter_before_current = tid * iter_current + ((tid < iter_total % nth) ? 0: (iter_total % nth));
-  // cumulative number of iterations executed with the current thread is 
+  // threads with the tid below the remainder will have (iter_total/nth+1)
+  // elements, and so will all threads before them so the cumulative number of
+  // iterations executed by the all previous will be the current thread's number
+  // of iterations multiplied by the number of previous threads which is equal
+  // to the current thread's tid; threads with the number equal or above the
+  // remainder will have (iter_total/nth) elements so the cumulative number of
+  // iterations previously executed is its number of iterations multiplied by
+  // number of previous threads which is again equal to the current thread's tid
+  // PLUS all the remainder iterations that will have been executed by the
+  // previous threads
+  kmp_uint64 iter_before_current =
+      tid * iter_current + ((tid < iter_total % nth) ? 0 : (iter_total % nth));
+  // cumulative number of iterations executed with the current thread is
   // the cumulative number executed before it plus its own
   kmp_uint64 iter_with_current = iter_before_current + iter_current;
   // calculate the outer loop lower bound (lbo) which is the max outer iv value
-  // that gives the number of iterations that is equal or just below the total number 
-  // of iterations executed by the previous threads, 
-  // for less_than (1-based) inner loops (inner_ub0 == -1) it will be
-  // i.e. lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
+  // that gives the number of iterations that is equal or just below the total
+  // number of iterations executed by the previous threads, for less_than
+  // (1-based) inner loops (inner_ub0 == -1) it will be i.e.
+  // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
   // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
-  // i.e. lbo*(lbo+1)/2<=iter_before_current => lbo^2+lbo-2*iter_before_current<=0
-  // both cases can be handled similarily using a parameter to control the equatio sign 
-  kmp_uint64 lower_bound_outer = (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;;
-  // calculate the inner loop lower bound which is the remaining number of iterations required to hit 
-  // the total number of iterations executed by the previous threads giving the starting point of this thread
-  kmp_uint64 lower_bound_inner = iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
-  // calculate the outer loop upper bound using the same approach as for the inner bound
-  // except using the total number of iterations executed with the current thread 
-  kmp_uint64 upper_bound_outer = (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
-  // calculate the inner loop upper bound which is the remaining number of iterations required to hit
-  // the total number of iterations executed after the current thread giving the starting point of the next thread
-  kmp_uint64 upper_bound_inner = iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
+  // i.e. lbo*(lbo+1)/2<=iter_before_current =>
+  // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarly
+  // using a parameter to control the equation sign
+  kmp_uint64 lower_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
+  ;
+  // calculate the inner loop lower bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed by the
+  // previous threads giving the starting point of this thread
+  kmp_uint64 lower_bound_inner =
+      iter_before_current - ((lower_bound_outer + 1) * lower_bound_outer) / 2;
+  // calculate the outer loop upper bound using the same approach as for the
+  // inner bound except using the total number of iterations executed with the
+  // current thread
+  kmp_uint64 upper_bound_outer =
+      (kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_with_current) + 1) / 2 - 1;
+  // calculate the inner loop upper bound which is the remaining number of
+  // iterations required to hit the total number of iterations executed after
+  // the current thread giving the starting point of the next thread
+  kmp_uint64 upper_bound_inner =
+      iter_with_current - ((upper_bound_outer + 1) * upper_bound_outer) / 2;
   // adjust the upper bounds down by 1 element to point at the last iteration of
   // the current thread the first iteration of the next thread
   if (upper_bound_inner == 0) {
@@ -1502,8 +1552,9 @@ void kmp_handle_upper_triangle_matrix( /*in*/ kmp_uint32 nth,
     // {n,m} => {n,m-1} (m!=0)
     upper_bound_inner -= 1;
   }
-  
-  // assign the values, zeroing out lb1 and ub1 values since the iteration space is now one-dimensional
+
+  // assign the values, zeroing out lb1 and ub1 values since the iteration space
+  // is now one-dimensional
   chunk_bounds_nest[0].lb0_u64 = (outer_iters - 1) - upper_bound_outer;
   chunk_bounds_nest[1].lb0_u64 = (outer_iters - 1) - upper_bound_inner;
   chunk_bounds_nest[0].ub0_u64 = (outer_iters - 1) - lower_bound_outer;
@@ -1518,7 +1569,6 @@ void kmp_handle_upper_triangle_matrix( /*in*/ kmp_uint32 nth,
          tid, nth, chunk_bounds_nest[0].lb0_u64, chunk_bounds_nest[1].lb0_u64,
          chunk_bounds_nest[0].ub0_u64, chunk_bounds_nest[1].ub0_u64, iter_current, iter_total);
 #endif
-
 }
 //----------Init API for non-rectangular loops--------------------------------
 
@@ -1583,7 +1633,8 @@ __kmpc_for_collapsed_init(ident_t *loc, kmp_int32 gtid,
   KMP_DEBUG_ASSERT(tid < nth);
 
   // Handle special cases
-  nested_loop_type_t loop_type = kmp_identify_nested_loop_structure(original_bounds_nest, n);
+  nested_loop_type_t loop_type =
+      kmp_identify_nested_loop_structure(original_bounds_nest, n);
   if (loop_type == nested_loop_type_lower_triangular_matrix) {
     kmp_handle_lower_triangle_matrix(nth, tid, n, original_bounds_nest,
                                      chunk_bounds_nest);
diff --git a/openmp/runtime/src/kmp_collapse.h b/openmp/runtime/src/kmp_collapse.h
index 2a4e14fc210074..1044478554a022 100644
--- a/openmp/runtime/src/kmp_collapse.h
+++ b/openmp/runtime/src/kmp_collapse.h
@@ -131,14 +131,14 @@ struct bounds_info_t {
 // It's represented in kmp_uint64, but each dimention is calculated in
 // that loop IV type. Also dimentions have to be converted to those types
 // when used in generated code.
-typedef kmp_uint64* kmp_point_t;
+typedef kmp_uint64 *kmp_point_t;
 
 // Array: Number of loop iterations on each nesting level to achieve some point,
 // in expanded space or in original space.
 // OMPTODO: move from using iterations to using offsets (iterations multiplied
 // by steps). For those we need to be careful with the types, as step can be
 // negative, but it'll remove multiplications and divisions in several places.
-typedef kmp_loop_nest_iv_t* kmp_iterations_t;
+typedef kmp_loop_nest_iv_t *kmp_iterations_t;
 
 // Internal struct with additional info:
 template <typename T> struct bounds_info_internalXX_template {
diff --git a/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c b/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c
index e2f3a9c69cc72b..9d742066cf1fc2 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c
+++ b/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLess.c
@@ -13,116 +13,112 @@
 
 /* To ensure Correctness, only valid iterations are executed and are executed
    only once. Stores the number of times an iteration is executed. */
-unsigned* execution_count = NULL;
+unsigned *execution_count = NULL;
 /* Stores the number of iterations executed by each thread. */
-unsigned* iterations_per_thread = NULL;
+unsigned *iterations_per_thread = NULL;
 
-unsigned* Alloc(unsigned bound1, unsigned bound2)
-{
-  return (unsigned*)(malloc(bound1 * bound2 * sizeof(unsigned)));
+unsigned *Alloc(unsigned bound1, unsigned bound2) {
+  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
 }
 
-void ZeroOut (unsigned* p, unsigned bound1, unsigned bound2) {
+void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
   memset(p, 0, bound1 * bound2 * sizeof(unsigned));
 }
 
-void Free(unsigned *p) {
-  free((void*)p);
-}
+void Free(unsigned *p) { free((void *)p); }
 
-unsigned* Index(unsigned* p, unsigned i, unsigned j, unsigned bound2) {
-    return &p[i * bound2 + j];
+unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
+  return &p[i * bound2 + j];
 }
 
 int test(unsigned upper_bound) {
 
-    unsigned total_iterations = upper_bound * (upper_bound - 1) / 2;
-    unsigned num_threads = omp_get_max_threads();
-    unsigned lower_per_chunk = total_iterations / num_threads;
-    unsigned upper_per_chunk =
-        lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
-    int i, j;
+  unsigned total_iterations = upper_bound * (upper_bound - 1) / 2;
+  unsigned num_threads = omp_get_max_threads();
+  unsigned lower_per_chunk = total_iterations / num_threads;
+  unsigned upper_per_chunk =
+      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+  int i, j;
 
-    omp_set_num_threads(num_threads);
+  omp_set_num_threads(num_threads);
 
-    ZeroOut (execution_count,upper_bound, upper_bound);
-    ZeroOut (iterations_per_thread, num_threads, 1);
+  ZeroOut(execution_count, upper_bound, upper_bound);
+  ZeroOut(iterations_per_thread, num_threads, 1);
 
 #ifdef VERBOSE
-    fprintf(stderr,
-        "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] chunks "
-        "loop type lower triangle <,< - ",
-        num_threads, upper_bound, total_iterations,
-            lower_per_chunk, upper_per_chunk);
+  fprintf(stderr,
+          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
+          "chunks "
+          "loop type lower triangle <,< - ",
+          num_threads, upper_bound, total_iterations, lower_per_chunk,
+          upper_per_chunk);
 #endif
 
 #pragma omp parallel shared(iterations_per_thread, execution_count)
-    { /* begin of parallel */
-      /* Lower triangular execution_count matrix */
-#pragma omp for schedule (static) collapse(2)
-        for (i = 0; i < upper_bound; i++) {
-            for (j = 0; j < i; j++) {
-                (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
-                (*Index(execution_count,i, j, upper_bound))++;
-            }
-        }/* end of for*/
-    }/* end of parallel */
-
-    /* check the execution_count array */
+  { /* begin of parallel */
+    /* Lower triangular execution_count matrix */
+#pragma omp for schedule(static) collapse(2)
     for (i = 0; i < upper_bound; i++) {
-        for (j = 0; j < i; j++) {
-            unsigned value = *Index(execution_count, i, j, upper_bound);
-            /* iteration with j<=i are valid, but should have been executed only once */
-            if (value != 1) {
-                fprintf(
-                    stderr,
-                    "ERROR: valid iteration [%i,%i] executed %i times.\n", i,
-                    j, value);
-                return 0;
-            }
-        }
-        for (j = i; j < upper_bound; j++) {
-            unsigned value = *Index(execution_count, i, j, upper_bound);
-            /* iteration with j>=i are invalid and should not have been executed
-             */
-            if (value > 0) {
-                fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n", i,
-                    j, value);
-                return 0;
-            }
-        }
+      for (j = 0; j < i; j++) {
+        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+        (*Index(execution_count, i, j, upper_bound))++;
+      }
+    } /* end of for*/
+  } /* end of parallel */
+
+  /* check the execution_count array */
+  for (i = 0; i < upper_bound; i++) {
+    for (j = 0; j < i; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iterations with j<i are valid, but should have been executed only once
+       */
+      if (value != 1) {
+        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+    for (j = i; j < upper_bound; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iterations with j>=i are invalid and should not have been executed
+       */
+      if (value > 0) {
+        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
     }
+  }
 
 #ifndef NO_EFFICIENCY_CHECK
-    /* Ensure the number of iterations executed by each thread is within bounds */
-    for (i = 0; i < num_threads; i++) {
-        unsigned value = *Index(iterations_per_thread, i, 0,
-                               1);
-      if (value < lower_per_chunk || value > upper_per_chunk) {
-        fprintf(stderr,
-                "ERROR: Inefficient Collapse thread %d of %d assigned %i "
-                "iterations; must be between %d and %d\n",
-                i, num_threads, value, lower_per_chunk, upper_per_chunk);
-            return 0;
-        }
+  /* Ensure the number of iterations executed by each thread is within bounds */
+  for (i = 0; i < num_threads; i++) {
+    unsigned value = *Index(iterations_per_thread, i, 0, 1);
+    if (value < lower_per_chunk || value > upper_per_chunk) {
+      fprintf(stderr,
+              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+              "iterations; must be between %d and %d\n",
+              i, num_threads, value, lower_per_chunk, upper_per_chunk);
+      return 0;
     }
+  }
 #endif
 #ifdef VERBOSE
-    fprintf(stderr, "PASSED\r\n");
+  fprintf(stderr, "PASSED\r\n");
 #endif
-    return 1;
+  return 1;
 }
 
 int main() {
 
-    execution_count = Alloc(MAX_BOUND, MAX_BOUND);
-    iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+  iterations_per_thread = Alloc(omp_get_max_threads(), 1);
 
-    for (unsigned j = 0; j < MAX_BOUND; j++) {
-        if (!test(j))
-            return 1;
-    }
-    Free(execution_count);
-    Free(iterations_per_thread);
-    return 0;
+  for (unsigned j = 0; j < MAX_BOUND; j++) {
+    if (!test(j))
+      return 1;
+  }
+  Free(execution_count);
+  Free(iterations_per_thread);
+  return 0;
 }
diff --git a/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c b/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c
index 22c688943f4343..154ee0f69daa56 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c
+++ b/openmp/runtime/test/worksharing/for/omp_for_collapse_LowerTriangularLessEqual.c
@@ -13,116 +13,112 @@
 
 /* To ensure Correctness, only valid iterations are executed and are executed
    only once. Stores the number of times an iteration is executed. */
-unsigned* execution_count = NULL;
+unsigned *execution_count = NULL;
 /* Stores the number of iterations executed by each thread. */
-unsigned* iterations_per_thread = NULL;
+unsigned *iterations_per_thread = NULL;
 
-unsigned* Alloc(unsigned bound1, unsigned bound2)
-{
-  return (unsigned*)(malloc(bound1 * bound2 * sizeof(unsigned)));
+unsigned *Alloc(unsigned bound1, unsigned bound2) {
+  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
 }
 
-void ZeroOut (unsigned* p, unsigned bound1, unsigned bound2) {
+void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
   memset(p, 0, bound1 * bound2 * sizeof(unsigned));
 }
 
-void Free(unsigned *p) {
-  free((void*)p);
-}
+void Free(unsigned *p) { free((void *)p); }
 
-unsigned* Index(unsigned* p, unsigned i, unsigned j, unsigned bound2) {
-    return &p[i * bound2 + j];
+unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
+  return &p[i * bound2 + j];
 }
 
 int test(int upper_bound) {
 
-    unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
-    unsigned num_threads = omp_get_max_threads();
-    unsigned lower_per_chunk = total_iterations / num_threads;
-    unsigned upper_per_chunk =
-        lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
-    int i, j;
+  unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
+  unsigned num_threads = omp_get_max_threads();
+  unsigned lower_per_chunk = total_iterations / num_threads;
+  unsigned upper_per_chunk =
+      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+  int i, j;
 
-    omp_set_num_threads(num_threads);
+  omp_set_num_threads(num_threads);
 
-    ZeroOut (execution_count,upper_bound, upper_bound);
-    ZeroOut (iterations_per_thread, num_threads, 1);
+  ZeroOut(execution_count, upper_bound, upper_bound);
+  ZeroOut(iterations_per_thread, num_threads, 1);
 
 #ifdef VERBOSE
-    fprintf(stderr,
-        "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] chunks "
-        "loop type lower triangle <,<= - ",
-        num_threads, upper_bound, total_iterations,
-        lower_per_chunk, upper_per_chunk);
+  fprintf(stderr,
+          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
+          "chunks "
+          "loop type lower triangle <,<= - ",
+          num_threads, upper_bound, total_iterations, lower_per_chunk,
+          upper_per_chunk);
 #endif
 
 #pragma omp parallel shared(iterations_per_thread, execution_count)
-    { /* begin of parallel */
-      /* Lower triangular execution_count matrix */
-#pragma omp for schedule (static) collapse(2)
-        for (i = 0; i < upper_bound; i++) {
-            for (j = 0; j <= i; j++) {
-                (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
-                (*Index(execution_count,i, j, upper_bound))++;
-            }
-        }/* end of for*/
-    }/* end of parallel */
-
-    /* check the execution_count array */
+  { /* begin of parallel */
+    /* Lower triangular execution_count matrix */
+#pragma omp for schedule(static) collapse(2)
     for (i = 0; i < upper_bound; i++) {
-        for (j = 0; j <= i; j++) {
-            unsigned value = *Index(execution_count, i, j, upper_bound);
-            /* iteration with j<=i are valid, but should have been executed only once */
-            if (value != 1) {
-                fprintf(
-                    stderr,
-                    "ERROR: valid iteration [%i,%i] executed %i times.\n", i,
-                    j, value);
-                return 0;
-            }
-        }
-        for (j = i + 1; j < upper_bound; j++) {
-            unsigned value = *Index(execution_count, i, j, upper_bound);
-            /* iteration with j>=i are invalid and should not have been executed
-             */
-            if (value > 0) {
-                fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n", i,
-                    j, value);
-                return 0;
-            }
-        }
+      for (j = 0; j <= i; j++) {
+        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+        (*Index(execution_count, i, j, upper_bound))++;
+      }
+    } /* end of for*/
+  } /* end of parallel */
+
+  /* check the execution_count array */
+  for (i = 0; i < upper_bound; i++) {
+    for (j = 0; j <= i; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iterations with j<=i are valid, but should have been executed only
+         once */
+      if (value != 1) {
+        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+    for (j = i + 1; j < upper_bound; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iterations with j>i are invalid and should not have been executed
+       */
+      if (value > 0) {
+        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
     }
+  }
 
 #ifndef NO_EFFICIENCY_CHECK
-    /* Ensure the number of iterations executed by each thread is within bounds */
-    for (i = 0; i < num_threads; i++) {
-        unsigned value = *Index(iterations_per_thread, i, 0,
-                               1);
-      if (value < lower_per_chunk || value > upper_per_chunk) {
-        fprintf(stderr,
-                "ERROR: Inefficient Collapse thread %d of %d assigned %i "
-                "iterations; must be between %d and %d\n",
-                i, num_threads, value, lower_per_chunk, upper_per_chunk);
-            return 0;
-        }
+  /* Ensure the number of iterations executed by each thread is within bounds */
+  for (i = 0; i < num_threads; i++) {
+    unsigned value = *Index(iterations_per_thread, i, 0, 1);
+    if (value < lower_per_chunk || value > upper_per_chunk) {
+      fprintf(stderr,
+              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+              "iterations; must be between %d and %d\n",
+              i, num_threads, value, lower_per_chunk, upper_per_chunk);
+      return 0;
     }
+  }
 #endif
 #ifdef VERBOSE
-    fprintf(stderr, "PASSED\r\n");
+  fprintf(stderr, "PASSED\r\n");
 #endif
-    return 1;
+  return 1;
 }
 
 int main() {
 
-    execution_count = Alloc(MAX_BOUND, MAX_BOUND);
-    iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+  iterations_per_thread = Alloc(omp_get_max_threads(), 1);
 
-    for (unsigned j = 0; j < MAX_BOUND; j++) {
-        if (!test(j))
-            return 1;
-    }
-    Free(execution_count);
-    Free(iterations_per_thread);
-    return 0;
+  for (unsigned j = 0; j < MAX_BOUND; j++) {
+    if (!test(j))
+      return 1;
+  }
+  Free(execution_count);
+  Free(iterations_per_thread);
+  return 0;
 }
diff --git a/openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c b/openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c
index e438c61373ba4e..452410025be0c9 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c
+++ b/openmp/runtime/test/worksharing/for/omp_for_collapse_UpperTriangular.c
@@ -13,116 +13,112 @@
 
 /* To ensure Correctness, only valid iterations are executed and are executed
    only once. Stores the number of times an iteration is executed. */
-unsigned* execution_count = NULL;
+unsigned *execution_count = NULL;
 /* Stores the number of iterations executed by each thread. */
-unsigned* iterations_per_thread = NULL;
+unsigned *iterations_per_thread = NULL;
 
-unsigned* Alloc(unsigned bound1, unsigned bound2)
-{
-  return (unsigned*)(malloc(bound1 * bound2 * sizeof(unsigned)));
+unsigned *Alloc(unsigned bound1, unsigned bound2) {
+  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
 }
 
-void ZeroOut (unsigned* p, unsigned bound1, unsigned bound2) {
+void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
   memset(p, 0, bound1 * bound2 * sizeof(unsigned));
 }
 
-void Free(unsigned *p) {
-  free((void*)p);
-}
+void Free(unsigned *p) { free((void *)p); }
 
-unsigned* Index(unsigned* p, unsigned i, unsigned j, unsigned bound2) {
-    return &p[i * bound2 + j];
+unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
+  return &p[i * bound2 + j];
 }
 
 int test(unsigned upper_bound) {
 
-    unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
-    unsigned num_threads = omp_get_max_threads();
-    unsigned lower_per_chunk = total_iterations / num_threads;
-    unsigned upper_per_chunk =
-        lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
-    int i, j;
+  unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
+  unsigned num_threads = omp_get_max_threads();
+  unsigned lower_per_chunk = total_iterations / num_threads;
+  unsigned upper_per_chunk =
+      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
+  int i, j;
 
-    omp_set_num_threads(num_threads);
+  omp_set_num_threads(num_threads);
 
-    ZeroOut (execution_count,upper_bound, upper_bound);
-    ZeroOut (iterations_per_thread, num_threads, 1);
+  ZeroOut(execution_count, upper_bound, upper_bound);
+  ZeroOut(iterations_per_thread, num_threads, 1);
 
 #ifdef VERBOSE
-    fprintf(stderr,
-        "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] chunks "
-        "loop type upper triangle <,< - ",
-        num_threads, upper_bound, total_iterations,
-            lower_per_chunk, upper_per_chunk);
+  fprintf(stderr,
+          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
+          "chunks "
+          "loop type upper triangle <,< - ",
+          num_threads, upper_bound, total_iterations, lower_per_chunk,
+          upper_per_chunk);
 #endif
 
 #pragma omp parallel shared(iterations_per_thread, execution_count)
-    { /* begin of parallel */
-      /* Lower triangular execution_count matrix */
-#pragma omp for schedule (static) collapse(2)
-        for (i = 0; i < upper_bound; i++) {
-            for (j = i; j < upper_bound; j++) {
-                (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
-                (*Index(execution_count,i, j, upper_bound))++;
-            }
-        }/* end of for*/
-    }/* end of parallel */
-
-    /* check the execution_count array */
+  { /* begin of parallel */
+    /* Upper triangular execution_count matrix */
+#pragma omp for schedule(static) collapse(2)
     for (i = 0; i < upper_bound; i++) {
-        for (j = i; j < upper_bound; j++) {
-            unsigned value = *Index(execution_count, i, j, upper_bound);
-            /* iteration with j<=i are valid, but should have been executed only once */
-            if (value != 1) {
-                fprintf(
-                    stderr,
-                    "ERROR: valid iteration [%i,%i] executed %i times.\n", i,
-                    j, value);
-                return 0;
-            }
-        }
-        for (j = 0; j < i; j++) {
-            unsigned value = *Index(execution_count, i, j, upper_bound);
-            /* iteration with j>=i are invalid and should not have been executed
-             */
-            if (value > 0) {
-                fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n", i,
-                    j, value);
-                return 0;
-            }
-        }
+      for (j = i; j < upper_bound; j++) {
+        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
+        (*Index(execution_count, i, j, upper_bound))++;
+      }
+    } /* end of for*/
+  } /* end of parallel */
+
+  /* check the execution_count array */
+  for (i = 0; i < upper_bound; i++) {
+    for (j = i; j < upper_bound; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iterations with j>=i are valid, but should have been executed only
+         once */
+      if (value != 1) {
+        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
+    }
+    for (j = 0; j < i; j++) {
+      unsigned value = *Index(execution_count, i, j, upper_bound);
+      /* iterations with j<i are invalid and should not have been executed
+       */
+      if (value > 0) {
+        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
+                i, j, value);
+        return 0;
+      }
     }
+  }
 
 #ifndef NO_EFFICIENCY_CHECK
-    /* Ensure the number of iterations executed by each thread is within bounds */
-    for (i = 0; i < num_threads; i++) {
-        unsigned value = *Index(iterations_per_thread, i, 0,
-                               1);
-      if (value < lower_per_chunk || value > upper_per_chunk) {
-        fprintf(stderr,
-                "ERROR: Inefficient Collapse thread %d of %d assigned %i "
-                "iterations; must be between %d and %d\n",
-                i, num_threads, value, lower_per_chunk, upper_per_chunk);
-            return 0;
-        }
+  /* Ensure the number of iterations executed by each thread is within bounds */
+  for (i = 0; i < num_threads; i++) {
+    unsigned value = *Index(iterations_per_thread, i, 0, 1);
+    if (value < lower_per_chunk || value > upper_per_chunk) {
+      fprintf(stderr,
+              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
+              "iterations; must be between %d and %d\n",
+              i, num_threads, value, lower_per_chunk, upper_per_chunk);
+      return 0;
     }
+  }
 #endif
 #ifdef VERBOSE
-    fprintf(stderr,"PASSED\r\n");
+  fprintf(stderr, "PASSED\r\n");
 #endif
-    return 1;
+  return 1;
 }
 
 int main() {
 
-    execution_count = Alloc(MAX_BOUND, MAX_BOUND);
-    iterations_per_thread = Alloc(omp_get_max_threads(), 1);
+  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
+  iterations_per_thread = Alloc(omp_get_max_threads(), 1);
 
-    for (unsigned j = 0; j < MAX_BOUND; j++) {
-        if (!test(j))
-            return 1;
-    }
-    Free(execution_count);
-    Free(iterations_per_thread);
-    return 0;
+  for (unsigned j = 0; j < MAX_BOUND; j++) {
+    if (!test(j))
+      return 1;
+  }
+  Free(execution_count);
+  Free(iterations_per_thread);
+  return 0;
 }



More information about the Openmp-commits mailing list