[Openmp-commits] [openmp] [OpenMP] add loop collapse tests (PR #86243)
Vadim Paretsky via Openmp-commits
openmp-commits at lists.llvm.org
Thu Mar 21 21:40:09 PDT 2024
https://github.com/vadikp-intel updated https://github.com/llvm/llvm-project/pull/86243
>From 2b7d3459db79f6882891e6714550d0d192fe0e7d Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Fri, 15 Mar 2024 17:38:36 -0700
Subject: [PATCH 1/6] rectangular loop collapse tests
---
.../for/collapse_many_GELTGT_int.c | 54 +++++++++++++++++
.../for/collapse_many_GTGEGT_int.c | 60 +++++++++++++++++++
.../for/collapse_many_LTLEGE_int.c | 55 +++++++++++++++++
.../test/worksharing/for/collapse_many_int.c | 52 ++++++++++++++++
.../test/worksharing/for/collapse_one_int.c | 25 ++++++++
5 files changed, 246 insertions(+)
create mode 100644 openmp/runtime/test/worksharing/for/collapse_many_GELTGT_int.c
create mode 100644 openmp/runtime/test/worksharing/for/collapse_many_GTGEGT_int.c
create mode 100644 openmp/runtime/test/worksharing/for/collapse_many_LTLEGE_int.c
create mode 100644 openmp/runtime/test/worksharing/for/collapse_many_int.c
create mode 100644 openmp/runtime/test/worksharing/for/collapse_one_int.c
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_GELTGT_int.c b/openmp/runtime/test/worksharing/for/collapse_many_GELTGT_int.c
new file mode 100644
index 00000000000000..23808244db4475
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/collapse_many_GELTGT_int.c
@@ -0,0 +1,54 @@
+// RUN: %libomp-compile-and-run
+
+// Non-rectangular loop collapsing.
+//
+// Nested loops conform to OpenMP 5.2 standard,
+// inner loops bounds may depend on outer loops induction variables.
+
+#define LOOP_TYPES int
+#define COMPARE0 >=
+#define COMPARE1 <
+#define COMPARE2 >
+#define LOOP \
+ for (i = iLB; i COMPARE0 iUB; i += iStep) \
+ for (j = jA0; j COMPARE1 jB0; j += jStep) \
+ for (k = kA0; k COMPARE2 kB0; k += kStep)
+#include "collapse_test.inc"
+
+int main() {
+ int fail;
+
+ iLB = 3; iUB = -2; jA0 = -3; jA1 = 0; jB0 = -6; jB1 = 0; kA0 = -2; kA1 = 0; kB0 = -4; kB1 = 0; iStep = -1; jStep = -1; kStep = -4;
+ PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
+ "kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
+ fail = (test() == 0);
+
+ if (!fail) {
+ for (iStep = -3; iStep >= -6; iStep -= 2) {
+ for (jA0 = -6; jA0 <= 6; jA0 += 3) {
+ for (jB0 = -3; jB0 <= 10; jB0 += 3) {
+ for (jStep = 1; jStep <= 10; jStep += 2) {
+ for (kA0 = -2; kA0 <= 4; ++kA0) {
+ for (kB0 = -4; kB0 <= 2; ++kB0) {
+ for (kStep = -2; kStep >= -10; kStep -= 4) {
+ {
+ PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; "
+ "jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; "
+ "jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1,
+ iStep, jStep, kStep);
+ fail = fail || (test() == 0);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+return fail;
+}
+
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_GTGEGT_int.c b/openmp/runtime/test/worksharing/for/collapse_many_GTGEGT_int.c
new file mode 100644
index 00000000000000..9a10b5d01895cd
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/collapse_many_GTGEGT_int.c
@@ -0,0 +1,60 @@
+// RUN: %libomp-compile-and-run
+
+// Non-rectangular loop collapsing.
+//
+// Nested loops conform to OpenMP 5.2 standard,
+// inner loops bounds may depend on outer loops induction variables.
+
+#define LOOP_TYPES int
+#define COMPARE0 >
+#define COMPARE1 >=
+#define COMPARE2 >
+
+#define DLOOP_GT0
+#define DLOOP_GE1
+#define DLOOP_GT2
+
+#define LOOP \
+ for (i = iLB; i COMPARE0 iUB; i += iStep) \
+ for (j = jA0; j COMPARE1 jB0; j += jStep) \
+ for (k = kA0; k COMPARE2 kB0; k += kStep)
+#include "collapse_test.inc"
+
+int main() {
+ int fail;
+
+ iLB = 3; iUB = -2; jA0 = -3; jA1 = 0; jB0 = -6; jB1 = 0; kA0 = -2; kA1 = 0; kB0 = -4; kB1 = 0; iStep = -1; jStep = -1; kStep = -4;
+ PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
+ "kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
+ fail = (test() == 0);
+
+ if (!fail) {
+
+ for (iStep = -3; iStep >= -6; iStep -= 2) {
+ for (jA0 = -3; jA0 <= 10; jA0 += 3) {
+ for (jB0 = -6; jB0 <= 6; jB0 += 3) {
+ for (jStep = -1; jStep >= -10; jStep -= 2) {
+ for (kA0 = -2; kA0 <= 4; ++kA0) {
+ for (kB0 = -4; kB0 <= 2; ++kB0) {
+ for (kStep = -2; kStep >= -10; kStep -= 4) {
+ {
+ PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; "
+ "jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; "
+ "jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1,
+ iStep, jStep, kStep);
+ fail = fail || (test() == 0);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+return fail;
+}
+
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_LTLEGE_int.c b/openmp/runtime/test/worksharing/for/collapse_many_LTLEGE_int.c
new file mode 100644
index 00000000000000..171b02daf8b24f
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/collapse_many_LTLEGE_int.c
@@ -0,0 +1,55 @@
+// RUN: %libomp-compile-and-run
+
+// Non-rectangular loop collapsing.
+//
+// Nested loops conform to OpenMP 5.2 standard,
+// inner loops bounds may depend on outer loops induction variables.
+
+#define LOOP_TYPES int
+#define COMPARE0 <
+#define COMPARE1 <=
+#define COMPARE2 >=
+#define LOOP \
+ for (i = iLB; i COMPARE0 iUB; i += iStep) \
+ for (j = jA0; j COMPARE1 jB0; j += jStep) \
+ for (k = kA0; k COMPARE2 kB0; k += kStep)
+#include "collapse_test.inc"
+
+int main() {
+ int fail;
+
+ iLB = -2; iUB = 3; jA0 = -3; jA1 = 0; jB0 = -6; jB1 = 0; kA0 = -2; kA1 = 0; kB0 = -4; kB1 = 0; iStep = -1; jStep = -1; kStep = -4;
+ PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
+ "kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
+ fail = (test() == 0);
+
+ if (!fail) {
+
+ for (iStep = 2; iStep <= 6; iStep += 2) {
+ for (jA0 = -6; jA0 <= 6; jA0 += 3) {
+ for (jB0 = -3; jB0 <= 10; jB0 += 3) {
+ for (jStep = 1; jStep <= 10; jStep += 2) {
+ for (kA0 = -2; kA0 <= 4; ++kA0) {
+ for (kB0 = -4; kB0 <= 2; ++kB0) {
+ for (kStep = -2; kStep >= -10; kStep -= 4) {
+ {
+ PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; "
+ "jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; "
+ "jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1,
+ iStep, jStep, kStep);
+ fail = fail || (test() == 0);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+return fail;
+}
+
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_int.c b/openmp/runtime/test/worksharing/for/collapse_many_int.c
new file mode 100644
index 00000000000000..d834c5014ed5df
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/collapse_many_int.c
@@ -0,0 +1,52 @@
+// RUN: %libomp-compile-and-run
+
+// Non-rectangular loop collapsing.
+//
+// Nested loops conform to OpenMP 5.2 standard,
+// inner loops bounds may depend on outer loops induction variables.
+
+#define LOOP_TYPES int
+#define LOOP \
+ for (i = iLB; i <= iUB; i += iStep) \
+ for (j = i * jA1 + jA0; j <= i * jB1 + jB0; j += jStep) \
+ for (k = j * kA1 + kA0; k <= j * kB1 + kB0; k += kStep)
+#include "collapse_test.inc"
+
+int main()
+{
+ int fail = 0;
+
+ iLB = -2; iUB = 3; jA0 = -7; jA1 = -1; jB0 = 13; jB1 = 3; kA0 = -20; kA1 = -2; kB0 = 111; kB1 = -1; iStep = 5; jStep = 9; kStep = 10;
+ PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
+ fail = fail || (test()==0);
+
+ if (!fail) {
+
+ // NOTE: if a loop on some level won't execute for all iterations of an outer loop, it still should work.
+ // Runtime doesn't require lower bounds to be <= upper bounds for all possible i, j, k.
+
+ iLB = -2; iUB = 3; jA0 = -7; jB0 = 5; kA0 = -13; kB0 = 37;
+
+ for (kA1 = -2; kA1 <= 2; ++kA1) { // <=
+ for (kB1 = -2; kB1 <= 2; ++kB1) {
+ for (jA1 = -3; jA1 <= 3; ++jA1) {
+ for (jB1 = -3; jB1 <= 3; ++jB1) {
+ for (iStep = 1; iStep <= 3; ++iStep) {
+ for (jStep = 2; jStep <= 6; jStep += 2) {
+ for (kStep = 2; kStep <= 8; kStep += 3) {
+ PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
+ fail = fail || (test() == 0);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return fail;
+}
+
diff --git a/openmp/runtime/test/worksharing/for/collapse_one_int.c b/openmp/runtime/test/worksharing/for/collapse_one_int.c
new file mode 100644
index 00000000000000..122f9a5a87b160
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/collapse_one_int.c
@@ -0,0 +1,25 @@
+// RUN: %libomp-compile-and-run
+
+// Non-rectangular loop collapsing.
+//
+// Nested loops conform to OpenMP 5.2 standard,
+// inner loops bounds may depend on outer loops induction variables.
+
+#define LOOP_TYPES int
+#define LOOP \
+ for (i = iLB; i <= iUB; i += iStep) \
+ for (j = i + jA0; j <= i + jB0; j += jStep) \
+ for (k = j + kA0; k <= j + kB0; k += kStep)
+
+#include "collapse_test.inc"
+
+int main()
+{
+ int fail;
+ iLB = -2; iUB = 3; jA0 = -7; jB0 = 13; kA0 = -20; kB0 = 111; iStep = 5; jStep = 9; kStep = 10;
+ PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jB0=%d; kA0=%d; kB0=%d; iStep=%d; jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jB0, kA0, kB0, iStep, jStep, kStep);
+ fail = (test() == 0);
+ return fail;
+}
+
>From 68bfd9648c9af499ede6d74290ad2ea5f76c2ec0 Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Thu, 21 Mar 2024 21:01:09 -0700
Subject: [PATCH 2/6] missing file
---
.../test/worksharing/for/collapse_test.inc | 204 ++++++++++++++++++
1 file changed, 204 insertions(+)
create mode 100644 openmp/runtime/test/worksharing/for/collapse_test.inc
diff --git a/openmp/runtime/test/worksharing/for/collapse_test.inc b/openmp/runtime/test/worksharing/for/collapse_test.inc
new file mode 100644
index 00000000000000..8c88dc9860626e
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/collapse_test.inc
@@ -0,0 +1,204 @@
+#include <omp.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <memory.h>
+
+#define LOOP_IV_TYPE0 LOOP_TYPES
+#define LOOP_TYPE0 LOOP_TYPES
+#define LOOP_STYPE0 LOOP_TYPES
+
+#define LOOP_IV_TYPE1 LOOP_TYPES
+#define LOOP_TYPE1 LOOP_TYPES
+#define LOOP_STYPE1 LOOP_TYPES
+
+#define LOOP_IV_TYPE2 LOOP_TYPES
+#define LOOP_TYPE2 LOOP_TYPES
+#define LOOP_STYPE2 LOOP_TYPES
+
+#define MAX_THREADS 256
+
+#if defined VERBOSE
+#define PRINTF printf
+#else
+#define PRINTF
+#endif
+
+LOOP_TYPE0 iLB, iUB;
+LOOP_TYPE1 jA0, jB0;
+LOOP_TYPE2 kA0, kB0;
+
+LOOP_STYPE0 iStep;
+LOOP_STYPE1 jA1, jB1, jStep;
+LOOP_STYPE2 kA1, kB1, kStep;
+
+// We can check <=, <, >=, > (!= has different pattern)
+// Additional definition of LOOP_LEi, LOOP_LTi, etc. is helpful to build calls
+// of the test from main
+
+#if defined LOOP_LE0
+#define COMPARE0 <=
+#elif defined LOOP_LT0
+#define COMPARE0 <
+#elif defined LOOP_GE0
+#define COMPARE0 >=
+#elif defined LOOP_GT0
+#define COMPARE0 >
+#endif
+
+#if defined LOOP_LE1
+#define COMPARE1 <=
+#elif defined LOOP_LT1
+#define COMPARE1 <
+#elif defined LOOP_GE1
+#define COMPARE1 >=
+#elif defined LOOP_GT1
+#define COMPARE1 >
+#endif
+
+#if defined LOOP_LE2
+#define COMPARE2 <=
+#elif defined LOOP_LT2
+#define COMPARE2 <
+#elif defined LOOP_GE2
+#define COMPARE2 >=
+#elif defined LOOP_GT2
+#define COMPARE2 >
+#endif
+
+
+typedef struct
+{
+ LOOP_IV_TYPE0 i;
+ LOOP_IV_TYPE1 j;
+ LOOP_IV_TYPE2 k;
+} spaceType;
+
+spaceType* AllocSpace(unsigned size)
+{
+
+ spaceType *p = (spaceType*) malloc(size * sizeof(spaceType));
+ memset(p, 0, size * sizeof(spaceType));
+ return p;
+}
+
+void FreeSpace(spaceType* space)
+{
+ free(space);
+}
+
+// record an iteration
+void Set(spaceType* space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i, LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k)
+{
+ if (count > trueCount) {
+ // number of iterations exceeded
+ // will be reported with checks
+ return;
+ }
+ space[count-1].i = i;
+ space[count-1].j = j;
+ space[count-1].k = k;
+}
+int test()
+{
+ int pass = 1;
+ LOOP_IV_TYPE0 i;
+ LOOP_IV_TYPE1 j;
+ LOOP_IV_TYPE2 k;
+
+ spaceType* openmpSpace;
+ spaceType* scalarSpace;
+
+ unsigned trueCount = 0;
+ unsigned openmpCount = 0;
+ unsigned scalarCount = 0;
+ unsigned uselessThreadsOpenMP = 0;
+ unsigned usefulThreadsOpenMP = 0;
+ unsigned chunkSizesOpenmp[MAX_THREADS] = {0};
+
+ unsigned num_threads = omp_get_max_threads();
+ if (num_threads > MAX_THREADS) num_threads = MAX_THREADS;
+ omp_set_num_threads(num_threads);
+
+ // count iterations and allocate space
+ LOOP {
+ ++trueCount;
+ }
+
+ openmpSpace = AllocSpace(trueCount);
+ scalarSpace = AllocSpace(trueCount);
+
+ // fill the scalar (compare) space
+ LOOP {
+ ++scalarCount;
+ Set(scalarSpace, scalarCount, trueCount, i, j, k);
+ }
+
+ // test run body:
+ // perform and record OpenMP iterations and thread use
+#pragma omp parallel num_threads(num_threads)
+ {
+#pragma omp for collapse(3) private (i, j, k)
+ LOOP
+ {
+ unsigned count;
+ unsigned gtid = omp_get_thread_num();
+#pragma omp atomic update
+ ++chunkSizesOpenmp[gtid];
+#pragma omp atomic capture
+ count = ++openmpCount;
+ Set(openmpSpace, count, trueCount, i, j, k);
+ }
+ }
+
+ // check for the right number of iterations processed
+ // (only need to check for less, greater is checked when recording)
+ if (openmpCount < trueCount) {
+ PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n", openmpCount, trueCount);
+ pass = 0;
+ } else if (openmpCount > trueCount) {
+ PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n", openmpCount, trueCount);
+ pass = 0;
+ }
+
+ // check openMP for iteration correctnes against scalar
+ for (unsigned i = 0; i < trueCount; i++) {
+ unsigned j;
+ for (j = 0; j < openmpCount; j++) {
+ if ((scalarSpace[i].i == openmpSpace[j].i) && (scalarSpace[i].j == openmpSpace[j].j) && (scalarSpace[i].k == openmpSpace[j].k)) {
+ break;
+ }
+ }
+ if (j == openmpCount) {
+ PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i, scalarSpace[i].j, scalarSpace[i].k);
+ pass = 0;
+ }
+ }
+
+ // check for efficient thread use
+ for (unsigned i = 0; i < num_threads; ++i) {
+ if (chunkSizesOpenmp[i] == 0) {
+ ++uselessThreadsOpenMP;
+ }
+ }
+
+ // a check to see if at least more than one thread was used (weakish)
+ if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) {
+ PRINTF("OpenMP FAILURE: threads are not used\n");
+ pass = 0;
+ }
+
+#if 0
+ // a check to see if the load was spread more or less evenly so that
+ // when there was more work than threads each one got at least something
+ // (stronger, but may currently fail for a general collapse case)
+ if ((trueCount >= num_threads) && (uselessThreadsOpenMP > 0)) {
+ PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n", uselessThreadsOpenMP, openmpCount);
+ pass = 0;
+ }
+#endif
+
+ // clean up space
+ FreeSpace(openmpSpace);
+ FreeSpace(scalarSpace);
+ return pass;
+}
>From 2f007182addfc760965b69948900ec3140d66b82 Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Thu, 21 Mar 2024 21:11:54 -0700
Subject: [PATCH 3/6] updated comment
---
openmp/runtime/src/kmp_collapse.cpp | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp
index 569d2c1508319d..52e0f194e0f25a 100644
--- a/openmp/runtime/src/kmp_collapse.cpp
+++ b/openmp/runtime/src/kmp_collapse.cpp
@@ -1517,16 +1517,11 @@ void kmp_handle_upper_triangle_matrix(
kmp_uint64 iter_with_current = iter_before_current + iter_current;
// calculate the outer loop lower bound (lbo) which is the max outer iv value
// that gives the number of iterations that is equal or just below the total
- // number of iterations executed by the previous threads, for less_than
- // (1-based) inner loops (inner_ub0 == -1) it will be i.e.
- // lbo*(lbo-1)/2<=iter_before_current => lbo^2-lbo-2*iter_before_current<=0
- // for less_than_equal (0-based) inner loops (inner_ub == 0) it will be:
- // i.e. lbo*(lbo+1)/2<=iter_before_current =>
- // lbo^2+lbo-2*iter_before_current<=0 both cases can be handled similarily
- // using a parameter to control the equatio sign
+ // number of iterations executed by the previous threads,
+ // lbo*(lbo+1)/2<=iter_before_current =>
+ // lbo^2+lbo-2*iter_before_current<=0
kmp_uint64 lower_bound_outer =
(kmp_uint64)(sqrt_newton_approx(1 + 8 * iter_before_current) + 1) / 2 - 1;
- ;
// calculate the inner loop lower bound which is the remaining number of
// iterations required to hit the total number of iterations executed by the
// previous threads giving the starting point of this thread
>From ea5890cabc72fd6c4cd8f15644548111f2375065 Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Thu, 21 Mar 2024 21:13:14 -0700
Subject: [PATCH 4/6] disabled a failing test
---
openmp/runtime/test/worksharing/for/collapse_many_int.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_int.c b/openmp/runtime/test/worksharing/for/collapse_many_int.c
index d834c5014ed5df..6a126e1b49b31b 100644
--- a/openmp/runtime/test/worksharing/for/collapse_many_int.c
+++ b/openmp/runtime/test/worksharing/for/collapse_many_int.c
@@ -1,4 +1,5 @@
// RUN: %libomp-compile-and-run
+// XFAIL: true
// Non-rectangular loop collapsing.
//
>From d8655f4ffed2b3e9c99c4a9e1f7e0c567ff8920f Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Thu, 21 Mar 2024 21:15:35 -0700
Subject: [PATCH 5/6] clang format
---
openmp/runtime/src/kmp_collapse.cpp | 2 +-
.../for/collapse_many_GELTGT_int.c | 25 ++++--
.../for/collapse_many_GTGEGT_int.c | 25 ++++--
.../for/collapse_many_LTLEGE_int.c | 25 ++++--
.../test/worksharing/for/collapse_many_int.c | 84 ++++++++++++-------
.../test/worksharing/for/collapse_one_int.c | 25 ++++--
6 files changed, 123 insertions(+), 63 deletions(-)
diff --git a/openmp/runtime/src/kmp_collapse.cpp b/openmp/runtime/src/kmp_collapse.cpp
index 52e0f194e0f25a..e63a98081db9b8 100644
--- a/openmp/runtime/src/kmp_collapse.cpp
+++ b/openmp/runtime/src/kmp_collapse.cpp
@@ -1517,7 +1517,7 @@ void kmp_handle_upper_triangle_matrix(
kmp_uint64 iter_with_current = iter_before_current + iter_current;
// calculate the outer loop lower bound (lbo) which is the max outer iv value
// that gives the number of iterations that is equal or just below the total
- // number of iterations executed by the previous threads,
+ // number of iterations executed by the previous threads:
// lbo*(lbo+1)/2<=iter_before_current =>
// lbo^2+lbo-2*iter_before_current<=0
kmp_uint64 lower_bound_outer =
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_GELTGT_int.c b/openmp/runtime/test/worksharing/for/collapse_many_GELTGT_int.c
index 23808244db4475..77b2d6918d8721 100644
--- a/openmp/runtime/test/worksharing/for/collapse_many_GELTGT_int.c
+++ b/openmp/runtime/test/worksharing/for/collapse_many_GELTGT_int.c
@@ -9,16 +9,28 @@
#define COMPARE0 >=
#define COMPARE1 <
#define COMPARE2 >
-#define LOOP \
- for (i = iLB; i COMPARE0 iUB; i += iStep) \
- for (j = jA0; j COMPARE1 jB0; j += jStep) \
- for (k = kA0; k COMPARE2 kB0; k += kStep)
+#define LOOP \
+ for (i = iLB; i COMPARE0 iUB; i += iStep) \
+ for (j = jA0; j COMPARE1 jB0; j += jStep) \
+ for (k = kA0; k COMPARE2 kB0; k += kStep)
#include "collapse_test.inc"
int main() {
int fail;
- iLB = 3; iUB = -2; jA0 = -3; jA1 = 0; jB0 = -6; jB1 = 0; kA0 = -2; kA1 = 0; kB0 = -4; kB1 = 0; iStep = -1; jStep = -1; kStep = -4;
+ iLB = 3;
+ iUB = -2;
+ jA0 = -3;
+ jA1 = 0;
+ jB0 = -6;
+ jB1 = 0;
+ kA0 = -2;
+ kA1 = 0;
+ kB0 = -4;
+ kB1 = 0;
+ iStep = -1;
+ jStep = -1;
+ kStep = -4;
PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
"kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
@@ -49,6 +61,5 @@ int main() {
}
}
-return fail;
+ return fail;
}
-
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_GTGEGT_int.c b/openmp/runtime/test/worksharing/for/collapse_many_GTGEGT_int.c
index 9a10b5d01895cd..985211172e6283 100644
--- a/openmp/runtime/test/worksharing/for/collapse_many_GTGEGT_int.c
+++ b/openmp/runtime/test/worksharing/for/collapse_many_GTGEGT_int.c
@@ -14,16 +14,28 @@
#define DLOOP_GE1
#define DLOOP_GT2
-#define LOOP \
- for (i = iLB; i COMPARE0 iUB; i += iStep) \
- for (j = jA0; j COMPARE1 jB0; j += jStep) \
- for (k = kA0; k COMPARE2 kB0; k += kStep)
+#define LOOP \
+ for (i = iLB; i COMPARE0 iUB; i += iStep) \
+ for (j = jA0; j COMPARE1 jB0; j += jStep) \
+ for (k = kA0; k COMPARE2 kB0; k += kStep)
#include "collapse_test.inc"
int main() {
int fail;
- iLB = 3; iUB = -2; jA0 = -3; jA1 = 0; jB0 = -6; jB1 = 0; kA0 = -2; kA1 = 0; kB0 = -4; kB1 = 0; iStep = -1; jStep = -1; kStep = -4;
+ iLB = 3;
+ iUB = -2;
+ jA0 = -3;
+ jA1 = 0;
+ jB0 = -6;
+ jB1 = 0;
+ kA0 = -2;
+ kA1 = 0;
+ kB0 = -4;
+ kB1 = 0;
+ iStep = -1;
+ jStep = -1;
+ kStep = -4;
PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
"kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
@@ -55,6 +67,5 @@ int main() {
}
}
-return fail;
+ return fail;
}
-
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_LTLEGE_int.c b/openmp/runtime/test/worksharing/for/collapse_many_LTLEGE_int.c
index 171b02daf8b24f..47e3b42226c838 100644
--- a/openmp/runtime/test/worksharing/for/collapse_many_LTLEGE_int.c
+++ b/openmp/runtime/test/worksharing/for/collapse_many_LTLEGE_int.c
@@ -9,16 +9,28 @@
#define COMPARE0 <
#define COMPARE1 <=
#define COMPARE2 >=
-#define LOOP \
- for (i = iLB; i COMPARE0 iUB; i += iStep) \
- for (j = jA0; j COMPARE1 jB0; j += jStep) \
- for (k = kA0; k COMPARE2 kB0; k += kStep)
+#define LOOP \
+ for (i = iLB; i COMPARE0 iUB; i += iStep) \
+ for (j = jA0; j COMPARE1 jB0; j += jStep) \
+ for (k = kA0; k COMPARE2 kB0; k += kStep)
#include "collapse_test.inc"
int main() {
int fail;
- iLB = -2; iUB = 3; jA0 = -3; jA1 = 0; jB0 = -6; jB1 = 0; kA0 = -2; kA1 = 0; kB0 = -4; kB1 = 0; iStep = -1; jStep = -1; kStep = -4;
+ iLB = -2;
+ iUB = 3;
+ jA0 = -3;
+ jA1 = 0;
+ jB0 = -6;
+ jB1 = 0;
+ kA0 = -2;
+ kA1 = 0;
+ kB0 = -4;
+ kB1 = 0;
+ iStep = -1;
+ jStep = -1;
+ kStep = -4;
PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
"kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
@@ -50,6 +62,5 @@ int main() {
}
}
-return fail;
+ return fail;
}
-
diff --git a/openmp/runtime/test/worksharing/for/collapse_many_int.c b/openmp/runtime/test/worksharing/for/collapse_many_int.c
index 6a126e1b49b31b..4455602df8a23b 100644
--- a/openmp/runtime/test/worksharing/for/collapse_many_int.c
+++ b/openmp/runtime/test/worksharing/for/collapse_many_int.c
@@ -13,41 +13,61 @@
for (k = j * kA1 + kA0; k <= j * kB1 + kB0; k += kStep)
#include "collapse_test.inc"
-int main()
-{
- int fail = 0;
-
- iLB = -2; iUB = 3; jA0 = -7; jA1 = -1; jB0 = 13; jB1 = 3; kA0 = -20; kA1 = -2; kB0 = 111; kB1 = -1; iStep = 5; jStep = 9; kStep = 10;
- PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
- iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
- fail = fail || (test()==0);
-
- if (!fail) {
-
- // NOTE: if a loop on some level won't execute for all iterations of an outer loop, it still should work.
- // Runtime doesn't require lower bounds to be <= upper bounds for all possible i, j, k.
-
- iLB = -2; iUB = 3; jA0 = -7; jB0 = 5; kA0 = -13; kB0 = 37;
-
- for (kA1 = -2; kA1 <= 2; ++kA1) { // <=
- for (kB1 = -2; kB1 <= 2; ++kB1) {
- for (jA1 = -3; jA1 <= 3; ++jA1) {
- for (jB1 = -3; jB1 <= 3; ++jB1) {
- for (iStep = 1; iStep <= 3; ++iStep) {
- for (jStep = 2; jStep <= 6; jStep += 2) {
- for (kStep = 2; kStep <= 8; kStep += 3) {
- PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
- iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
- fail = fail || (test() == 0);
- }
- }
- }
- }
+int main() {
+ int fail = 0;
+
+ iLB = -2;
+ iUB = 3;
+ jA0 = -7;
+ jA1 = -1;
+ jB0 = 13;
+ jB1 = 3;
+ kA0 = -20;
+ kA1 = -2;
+ kB0 = 111;
+ kB1 = -1;
+ iStep = 5;
+ jStep = 9;
+ kStep = 10;
+ PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; jB1=%d; kA0=%d; "
+ "kA1=%d; kB0=%d; kB1=%d; iStep=%d; jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1, iStep, jStep, kStep);
+ fail = fail || (test() == 0);
+
+ if (!fail) {
+
+ // NOTE: if a loop on some level won't execute for all iterations of an
+ // outer loop, it still should work. Runtime doesn't require lower bounds to
+ // be <= upper bounds for all possible i, j, k.
+
+ iLB = -2;
+ iUB = 3;
+ jA0 = -7;
+ jB0 = 5;
+ kA0 = -13;
+ kB0 = 37;
+
+ for (kA1 = -2; kA1 <= 2; ++kA1) { // <=
+ for (kB1 = -2; kB1 <= 2; ++kB1) {
+ for (jA1 = -3; jA1 <= 3; ++jA1) {
+ for (jB1 = -3; jB1 <= 3; ++jB1) {
+ for (iStep = 1; iStep <= 3; ++iStep) {
+ for (jStep = 2; jStep <= 6; jStep += 2) {
+ for (kStep = 2; kStep <= 8; kStep += 3) {
+ PRINTF("\nTrying iLB=%d; iUB=%d; jA0=%d; jA1=%d; jB0=%d; "
+ "jB1=%d; kA0=%d; kA1=%d; kB0=%d; kB1=%d; iStep=%d; "
+ "jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jA1, jB0, jB1, kA0, kA1, kB0, kB1,
+ iStep, jStep, kStep);
+ fail = fail || (test() == 0);
}
+ }
}
+ }
}
+ }
}
+ }
- return fail;
+ return fail;
}
-
diff --git a/openmp/runtime/test/worksharing/for/collapse_one_int.c b/openmp/runtime/test/worksharing/for/collapse_one_int.c
index 122f9a5a87b160..437d4bff31eb37 100644
--- a/openmp/runtime/test/worksharing/for/collapse_one_int.c
+++ b/openmp/runtime/test/worksharing/for/collapse_one_int.c
@@ -13,13 +13,20 @@
#include "collapse_test.inc"
-int main()
-{
- int fail;
- iLB = -2; iUB = 3; jA0 = -7; jB0 = 13; kA0 = -20; kB0 = 111; iStep = 5; jStep = 9; kStep = 10;
- PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jB0=%d; kA0=%d; kB0=%d; iStep=%d; jStep=%d; kStep=%d;\n",
- iLB, iUB, jA0, jB0, kA0, kB0, iStep, jStep, kStep);
- fail = (test() == 0);
- return fail;
+int main() {
+ int fail;
+ iLB = -2;
+ iUB = 3;
+ jA0 = -7;
+ jB0 = 13;
+ kA0 = -20;
+ kB0 = 111;
+ iStep = 5;
+ jStep = 9;
+ kStep = 10;
+ PRINTF("\nOne off iLB=%d; iUB=%d; jA0=%d; jB0=%d; kA0=%d; kB0=%d; iStep=%d; "
+ "jStep=%d; kStep=%d;\n",
+ iLB, iUB, jA0, jB0, kA0, kB0, iStep, jStep, kStep);
+ fail = (test() == 0);
+ return fail;
}
-
>From 7aec61515636156f186ebf17e28e8e104ba9bbe7 Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Thu, 21 Mar 2024 21:39:56 -0700
Subject: [PATCH 6/6] clang format
---
.../test/worksharing/for/collapse_test.inc | 215 +++++++++---------
1 file changed, 106 insertions(+), 109 deletions(-)
diff --git a/openmp/runtime/test/worksharing/for/collapse_test.inc b/openmp/runtime/test/worksharing/for/collapse_test.inc
index 8c88dc9860626e..de0e7e4e57f30d 100644
--- a/openmp/runtime/test/worksharing/for/collapse_test.inc
+++ b/openmp/runtime/test/worksharing/for/collapse_test.inc
@@ -65,140 +65,137 @@ LOOP_STYPE2 kA1, kB1, kStep;
#define COMPARE2 >
#endif
-
-typedef struct
-{
- LOOP_IV_TYPE0 i;
- LOOP_IV_TYPE1 j;
- LOOP_IV_TYPE2 k;
+typedef struct {
+ LOOP_IV_TYPE0 i;
+ LOOP_IV_TYPE1 j;
+ LOOP_IV_TYPE2 k;
} spaceType;
-spaceType* AllocSpace(unsigned size)
-{
+spaceType *AllocSpace(unsigned size) {
- spaceType *p = (spaceType*) malloc(size * sizeof(spaceType));
- memset(p, 0, size * sizeof(spaceType));
- return p;
+ spaceType *p = (spaceType *)malloc(size * sizeof(spaceType));
+ memset(p, 0, size * sizeof(spaceType));
+ return p;
}
-void FreeSpace(spaceType* space)
-{
- free(space);
-}
+void FreeSpace(spaceType *space) { free(space); }
// record an iteration
-void Set(spaceType* space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i, LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k)
-{
- if (count > trueCount) {
- // number of iterations exceeded
- // will be reported with checks
- return;
- }
- space[count-1].i = i;
- space[count-1].j = j;
- space[count-1].k = k;
+void Set(spaceType *space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i,
+ LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k) {
+ if (count > trueCount) {
+ // number of iterations exceeded
+ // will be reported with checks
+ return;
+ }
+ space[count - 1].i = i;
+ space[count - 1].j = j;
+ space[count - 1].k = k;
}
-int test()
-{
- int pass = 1;
- LOOP_IV_TYPE0 i;
- LOOP_IV_TYPE1 j;
- LOOP_IV_TYPE2 k;
-
- spaceType* openmpSpace;
- spaceType* scalarSpace;
-
- unsigned trueCount = 0;
- unsigned openmpCount = 0;
- unsigned scalarCount = 0;
- unsigned uselessThreadsOpenMP = 0;
- unsigned usefulThreadsOpenMP = 0;
- unsigned chunkSizesOpenmp[MAX_THREADS] = {0};
-
- unsigned num_threads = omp_get_max_threads();
- if (num_threads > MAX_THREADS) num_threads = MAX_THREADS;
- omp_set_num_threads(num_threads);
-
- // count iterations and allocate space
- LOOP {
- ++trueCount;
- }
-
- openmpSpace = AllocSpace(trueCount);
- scalarSpace = AllocSpace(trueCount);
-
- // fill the scalar (compare) space
- LOOP {
- ++scalarCount;
- Set(scalarSpace, scalarCount, trueCount, i, j, k);
- }
-
- // test run body:
- // perform and record OpenMP iterations and thread use
+int test() {
+ int pass = 1;
+ LOOP_IV_TYPE0 i;
+ LOOP_IV_TYPE1 j;
+ LOOP_IV_TYPE2 k;
+
+ spaceType *openmpSpace;
+ spaceType *scalarSpace;
+
+ unsigned trueCount = 0;
+ unsigned openmpCount = 0;
+ unsigned scalarCount = 0;
+ unsigned uselessThreadsOpenMP = 0;
+ unsigned usefulThreadsOpenMP = 0;
+ unsigned chunkSizesOpenmp[MAX_THREADS] = {0};
+
+ unsigned num_threads = omp_get_max_threads();
+ if (num_threads > MAX_THREADS)
+ num_threads = MAX_THREADS;
+ omp_set_num_threads(num_threads);
+
+ // count iterations and allocate space
+ LOOP { ++trueCount; }
+
+ openmpSpace = AllocSpace(trueCount);
+ scalarSpace = AllocSpace(trueCount);
+
+ // fill the scalar (compare) space
+ LOOP {
+ ++scalarCount;
+ Set(scalarSpace, scalarCount, trueCount, i, j, k);
+ }
+
+ // test run body:
+ // perform and record OpenMP iterations and thread use
#pragma omp parallel num_threads(num_threads)
- {
-#pragma omp for collapse(3) private (i, j, k)
- LOOP
- {
- unsigned count;
- unsigned gtid = omp_get_thread_num();
+ {
+#pragma omp for collapse(3) private(i, j, k)
+ LOOP {
+ unsigned count;
+ unsigned gtid = omp_get_thread_num();
#pragma omp atomic update
- ++chunkSizesOpenmp[gtid];
+ ++chunkSizesOpenmp[gtid];
#pragma omp atomic capture
- count = ++openmpCount;
- Set(openmpSpace, count, trueCount, i, j, k);
- }
- }
-
- // check for the right number of iterations processed
- // (only need to check for less, greater is checked when recording)
- if (openmpCount < trueCount) {
- PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n", openmpCount, trueCount);
- pass = 0;
- } else if (openmpCount > trueCount) {
- PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n", openmpCount, trueCount);
- pass = 0;
+ count = ++openmpCount;
+ Set(openmpSpace, count, trueCount, i, j, k);
}
-
- // check openMP for iteration correctnes against scalar
- for (unsigned i = 0; i < trueCount; i++) {
- unsigned j;
- for (j = 0; j < openmpCount; j++) {
- if ((scalarSpace[i].i == openmpSpace[j].i) && (scalarSpace[i].j == openmpSpace[j].j) && (scalarSpace[i].k == openmpSpace[j].k)) {
- break;
- }
- }
- if (j == openmpCount) {
- PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i, scalarSpace[i].j, scalarSpace[i].k);
- pass = 0;
+ }
+
+ // check for the right number of iterations processed
+ // (only need to check for less, greater is checked when recording)
+ if (openmpCount < trueCount) {
+ PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n",
+ openmpCount, trueCount);
+ pass = 0;
+ } else if (openmpCount > trueCount) {
+ PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n",
+ openmpCount, trueCount);
+ pass = 0;
+ }
+
+ // check openMP for iteration correctnes against scalar
+ for (unsigned i = 0; i < trueCount; i++) {
+ unsigned j;
+ for (j = 0; j < openmpCount; j++) {
+ if ((scalarSpace[i].i == openmpSpace[j].i) &&
+ (scalarSpace[i].j == openmpSpace[j].j) &&
+ (scalarSpace[i].k == openmpSpace[j].k)) {
+ break;
}
}
-
- // check for efficient thread use
- for (unsigned i = 0; i < num_threads; ++i) {
- if (chunkSizesOpenmp[i] == 0) {
- ++uselessThreadsOpenMP;
- }
+ if (j == openmpCount) {
+ PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i,
+ scalarSpace[i].j, scalarSpace[i].k);
+ pass = 0;
}
+ }
- // a check to see if at least more than one thread was used (weakish)
- if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) {
- PRINTF("OpenMP FAILURE: threads are not used\n");
- pass = 0;
+ // check for efficient thread use
+ for (unsigned i = 0; i < num_threads; ++i) {
+ if (chunkSizesOpenmp[i] == 0) {
+ ++uselessThreadsOpenMP;
}
+ }
+
+ // a check to see if at least more than one thread was used (weakish)
+ if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) {
+ PRINTF("OpenMP FAILURE: threads are not used\n");
+ pass = 0;
+ }
#if 0
// a check to see if the load was spread more or less evenly so that
// when there was more work than threads each one got at least something
// (stronger, but may currently fail for a general collapse case)
if ((trueCount >= num_threads) && (uselessThreadsOpenMP > 0)) {
- PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n", uselessThreadsOpenMP, openmpCount);
+ PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n",
+ uselessThreadsOpenMP, openmpCount);
pass = 0;
}
#endif
- // clean up space
- FreeSpace(openmpSpace);
- FreeSpace(scalarSpace);
- return pass;
+ // clean up space
+ FreeSpace(openmpSpace);
+ FreeSpace(scalarSpace);
+ return pass;
}
More information about the Openmp-commits
mailing list