[Openmp-commits] [openmp] fix endianness dependent definitions in kmp.h for MSVC (PR #84530)
Vadim Paretsky via Openmp-commits
openmp-commits at lists.llvm.org
Fri Mar 8 11:03:17 PST 2024
https://github.com/vadikp-intel updated https://github.com/llvm/llvm-project/pull/84530
>From 75d7c576bf4e5a0f13a4fdb0fda036e3955bed89 Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Fri, 8 Mar 2024 10:17:46 -0800
Subject: [PATCH 1/3] fix endianness dependent definitions for MSVC in kmp.h
---
openmp/runtime/src/kmp.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 121e7e959129ea..59a90ea702f688 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -2507,7 +2507,7 @@ typedef struct kmp_depend_info {
union {
kmp_uint8 flag; // flag as an unsigned char
struct { // flag as a set of 8 bits
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
/* Same fields as in the #else branch, but in reverse order */
unsigned all : 1;
unsigned unused : 3;
@@ -2672,7 +2672,7 @@ typedef struct kmp_task_stack {
#endif // BUILD_TIED_TASK_STACK
typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
/* Same fields as in the #else branch, but in reverse order */
#if OMPX_TASKGRAPH
unsigned reserved31 : 6;
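
Why the defined() guard matters here: __BYTE_ORDER__ and __ORDER_BIG_ENDIAN__ are predefined by GCC and Clang but not by MSVC, and the preprocessor replaces identifiers that are not macros with 0 inside #if expressions, so the unguarded comparison evaluates as 0 == 0 and silently selects the big-endian bit-field order on little-endian Windows targets. The stand-alone sketch below is illustrative only (demo_flags and the raw-word check are hypothetical, not part of the patch) and shows the guarded pattern:

    /* Illustrative only: the guarded endianness check from the patch applied
       to a toy bit-field struct. */
    #include <stdio.h>
    #include <string.h>

    typedef struct {
    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
      /* big-endian: declare the bit-fields in reverse order */
      unsigned rest : 31;
      unsigned in : 1;
    #else
      /* little-endian, and MSVC, where __BYTE_ORDER__ is not defined at all */
      unsigned in : 1;
      unsigned rest : 31;
    #endif
    } demo_flags;

    int main(void) {
      demo_flags f;
      unsigned raw;
      memset(&f, 0, sizeof(f));
      f.in = 1;
      memcpy(&raw, &f, sizeof(raw));
      printf("flag word: 0x%08x\n", raw); /* 0x00000001 on x86/ARM64 Windows */
      return 0;
    }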
>From 570edd0dcb4d76e1714612328fdf14ca5c57d52f Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Fri, 8 Mar 2024 10:34:46 -0800
Subject: [PATCH 2/3] fix an additional instance in kmp_lock.h
---
openmp/runtime/src/kmp_lock.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
index e2a0cda01a9718..16eef38ac4fe7b 100644
--- a/openmp/runtime/src/kmp_lock.h
+++ b/openmp/runtime/src/kmp_lock.h
@@ -120,7 +120,8 @@ extern void __kmp_validate_locks(void);
struct kmp_base_tas_lock {
// KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ && __LP64__
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && \
+ __LP64__
// Flip the ordering of the high and low 32-bit member to be consistent
// with the memory layout of the address in 64-bit big-endian.
kmp_int32 depth_locked; // depth locked, for nested locks only
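
For context on the __LP64__ part of this check (illustration only, not taken from the patch): on a 64-bit big-endian target the low-order 32 bits of an 8-byte word sit at the higher address, so the two 32-bit lock members must be declared in the opposite order for the poll word to keep overlapping the low half. A minimal sketch with hypothetical names (pair64, poll, depth):

    #include <stdio.h>
    #include <stdint.h>

    /* Two 32-bit members overlaid on one 64-bit word, mirroring the idea in
       kmp_base_tas_lock. */
    typedef union {
      uint64_t whole;
      struct {
    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && \
        defined(__LP64__)
        int32_t depth; /* offset 0 = high half of the 64-bit word */
        int32_t poll;  /* offset 4 = low half */
    #else
        int32_t poll;  /* offset 0 = low half of the 64-bit word */
        int32_t depth; /* offset 4 = high half */
    #endif
      } parts;
    } pair64;

    int main(void) {
      pair64 p;
      p.whole = 1; /* a small value stored through the 64-bit view */
      /* With the flipped declaration order on big-endian LP64, poll still maps
         to the low-order 32 bits, so this prints poll = 1 either way. */
      printf("poll = %d, depth = %d\n", (int)p.parts.poll, (int)p.parts.depth);
      return 0;
    }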
>From 919bc9588f817401d23b2934d385e519ec44b4da Mon Sep 17 00:00:00 2001
From: Vadim Paretsky <b-vadipa at microsoft.com>
Date: Fri, 8 Mar 2024 11:03:04 -0800
Subject: [PATCH 3/3] clang format
---
openmp/runtime/src/kmp.h | 13 +-
openmp/runtime/src/kmp_lock.h | 7 +-
.../for/omp_for_schedule_static_collapse.c | 203 ++++++++++++++++++
3 files changed, 214 insertions(+), 9 deletions(-)
create mode 100644 openmp/runtime/test/worksharing/for/omp_for_schedule_static_collapse.c
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 59a90ea702f688..b5c0307b49e192 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -20,7 +20,7 @@
/* This fix replaces gettimeofday with clock_gettime for better scalability on
the Altix. Requires user code to be linked with -lrt. */
-//#define FIX_SGI_CLOCK
+// #define FIX_SGI_CLOCK
/* Defines for OpenMP 3.0 tasking and auto scheduling */
@@ -463,8 +463,9 @@ enum sched_type : kmp_int32 {
#define SCHEDULE_WITHOUT_MODIFIERS(s) \
(enum sched_type)( \
(s) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic))
-#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sch_modifier_monotonic) != 0)
-#define SCHEDULE_HAS_NONMONOTONIC(s) (((s)&kmp_sch_modifier_nonmonotonic) != 0)
+#define SCHEDULE_HAS_MONOTONIC(s) (((s) & kmp_sch_modifier_monotonic) != 0)
+#define SCHEDULE_HAS_NONMONOTONIC(s) \
+ (((s) & kmp_sch_modifier_nonmonotonic) != 0)
#define SCHEDULE_HAS_NO_MODIFIERS(s) \
(((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0)
#define SCHEDULE_GET_MODIFIERS(s) \
@@ -2729,7 +2730,8 @@ typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
unsigned freed : 1; /* 1==freed, 0==allocated */
unsigned native : 1; /* 1==gcc-compiled task, 0==intel */
#if OMPX_TASKGRAPH
- unsigned onced : 1; /* 1==ran once already, 0==never ran, record & replay purposes */
+ unsigned onced : 1; /* 1==ran once already, 0==never ran, record & replay
+ purposes */
unsigned reserved31 : 6; /* reserved for library use */
#else
unsigned reserved31 : 7; /* reserved for library use */
@@ -3881,7 +3883,8 @@ extern void __kmp_check_stack_overlap(kmp_info_t *thr);
extern void __kmp_expand_host_name(char *buffer, size_t size);
extern void __kmp_expand_file_name(char *result, size_t rlen, char *pattern);
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64 || (KMP_OS_WINDOWS && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM))
+#if KMP_ARCH_X86 || KMP_ARCH_X86_64 || \
+ (KMP_OS_WINDOWS && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM))
extern void
__kmp_initialize_system_tick(void); /* Initialize timer tick value */
#endif
diff --git a/openmp/runtime/src/kmp_lock.h b/openmp/runtime/src/kmp_lock.h
index 16eef38ac4fe7b..cd8927095e7048 100644
--- a/openmp/runtime/src/kmp_lock.h
+++ b/openmp/runtime/src/kmp_lock.h
@@ -120,7 +120,7 @@ extern void __kmp_validate_locks(void);
struct kmp_base_tas_lock {
// KMP_LOCK_FREE(tas) => unlocked; locked: (gtid+1) of owning thread
-#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && \
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) && \
__LP64__
// Flip the ordering of the high and low 32-bit member to be consistent
// with the memory layout of the address in 64-bit big-endian.
@@ -661,9 +661,8 @@ extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
KMP_INIT_BACKOFF(time); \
do { \
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
- } while ( \
- lck->tas.lk.poll != 0 || \
- !__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
+ } while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \
+ &lck->tas.lk.poll, 0, gtid + 1)); \
} \
KMP_FSYNC_ACQUIRED(lck); \
} else { \
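
The reflowed loop above is the usual test-and-test-and-set acquire: spin (yielding or backing off) while the lock word is non-zero, then try to claim it with an acquire-ordered compare-and-swap. A self-contained sketch of the same pattern in C11 atomics, with hypothetical names (tas_word, tas_acquire) rather than the runtime's KMP_* machinery:

    #include <stdatomic.h>

    typedef struct {
      _Atomic int poll; /* 0 == free, gtid + 1 == owning thread */
    } tas_word;

    static void tas_acquire(tas_word *lk, int gtid) {
      int expected;
      do {
        /* spin while the lock looks taken; the runtime yields/backs off here */
        while (atomic_load_explicit(&lk->poll, memory_order_relaxed) != 0)
          ;
        expected = 0;
      } while (!atomic_compare_exchange_strong_explicit(
          &lk->poll, &expected, gtid + 1, memory_order_acquire,
          memory_order_relaxed));
    }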
diff --git a/openmp/runtime/test/worksharing/for/omp_for_schedule_static_collapse.c b/openmp/runtime/test/worksharing/for/omp_for_schedule_static_collapse.c
new file mode 100644
index 00000000000000..637eb6a3e573b9
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_schedule_static_collapse.c
@@ -0,0 +1,203 @@
+// RUN: %libomp-compile-and-run
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "omp_testsuite.h"
+
+#ifdef _MSC_VER
+#define EFFICIENCY_CHECK
+#endif
+
+int* Alloc(unsigned size) {
+ int* p = malloc(size * sizeof(int));
+ memset(p, 0, size * sizeof(int));
+ return p;
+}
+
+#define I(i,j) (i * (upper_bound + outer_eq) + j)
+char *loop_type[] = {"< ,<", "< ,<=", "<=,< ", "<=,<="};
+
+int test_omp_for_schedule_static_collapse_lower_triangle(unsigned num_threads, unsigned upper_bound, int outer_eq, int inner_eq) {
+
+ unsigned outer_iterations = upper_bound + outer_eq;
+ unsigned first_inner_element = inner_eq;
+ unsigned last_inner_element = first_inner_element + (outer_iterations - 1);
+ unsigned total_iterations =
+ (first_inner_element + last_inner_element) * outer_iterations / 2;
+ int i, j;
+
+ /* To ensure correctness, only valid iterations may execute, and each must
+ execute exactly once. execution_count stores how many times each ran. */
+ int *execution_count = Alloc(outer_iterations * outer_iterations);
+ /* Stores the number of iterations executed by each thread. */
+ int* iterations_per_thread = Alloc(num_threads);
+
+ char *loop_type[] = {"< ,<", "< ,<=", "<=,< ", "<=,<="};
+
+ omp_set_num_threads(num_threads);
+
+#ifdef VERBOSE
+ fprintf(stderr, "INFO: Using %6d threads for %6d outer iterations (%6d chunks) loop type lower triangle: %s - ", num_threads, upper_bound, total_iterations, loop_type[outer_eq*2 + inner_eq]);
+#endif
+
+#pragma omp parallel shared(iterations_per_thread, execution_count)
+ { /* begin of parallel */
+ /* Lower triangular execution_count matrix */
+ #pragma omp for schedule (static) collapse(2)
+ for(i = 0; i < upper_bound + outer_eq; i++) {
+ for (j = 0; j < i + inner_eq; j++) {
+ iterations_per_thread[omp_get_thread_num()]++;
+ execution_count[I(i, j)]++;
+ }
+ }/* end of for*/
+ }/* end of parallel */
+
+ /* check the execution_count array */
+ for (i = 0; i < upper_bound + outer_eq; i++) {
+ for (j = 0; j < i + inner_eq; j++) {
+ /* iterations with j < i + inner_eq are valid and must execute exactly once */
+ if (execution_count[I(i, j)] != 1) {
+#ifdef VERBOSE
+ fprintf(
+ stderr,
+ "ERROR: valid iteration [%i,%i]:%i not executed exactly once.\n", i,
+ j, execution_count[I(i, j)]);
+#endif
+ return 0;
+ }
+ }
+ for (j = i + inner_eq; j < upper_bound + outer_eq; j++) {
+ /* iterations with j >= i + inner_eq are invalid and must not have executed */
+ if (execution_count[I(i, j)] > 0) {
+#ifdef VERBOSE
+ fprintf(stderr, "ERROR: invalid iteration [%i,%i]:%i executed.\n", i,
+ j, execution_count[I(i, j)]);
+#endif
+ return 0;
+ }
+ }
+ }
+#ifdef EFFICIENCY_CHECK
+ /* Ensure the number of iterations executed by each thread is within bounds */
+ for(i = 0;i < num_threads; i++) {
+ if (iterations_per_thread[i] < total_iterations / num_threads ||
+ iterations_per_thread[i] > total_iterations / num_threads + 1) {
+#ifdef VERBOSE
+ fprintf(stderr, "ERROR: Inefficient Collapse thread:%i [%i,%i]:%i\n", i,
+ total_iterations / num_threads,
+ total_iterations / num_threads + 1, iterations_per_thread[i]);
+#endif
+ return 0;
+ }
+ }
+#endif
+#ifdef VERBOSE
+ fprintf(stderr, "PASSED\r");
+#endif
+
+ free(execution_count);
+ free(iterations_per_thread);
+ return 1;
+}
+
+int test_omp_for_schedule_static_collapse_upper_triangle(unsigned num_threads, unsigned upper_bound) {
+
+ int outer_eq = 0;
+ int inner_eq = 0;
+ unsigned outer_iterations = upper_bound + outer_eq;
+ unsigned last_inner_element = inner_eq;
+ unsigned first_inner_element = last_inner_element + outer_iterations + 1;
+ unsigned total_iterations =
+ (first_inner_element + last_inner_element) * outer_iterations / 2;
+ int i, j;
+
+ /* To ensure correctness, only valid iterations may execute, and each must
+ execute exactly once. execution_count stores how many times each ran. */
+ int *execution_count = Alloc(outer_iterations * outer_iterations);
+ /* Stores the number of iterations executed by each thread. */
+ int* iterations_per_thread = Alloc(num_threads);
+
+ omp_set_num_threads(num_threads);
+#ifdef VERBOSE
+ fprintf(stderr, "INFO: Using %6d threads for %6d outer iterations (%6d chunks) loop type upper triangle: %s - ", num_threads, upper_bound, total_iterations, loop_type[outer_eq*2 + inner_eq]);
+#endif
+
+#pragma omp parallel shared(iterations_per_thread, execution_count)
+ { /* begin of parallel */
+ /* Upper triangular execution_count matrix */
+ #pragma omp for schedule (static) collapse(2)
+ for(i = 0; i < upper_bound + outer_eq; i++) {
+ for (j = i; j < upper_bound + inner_eq; j++) {
+ iterations_per_thread[omp_get_thread_num()]++;
+ execution_count[I(i, j)]++;
+ }
+ }/* end of for*/
+ }/* end of parallel */
+
+ /* check the execution_count array */
+ for (i = 0; i < upper_bound + outer_eq; i++) {
+ for (j = i; j < upper_bound + inner_eq; j++) {
+ /* iterations with j >= i are valid and must be executed exactly once */
+ if (execution_count[I(i, j)] != 1) {
+#ifdef VERBOSE
+ fprintf(
+ stderr,
+ "ERROR: valid iteration [%i,%i]:%i not executed exactly once.\n", i,
+ j, execution_count[I(i, j)]);
+#endif
+ return 0;
+ }
+ }
+ for (j = 0; j < i; j++) {
+ /* iterations with j < i are invalid and must not have executed */
+ if (execution_count[I(i, j)] > 0) {
+#ifdef VERBOSE
+ fprintf(stderr, "ERROR: invalid iteration [%i,%i]:%i executed.\n", i,
+ j, execution_count[I(i, j)]);
+#endif
+ return 0;
+ }
+ }
+ }
+#ifdef EFFICIENCY_CHECK
+ /* Ensure the number of iterations executed by each thread is within bounds */
+ for(i = 0;i < num_threads; i++) {
+ if (iterations_per_thread[i] < total_iterations / num_threads ||
+ iterations_per_thread[i] > total_iterations / num_threads + 1) {
+#ifdef VERBOSE
+ fprintf(stderr, "ERROR: Inefficient Collapse thread:%i [%i,%i]:%i\n", i,
+ total_iterations / num_threads,
+ total_iterations / num_threads + 1, iterations_per_thread[i]);
+#endif
+ return 0;
+ }
+ }
+#endif
+#ifdef VERBOSE
+ fprintf(stderr, "PASSED\r");
+#endif
+
+ free(execution_count);
+ free(iterations_per_thread);
+ return 1;
+}
+
+
+int main(int narg, char* argv[]) {
+ unsigned min_threads = 1;
+ unsigned max_threads = 64;
+ unsigned min_iter = 0;
+ unsigned max_iter = 64;
+ int i, j, outer_eq, inner_eq;
+
+ for(i = min_threads; i <= max_threads; i++)
+ for (j = min_iter; j <= max_iter; j++) {
+ for (outer_eq = 0;outer_eq <= 1; outer_eq++)
+ for (inner_eq = 0; inner_eq <= 1; inner_eq++)
+ if (!test_omp_for_schedule_static_collapse_lower_triangle(i, j, outer_eq, inner_eq))
+ return 1;
+ if (!test_omp_for_schedule_static_collapse_upper_triangle(i, j))
+ return 1;
+ }
+ return 0;
+}
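
A note on the arithmetic in the new test (for the reader, not part of the patch): both test functions compute the expected trip count of the collapsed loop nest as the sum of an arithmetic progression of per-row inner trip counts,

    total_iterations = (first_inner_element + last_inner_element) * outer_iterations / 2

and the EFFICIENCY_CHECK then requires each thread's share under schedule(static) collapse(2) to be either total_iterations / num_threads or that value plus one.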