[Openmp-commits] [openmp] r274894 - Improving EPCC performance when linking with hwloc

Fri Jul 8 10:43:21 PDT 2016

Author: jlpeyton
Date: Fri Jul  8 12:43:21 2016
New Revision: 274894

URL: http://llvm.org/viewvc/llvm-project?rev=274894&view=rev
Log:
Improving EPCC performance when linking with hwloc

When linking with libhwloc, the ORDERED EPCC test slows down on big
machines (> 48 cores). Performance analysis showed that a cache thrash
was occurring and this padding helps alleviate the problem.

Also, inside the main spin-wait loop in kmp_wait_release.h, we can eliminate
the references to the global shared variables by evaluating the
oversubscription check once, before the loop, into a local variable
(oversubscribed) and testing that variable instead.

Differential Revision: http://reviews.llvm.org/D22093

Modified:
    openmp/trunk/runtime/src/kmp.h
    openmp/trunk/runtime/src/kmp_dispatch.cpp
    openmp/trunk/runtime/src/kmp_wait_release.h

Modified: openmp/trunk/runtime/src/kmp.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp.h?rev=274894&r1=274893&r2=274894&view=diff
==============================================================================

--- openmp/trunk/runtime/src/kmp.h (original)
+++ openmp/trunk/runtime/src/kmp.h Fri Jul  8 12:43:21 2016
@@ -1706,6 +1706,12 @@ typedef struct dispatch_shared_info {
     volatile kmp_uint32    *doacross_flags;    // shared array of iteration flags (0/1)
     kmp_int32               doacross_num_done; // count finished threads
 #endif
+#if KMP_USE_HWLOC
+    // When linking with libhwloc, the ORDERED EPCC test slows down on big
+    // machines (> 48 cores). Performance analysis showed that a cache thrash
+    // was occurring and this padding helps alleviate the problem.
+    char padding[64];
+#endif
 } dispatch_shared_info_t;
 
 typedef struct kmp_disp {
@@ -2567,7 +2573,7 @@ typedef struct KMP_ALIGN_CACHE kmp_base_
     int t_size_changed; // team size was changed?: 0: no, 1: yes, -1: changed via omp_set_num_threads() call
 
     // Read/write by workers as well -----------------------------------------------------------------------
-#if KMP_ARCH_X86 || KMP_ARCH_X86_64
+#if (KMP_ARCH_X86 || KMP_ARCH_X86_64) && !KMP_USE_HWLOC
     // Using CACHE_LINE=64 reduces memory footprint, but causes a big perf regression of epcc 'parallel'
     // and 'barrier' on fxe256lin01. This extra padding serves to fix the performance of epcc 'parallel'
     // and 'barrier' when CACHE_LINE=64. TODO: investigate more and get rid of this padding.

Modified: openmp/trunk/runtime/src/kmp_dispatch.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_dispatch.cpp?rev=274894&r1=274893&r2=274894&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_dispatch.cpp (original)
+++ openmp/trunk/runtime/src/kmp_dispatch.cpp Fri Jul  8 12:43:21 2016
@@ -180,6 +180,12 @@ struct dispatch_shared_info_template {
     kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
     kmp_int32               doacross_num_done; // count finished threads
 #endif
+#if KMP_USE_HWLOC
+    // When linking with libhwloc, the ORDERED EPCC test slows down on big
+    // machines (> 48 cores). Performance analysis showed that a cache thrash
+    // was occurring and this padding helps alleviate the problem.
+    char padding[64];
+#endif
 };
 
 /* ------------------------------------------------------------------------ */

Modified: openmp/trunk/runtime/src/kmp_wait_release.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_wait_release.h?rev=274894&r1=274893&r2=274894&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_wait_release.h (original)
+++ openmp/trunk/runtime/src/kmp_wait_release.h Fri Jul  8 12:43:21 2016
@@ -97,6 +97,7 @@ __kmp_wait_template(kmp_info_t *this_thr
     kmp_uint32 hibernate;
     int th_gtid;
     int tasks_completed = FALSE;
+    int oversubscribed;
 
     KMP_FSYNC_SPIN_INIT(spin, NULL);
     if (flag->done_check()) {
@@ -166,6 +167,7 @@ __kmp_wait_template(kmp_info_t *this_thr
                       hibernate - __kmp_global.g.g_time.dt.t_value));
     }
 
+    oversubscribed = (TCR_4(__kmp_nth) > __kmp_avail_proc);
     KMP_MB();
 
     // Main wait spin loop
@@ -201,7 +203,7 @@ __kmp_wait_template(kmp_info_t *this_thr
         }
 
         // If we are oversubscribed, or have waited a bit (and KMP_LIBRARY=throughput), then yield
-        KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
+        KMP_YIELD(oversubscribed);
         // TODO: Should it be number of cores instead of thread contexts? Like:
         // KMP_YIELD(TCR_4(__kmp_nth) > __kmp_ncores);
         // Need performance improvement data to make the change...