[Openmp-commits] [openmp] r338276 - [OpenMP][Stats] Cleanup stats gathering code

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Mon Jul 30 10:41:08 PDT 2018


Author: jlpeyton
Date: Mon Jul 30 10:41:08 2018
New Revision: 338276

URL: http://llvm.org/viewvc/llvm-project?rev=338276&view=rev
Log:
[OpenMP][Stats] Cleanup stats gathering code

1) Remove unnecessary data from list node structure
2) Remove timerPair in favor of pushing/popping explicitTimers.
   This way, nested timers will work properly.
3) Fix #pragma omp critical timers
4) Add histogram capability
5) Add KMP_STATS_FILE formatting capability
6) Have time partitioned into serial & parallel by introducing
   partitionedTimers::exchange(). This also counts the number of serial regions
   in the executable.
7) Fix up the timers around OMP loops so that scheduling overhead and work are
   both counted correctly.
8) Fix up the iterations statistics so they count the number of iterations the
   thread receives at each loop scheduling event
9) Change timers so there is only one RDTSC read per event change
10) Fix up the outdated comments for the timers

Differential Revision: https://reviews.llvm.org/D49699

Modified:
    openmp/trunk/runtime/src/kmp_csupport.cpp
    openmp/trunk/runtime/src/kmp_dispatch.cpp
    openmp/trunk/runtime/src/kmp_runtime.cpp
    openmp/trunk/runtime/src/kmp_sched.cpp
    openmp/trunk/runtime/src/kmp_stats.cpp
    openmp/trunk/runtime/src/kmp_stats.h
    openmp/trunk/runtime/src/kmp_stats_timing.cpp
    openmp/trunk/runtime/src/z_Linux_util.cpp

Modified: openmp/trunk/runtime/src/kmp_csupport.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_csupport.cpp?rev=338276&r1=338275&r2=338276&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_csupport.cpp (original)
+++ openmp/trunk/runtime/src/kmp_csupport.cpp Mon Jul 30 10:41:08 2018
@@ -262,6 +262,14 @@ void __kmpc_fork_call(ident_t *loc, kmp_
   int gtid = __kmp_entry_gtid();
 
 #if (KMP_STATS_ENABLED)
+  // If we were in a serial region, then stop the serial timer, record
+  // the event, and start parallel region timer
+  stats_state_e previous_state = KMP_GET_THREAD_STATE();
+  if (previous_state == stats_state_e::SERIAL_REGION) {
+    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_parallel_overhead);
+  } else {
+    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel_overhead);
+  }
   int inParallel = __kmpc_in_parallel(loc);
   if (inParallel) {
     KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
@@ -318,6 +326,14 @@ void __kmpc_fork_call(ident_t *loc, kmp_
 
     va_end(ap);
   }
+
+#if KMP_STATS_ENABLED
+  if (previous_state == stats_state_e::SERIAL_REGION) {
+    KMP_EXCHANGE_PARTITIONED_TIMER(OMP_serial);
+  } else {
+    KMP_POP_PARTITIONED_TIMER();
+  }
+#endif // KMP_STATS_ENABLED
 }
 
 #if OMP_40_ENABLED
@@ -1115,8 +1131,6 @@ void __kmpc_critical(ident_t *loc, kmp_i
   __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
 #else
   KMP_COUNT_BLOCK(OMP_CRITICAL);
-  KMP_TIME_PARTITIONED_BLOCK(
-      OMP_critical_wait); /* Time spent waiting to enter the critical section */
 #if OMPT_SUPPORT && OMPT_OPTIONAL
   omp_state_t prev_state = omp_state_undefined;
   ompt_thread_info_t ti;
@@ -1127,6 +1141,7 @@ void __kmpc_critical(ident_t *loc, kmp_i
 
   // TODO: add THR_OVHD_STATE
 
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
   KMP_CHECK_USER_LOCK_INIT();
 
   if ((__kmp_user_lock_kind == lk_tas) &&
@@ -1193,8 +1208,9 @@ void __kmpc_critical(ident_t *loc, kmp_i
     }
   }
 #endif
+  KMP_POP_PARTITIONED_TIMER();
 
-  KMP_START_EXPLICIT_TIMER(OMP_critical);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
   KA_TRACE(15, ("__kmpc_critical: done T#%d\n", global_tid));
 #endif // KMP_USE_DYNAMIC_LOCK
 }
@@ -1345,6 +1361,7 @@ void __kmpc_critical_with_hint(ident_t *
 
   kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
   // Check if it is initialized.
+  KMP_PUSH_PARTITIONED_TIMER(OMP_critical_wait);
   if (*lk == 0) {
     kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
     if (KMP_IS_D_LOCK(lckseq)) {
@@ -1422,6 +1439,7 @@ void __kmpc_critical_with_hint(ident_t *
 #endif
     KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
   }
+  KMP_POP_PARTITIONED_TIMER();
 
 #if USE_ITT_BUILD
   __kmp_itt_critical_acquired(lck);
@@ -1753,6 +1771,7 @@ void __kmpc_end_single(ident_t *loc, kmp
 Mark the end of a statically scheduled loop.
 */
 void __kmpc_for_static_fini(ident_t *loc, kmp_int32 global_tid) {
+  KMP_POP_PARTITIONED_TIMER();
   KE_TRACE(10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
 
 #if OMPT_SUPPORT && OMPT_OPTIONAL
@@ -1779,7 +1798,6 @@ void __kmpc_for_static_fini(ident_t *loc
         &(task_info->task_data), 0, OMPT_GET_RETURN_ADDRESS(0));
   }
 #endif
-
   if (__kmp_env_consistency_check)
     __kmp_pop_workshare(global_tid, ct_pdo, loc);
 }

Modified: openmp/trunk/runtime/src/kmp_dispatch.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_dispatch.cpp?rev=338276&r1=338275&r2=338276&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_dispatch.cpp (original)
+++ openmp/trunk/runtime/src/kmp_dispatch.cpp Mon Jul 30 10:41:08 2018
@@ -321,7 +321,7 @@ void __kmp_dispatch_init_algorithm(ident
 
     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
     if (nproc > 1 && ntc >= nproc) {
-      KMP_COUNT_BLOCK(OMP_FOR_static_steal);
+      KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
       T id = tid;
       T small_chunk, extras;
 
@@ -770,6 +770,15 @@ __kmp_dispatch_init(ident_t *loc, int gt
   active = !team->t.t_serialized;
   th->th.th_ident = loc;
 
+  // Any half-decent optimizer will remove this test when the blocks are empty
+  // since the macros expand to nothing
+  // when statistics are disabled.
+  if (schedule == __kmp_static) {
+    KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
+  } else {
+    KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
+  }
+
 #if KMP_USE_HIER_SCHED
   // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
   // Hierarchical scheduling does not work with ordered, so if ordered is
@@ -846,17 +855,6 @@ __kmp_dispatch_init(ident_t *loc, int gt
     }
   }
 
-  // Any half-decent optimizer will remove this test when the blocks are empty
-  // since the macros expand to nothing
-  // when statistics are disabled.
-  if (schedule == __kmp_static) {
-    KMP_COUNT_BLOCK(OMP_FOR_static);
-    KMP_COUNT_VALUE(FOR_static_iterations, pr->u.p.tc);
-  } else {
-    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
-    KMP_COUNT_VALUE(FOR_dynamic_iterations, pr->u.p.tc);
-  }
-
   if (active) {
     /* The name of this buffer should be my_buffer_index when it's free to use
      * it */
@@ -962,6 +960,7 @@ __kmp_dispatch_init(ident_t *loc, int gt
         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
   }
 #endif
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
 }
 
 /* For ordered loops, either __kmp_dispatch_finish() should be called after
@@ -1229,11 +1228,11 @@ int __kmp_dispatch_next_algorithm(int gt
           // by 1
           if (remaining > 3) {
             // steal 1/4 of remaining
-            KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
+            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
             init = (victim->u.p.ub -= (remaining >> 2));
           } else {
             // steal 1 chunk of 2 or 3 remaining
-            KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
+            KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
             init = (victim->u.p.ub -= 1);
           }
           __kmp_release_lock(lck, gtid);
@@ -1333,7 +1332,8 @@ int __kmp_dispatch_next_algorithm(int gt
                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
               // stealing succedded
-              KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub);
+              KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
+                                        vold.p.ub - vnew.p.ub);
               status = 1;
               while_index = 0;
               // now update own count and ub
@@ -1361,7 +1361,7 @@ int __kmp_dispatch_next_algorithm(int gt
       init *= chunk;
       limit = chunk + init - 1;
       incr = pr->u.p.st;
-      KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
+      KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
 
       KMP_DEBUG_ASSERT(init <= trip);
       if ((last = (limit >= trip)) != 0)
@@ -1823,6 +1823,38 @@ int __kmp_dispatch_next_algorithm(int gt
 #define OMPT_LOOP_END // no-op
 #endif
 
+#if KMP_STATS_ENABLED
+#define KMP_STATS_LOOP_END                                                     \
+  {                                                                            \
+    kmp_int64 u, l, t, i;                                                      \
+    l = (kmp_int64)(*p_lb);                                                    \
+    u = (kmp_int64)(*p_ub);                                                    \
+    i = (kmp_int64)(pr->u.p.st);                                               \
+    if (status == 0) {                                                         \
+      t = 0;                                                                   \
+      KMP_POP_PARTITIONED_TIMER();                                             \
+    } else if (i == 1) {                                                       \
+      if (u >= l)                                                              \
+        t = u - l + 1;                                                         \
+      else                                                                     \
+        t = 0;                                                                 \
+    } else if (i < 0) {                                                        \
+      if (l >= u)                                                              \
+        t = (l - u) / (-i) + 1;                                                \
+      else                                                                     \
+        t = 0;                                                                 \
+    } else {                                                                   \
+      if (u >= l)                                                              \
+        t = (u - l) / i + 1;                                                   \
+      else                                                                     \
+        t = 0;                                                                 \
+    }                                                                          \
+    KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
+  }
+#else
+#define KMP_STATS_LOOP_END /* Nothing */
+#endif
+
 template <typename T>
 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                                T *p_lb, T *p_ub,
@@ -1840,7 +1872,7 @@ static int __kmp_dispatch_next(ident_t *
   // even if the actual runtme schedule is static. (Which points out a
   // disadavantage of schedule(runtime): even when static scheduling is used it
   // costs more than a compile time choice to use static scheduling would.)
-  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
+  KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
 
   int status;
   dispatch_private_info_template<T> *pr;
@@ -1964,6 +1996,7 @@ static int __kmp_dispatch_next(ident_t *
     SSC_MARK_DISPATCH_NEXT();
 #endif
     OMPT_LOOP_END;
+    KMP_STATS_LOOP_END;
     return status;
   } else {
     kmp_int32 last = 0;
@@ -2081,6 +2114,7 @@ static int __kmp_dispatch_next(ident_t *
   SSC_MARK_DISPATCH_NEXT();
 #endif
   OMPT_LOOP_END;
+  KMP_STATS_LOOP_END;
   return status;
 }
 

Modified: openmp/trunk/runtime/src/kmp_runtime.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_runtime.cpp?rev=338276&r1=338275&r2=338276&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_runtime.cpp (original)
+++ openmp/trunk/runtime/src/kmp_runtime.cpp Mon Jul 30 10:41:08 2018
@@ -1644,12 +1644,8 @@ int __kmp_fork_call(ident_t *loc, int gt
       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                     parent_team->t.t_id, parent_team->t.t_pkfn));
 
-      {
-        KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
-        KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
-        if (!parent_team->t.t_invoke(gtid)) {
-          KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
-        }
+      if (!parent_team->t.t_invoke(gtid)) {
+        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
       }
       KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                     parent_team->t.t_id, parent_team->t.t_pkfn));
@@ -1839,11 +1835,7 @@ int __kmp_fork_call(ident_t *loc, int gt
           //     because initial code in teams should have level=0
           team->t.t_level--;
           // AC: call special invoker for outer "parallel" of teams construct
-          {
-            KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
-            KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
-            invoker(gtid);
-          }
+          invoker(gtid);
         } else {
 #endif /* OMP_40_ENABLED */
           argv = args;
@@ -2256,12 +2248,8 @@ int __kmp_fork_call(ident_t *loc, int gt
                   team->t.t_id, team->t.t_pkfn));
   } // END of timer KMP_fork_call block
 
-  {
-    KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
-    KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
-    if (!team->t.t_invoke(gtid)) {
-      KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
-    }
+  if (!team->t.t_invoke(gtid)) {
+    KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
   }
   KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                 team->t.t_id, team->t.t_pkfn));
@@ -3700,7 +3688,7 @@ int __kmp_register_root(int initial_thre
 #if KMP_STATS_ENABLED
   // Initialize stats as soon as possible (right after gtid assignment).
   __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
-  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
+  __kmp_stats_thread_ptr->startLife();
   KMP_SET_THREAD_STATE(SERIAL_REGION);
   KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
 #endif
@@ -5639,11 +5627,7 @@ void *__kmp_launch_thread(kmp_info_t *th
         }
 #endif
 
-        {
-          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
-          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
-          rc = (*pteam)->t.t_invoke(gtid);
-        }
+        rc = (*pteam)->t.t_invoke(gtid);
         KMP_ASSERT(rc);
 
         KMP_MB();

Modified: openmp/trunk/runtime/src/kmp_sched.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_sched.cpp?rev=338276&r1=338275&r2=338276&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_sched.cpp (original)
+++ openmp/trunk/runtime/src/kmp_sched.cpp Mon Jul 30 10:41:08 2018
@@ -51,8 +51,9 @@ static void __kmp_for_static_init(ident_
                                   void *codeptr
 #endif
                                   ) {
-  KMP_COUNT_BLOCK(OMP_FOR_static);
-  KMP_TIME_PARTITIONED_BLOCK(FOR_static_scheduling);
+  KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static);
+  KMP_PUSH_PARTITIONED_TIMER(OMP_loop_static_scheduling);
 
   typedef typename traits_t<T>::unsigned_t UT;
   typedef typename traits_t<T>::signed_t ST;
@@ -151,7 +152,6 @@ static void __kmp_for_static_init(ident_
           &(task_info->task_data), 0, codeptr);
     }
 #endif
-    KMP_COUNT_VALUE(FOR_static_iterations, 0);
     return;
   }
 
@@ -254,7 +254,6 @@ static void __kmp_for_static_init(ident_
                             loc);
     }
   }
-  KMP_COUNT_VALUE(FOR_static_iterations, trip_count);
 
   /* compute remaining parameters */
   switch (schedtype) {
@@ -390,6 +389,26 @@ static void __kmp_for_static_init(ident_
   }
 #endif
 
+#if KMP_STATS_ENABLED
+  {
+    kmp_int64 t;
+    kmp_int64 u = (kmp_int64)(*pupper);
+    kmp_int64 l = (kmp_int64)(*plower);
+    kmp_int64 i = (kmp_int64)incr;
+    /* compute trip count */
+    if (i == 1) {
+      t = u - l + 1;
+    } else if (i == -1) {
+      t = l - u + 1;
+    } else if (i > 0) {
+      t = (u - l) / i + 1;
+    } else {
+      t = (l - u) / (-i) + 1;
+    }
+    KMP_COUNT_VALUE(OMP_loop_static_iterations, t);
+    KMP_POP_PARTITIONED_TIMER();
+  }
+#endif
   return;
 }
 

Modified: openmp/trunk/runtime/src/kmp_stats.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_stats.cpp?rev=338276&r1=338275&r2=338276&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_stats.cpp (original)
+++ openmp/trunk/runtime/src/kmp_stats.cpp Mon Jul 30 10:41:08 2018
@@ -21,6 +21,7 @@
 #include <iomanip>
 #include <sstream>
 #include <stdlib.h> // for atexit
+#include <cmath>
 
 #define STRINGIZE2(x) #x
 #define STRINGIZE(x) STRINGIZE2(x)
@@ -67,9 +68,18 @@ static uint32_t statsPrinted = 0;
 // output interface
 static kmp_stats_output_module *__kmp_stats_global_output = NULL;
 
+double logHistogram::binMax[] = {
+    1.e1l,  1.e2l,  1.e3l,  1.e4l,  1.e5l,  1.e6l,  1.e7l,  1.e8l,
+    1.e9l,  1.e10l, 1.e11l, 1.e12l, 1.e13l, 1.e14l, 1.e15l, 1.e16l,
+    1.e17l, 1.e18l, 1.e19l, 1.e20l, 1.e21l, 1.e22l, 1.e23l, 1.e24l,
+    1.e25l, 1.e26l, 1.e27l, 1.e28l, 1.e29l, 1.e30l};
+
 /* ************* statistic member functions ************* */
 
 void statistic::addSample(double sample) {
+  sample -= offset;
+  KMP_DEBUG_ASSERT(std::isfinite(sample));
+
   double delta = sample - meanVal;
 
   sampleCount = sampleCount + 1;
@@ -78,9 +88,14 @@ void statistic::addSample(double sample)
 
   minVal = std::min(minVal, sample);
   maxVal = std::max(maxVal, sample);
+  if (collectingHist)
+    hist.addSample(sample);
 }
 
 statistic &statistic::operator+=(const statistic &other) {
+  if (other.sampleCount == 0)
+    return *this;
+
   if (sampleCount == 0) {
     *this = other;
     return *this;
@@ -104,6 +119,8 @@ statistic &statistic::operator+=(const s
   minVal = std::min(minVal, other.minVal);
   maxVal = std::max(maxVal, other.maxVal);
   sampleCount = newSampleCount;
+  if (collectingHist)
+    hist += other.hist;
 
   return *this;
 }
@@ -138,10 +155,89 @@ std::string statistic::format(char unit,
   return result;
 }
 
+/* ************* histogram member functions ************* */
+
+// Lowest bin that has anything in it
+int logHistogram::minBin() const {
+  for (int i = 0; i < numBins; i++) {
+    if (bins[i].count != 0)
+      return i - logOffset;
+  }
+  return -logOffset;
+}
+
+// Highest bin that has anything in it
+int logHistogram::maxBin() const {
+  for (int i = numBins - 1; i >= 0; i--) {
+    if (bins[i].count != 0)
+      return i - logOffset;
+  }
+  return -logOffset;
+}
+
+// Which bin does this sample belong in ?
+uint32_t logHistogram::findBin(double sample) {
+  double v = std::fabs(sample);
+  // Simply loop up looking which bin to put it in.
+  // According to a micro-architect this is likely to be faster than a binary
+  // search, since
+  // it will only have one branch mis-predict
+  for (int b = 0; b < numBins; b++)
+    if (binMax[b] > v)
+      return b;
+  fprintf(stderr,
+          "Trying to add a sample that is too large into a histogram\n");
+  KMP_ASSERT(0);
+  return -1;
+}
+
+void logHistogram::addSample(double sample) {
+  if (sample == 0.0) {
+    zeroCount += 1;
+#ifdef KMP_DEBUG
+    _total++;
+    check();
+#endif
+    return;
+  }
+  KMP_DEBUG_ASSERT(std::isfinite(sample));
+  uint32_t bin = findBin(sample);
+  KMP_DEBUG_ASSERT(0 <= bin && bin < numBins);
+
+  bins[bin].count += 1;
+  bins[bin].total += sample;
+#ifdef KMP_DEBUG
+  _total++;
+  check();
+#endif
+}
+
+// This may not be the format we want, but it'll do for now
+std::string logHistogram::format(char unit) const {
+  std::stringstream result;
+
+  result << "Bin,                Count,     Total\n";
+  if (zeroCount) {
+    result << "0,              " << formatSI(zeroCount, 9, ' ') << ", ",
+        formatSI(0.0, 9, unit);
+    if (count(minBin()) == 0)
+      return result.str();
+    result << "\n";
+  }
+  for (int i = minBin(); i <= maxBin(); i++) {
+    result << "10**" << i << "<=v<10**" << (i + 1) << ", "
+           << formatSI(count(i), 9, ' ') << ", " << formatSI(total(i), 9, unit);
+    if (i != maxBin())
+      result << "\n";
+  }
+
+  return result.str();
+}
+
 /* ************* explicitTimer member functions ************* */
 
-void explicitTimer::start(timer_e timerEnumValue) {
-  startTime = tsc_tick_count::now();
+void explicitTimer::start(tsc_tick_count tick) {
+  startTime = tick;
   totalPauseTime = 0;
   if (timeStat::logEvent(timerEnumValue)) {
     __kmp_stats_thread_ptr->incrementNestValue();
@@ -149,22 +245,19 @@ void explicitTimer::start(timer_e timerE
   return;
 }
 
-void explicitTimer::stop(timer_e timerEnumValue,
+void explicitTimer::stop(tsc_tick_count tick,
                          kmp_stats_list *stats_ptr /* = nullptr */) {
   if (startTime.getValue() == 0)
     return;
 
-  tsc_tick_count finishTime = tsc_tick_count::now();
-
-  // stat->addSample ((tsc_tick_count::now() - startTime).ticks());
-  stat->addSample(((finishTime - startTime) - totalPauseTime).ticks());
+  stat->addSample(((tick - startTime) - totalPauseTime).ticks());
 
   if (timeStat::logEvent(timerEnumValue)) {
     if (!stats_ptr)
       stats_ptr = __kmp_stats_thread_ptr;
     stats_ptr->push_event(
         startTime.getValue() - __kmp_stats_start_time.getValue(),
-        finishTime.getValue() - __kmp_stats_start_time.getValue(),
+        tick.getValue() - __kmp_stats_start_time.getValue(),
         __kmp_stats_thread_ptr->getNestValue(), timerEnumValue);
     stats_ptr->decrementNestValue();
   }
@@ -178,52 +271,67 @@ void explicitTimer::stop(timer_e timerEn
 /* ************* partitionedTimers member functions ************* */
 partitionedTimers::partitionedTimers() { timer_stack.reserve(8); }
 
-// add a timer to this collection of partitioned timers.
-void partitionedTimers::add_timer(explicit_timer_e timer_index,
-                                  explicitTimer *timer_pointer) {
-  KMP_DEBUG_ASSERT((int)timer_index < (int)EXPLICIT_TIMER_LAST + 1);
-  timers[timer_index] = timer_pointer;
-}
-
 // initialize the paritioned timers to an initial timer
-void partitionedTimers::init(timerPair init_timer_pair) {
+void partitionedTimers::init(explicitTimer timer) {
   KMP_DEBUG_ASSERT(this->timer_stack.size() == 0);
-  timer_stack.push_back(init_timer_pair);
-  timers[init_timer_pair.get_index()]->start(init_timer_pair.get_timer());
+  timer_stack.push_back(timer);
+  timer_stack.back().start(tsc_tick_count::now());
 }
 
 // stop/save the current timer, and start the new timer (timer_pair)
 // There is a special condition where if the current timer is equal to
 // the one you are trying to push, then it only manipulates the stack,
 // and it won't stop/start the currently running timer.
-void partitionedTimers::push(timerPair timer_pair) {
+void partitionedTimers::push(explicitTimer timer) {
   // get the current timer
-  // stop current timer
+  // pause current timer
   // push new timer
   // start the new timer
+  explicitTimer *current_timer, *new_timer;
+  size_t stack_size;
   KMP_DEBUG_ASSERT(this->timer_stack.size() > 0);
-  timerPair current_timer = timer_stack.back();
-  timer_stack.push_back(timer_pair);
-  if (current_timer != timer_pair) {
-    timers[current_timer.get_index()]->pause();
-    timers[timer_pair.get_index()]->start(timer_pair.get_timer());
-  }
+  timer_stack.push_back(timer);
+  stack_size = timer_stack.size();
+  current_timer = &(timer_stack[stack_size - 2]);
+  new_timer = &(timer_stack[stack_size - 1]);
+  tsc_tick_count tick = tsc_tick_count::now();
+  current_timer->pause(tick);
+  new_timer->start(tick);
 }
 
 // stop/discard the current timer, and start the previously saved timer
 void partitionedTimers::pop() {
   // get the current timer
-  // stop current timer
+  // stop current timer (record event/sample)
   // pop current timer
-  // get the new current timer and start it back up
-  KMP_DEBUG_ASSERT(this->timer_stack.size() > 1);
-  timerPair current_timer = timer_stack.back();
+  // get the new current timer and resume
+  explicitTimer *old_timer, *new_timer;
+  size_t stack_size = timer_stack.size();
+  KMP_DEBUG_ASSERT(stack_size > 1);
+  old_timer = &(timer_stack[stack_size - 1]);
+  new_timer = &(timer_stack[stack_size - 2]);
+  tsc_tick_count tick = tsc_tick_count::now();
+  old_timer->stop(tick);
+  new_timer->resume(tick);
   timer_stack.pop_back();
-  timerPair new_timer = timer_stack.back();
-  if (current_timer != new_timer) {
-    timers[current_timer.get_index()]->stop(current_timer.get_timer());
-    timers[new_timer.get_index()]->resume();
-  }
+}
+
+void partitionedTimers::exchange(explicitTimer timer) {
+  // get the current timer
+  // stop current timer (record event/sample)
+  // push new timer
+  // start the new timer
+  explicitTimer *current_timer, *new_timer;
+  size_t stack_size;
+  KMP_DEBUG_ASSERT(this->timer_stack.size() > 0);
+  tsc_tick_count tick = tsc_tick_count::now();
+  stack_size = timer_stack.size();
+  current_timer = &(timer_stack[stack_size - 1]);
+  current_timer->stop(tick);
+  timer_stack.pop_back();
+  timer_stack.push_back(timer);
+  new_timer = &(timer_stack[stack_size - 1]);
+  new_timer->start(tick);
 }
 
 // Wind up all the currently running timers.
@@ -234,10 +342,10 @@ void partitionedTimers::windup() {
   while (timer_stack.size() > 1) {
     this->pop();
   }
+  // Pop the timer from the init() call
   if (timer_stack.size() > 0) {
-    timerPair last_timer = timer_stack.back();
+    timer_stack.back().stop(tsc_tick_count::now());
     timer_stack.pop_back();
-    timers[last_timer.get_index()]->stop(last_timer.get_timer());
   }
 }
 
@@ -255,8 +363,8 @@ void kmp_stats_event_vector::deallocate(
 // event2 or zero if event1 == event2. This sorts by start time (lowest to
 // highest).
 int compare_two_events(const void *event1, const void *event2) {
-  kmp_stats_event *ev1 = (kmp_stats_event *)event1;
-  kmp_stats_event *ev2 = (kmp_stats_event *)event2;
+  const kmp_stats_event *ev1 = RCAST(const kmp_stats_event *, event1);
+  const kmp_stats_event *ev2 = RCAST(const kmp_stats_event *, event2);
 
   if (ev1->getStart() < ev2->getStart())
     return -1;
@@ -353,9 +461,93 @@ const char *kmp_stats_output_module::plo
 int kmp_stats_output_module::printPerThreadFlag = 0;
 int kmp_stats_output_module::printPerThreadEventsFlag = 0;
 
+static char const *lastName(char *name) {
+  int l = strlen(name);
+  for (int i = l - 1; i >= 0; --i) {
+    if (name[i] == '.')
+      name[i] = '_';
+    if (name[i] == '/')
+      return name + i + 1;
+  }
+  return name;
+}
+
+/* Read the name of the executable from /proc/self/cmdline */
+static char const *getImageName(char *buffer, size_t buflen) {
+  FILE *f = fopen("/proc/self/cmdline", "r");
+  buffer[0] = char(0);
+  if (!f)
+    return buffer;
+
+  // The file contains char(0) delimited words from the commandline.
+  // This just returns the last filename component of the first word on the
+  // line.
+  size_t n = fread(buffer, 1, buflen, f);
+  if (n == 0) {
+    fclose(f);
+    KMP_CHECK_SYSFAIL("fread", 1)
+  }
+  fclose(f);
+  buffer[buflen - 1] = char(0);
+  return lastName(buffer);
+}
+
+static void getTime(char *buffer, size_t buflen, bool underscores = false) {
+  time_t timer;
+
+  time(&timer);
+
+  struct tm *tm_info = localtime(&timer);
+  if (underscores)
+    strftime(buffer, buflen, "%Y-%m-%d_%H%M%S", tm_info);
+  else
+    strftime(buffer, buflen, "%Y-%m-%d %H%M%S", tm_info);
+}
+
+/* Generate a stats file name, expanding prototypes */
+static std::string generateFilename(char const *prototype,
+                                    char const *imageName) {
+  std::string res;
+
+  for (int i = 0; prototype[i] != char(0); i++) {
+    char ch = prototype[i];
+
+    if (ch == '%') {
+      i++;
+      if (prototype[i] == char(0))
+        break;
+
+      switch (prototype[i]) {
+      case 't': // Insert time and date
+      {
+        char date[26];
+        getTime(date, sizeof(date), true);
+        res += date;
+      } break;
+      case 'e': // Insert executable name
+        res += imageName;
+        break;
+      case 'p': // Insert pid
+      {
+        std::stringstream ss;
+        ss << getpid();
+        res += ss.str();
+      } break;
+      default:
+        res += prototype[i];
+        break;
+      }
+    } else
+      res += ch;
+  }
+  return res;
+}
+
 // init() is called very near the beginning of execution time in the constructor
 // of __kmp_stats_global_output
 void kmp_stats_output_module::init() {
+
+  fprintf(stderr, "*** Stats enabled OpenMP* runtime ***\n");
   char *statsFileName = getenv("KMP_STATS_FILE");
   eventsFileName = getenv("KMP_STATS_EVENTS_FILE");
   plotFileName = getenv("KMP_STATS_PLOT_FILE");
@@ -364,22 +556,10 @@ void kmp_stats_output_module::init() {
 
   // set the stats output filenames based on environment variables and defaults
   if (statsFileName) {
-    // append the process id to the output filename
-    // events.csv --> events-pid.csv
-    size_t index;
-    std::string baseFileName, pid, suffix;
-    std::stringstream ss;
-    outputFileName = std::string(statsFileName);
-    index = outputFileName.find_last_of('.');
-    if (index == std::string::npos) {
-      baseFileName = outputFileName;
-    } else {
-      baseFileName = outputFileName.substr(0, index);
-      suffix = outputFileName.substr(index);
-    }
-    ss << getpid();
-    pid = ss.str();
-    outputFileName = baseFileName + "-" + pid + suffix;
+    char imageName[1024];
+    // Process any escapes (e.g., %p, %e, %t) in the name
+    outputFileName = generateFilename(
+        statsFileName, getImageName(&imageName[0], sizeof(imageName)));
   }
   eventsFileName = eventsFileName ? eventsFileName : "events.dat";
   plotFileName = plotFileName ? plotFileName : "events.plt";
@@ -396,8 +576,6 @@ void kmp_stats_output_module::init() {
     // will clear flag so that no event will be logged
     timeStat::clearEventFlags();
   }
-
-  return;
 }
 
 void kmp_stats_output_module::setupEventColors() {
@@ -410,28 +588,43 @@ void kmp_stats_output_module::setupEvent
       globalColorIndex = (globalColorIndex + 1) % numGlobalColors;
     }
   }
-  return;
 }
 
 void kmp_stats_output_module::printTimerStats(FILE *statsOut,
                                               statistic const *theStats,
                                               statistic const *totalStats) {
-  fprintf(statsOut, "Timer,                      SampleCount,    Min,      "
-                    "Mean,       Max,     Total,        SD\n");
+  fprintf(statsOut,
+          "Timer,                             SampleCount,    Min,      "
+          "Mean,       Max,     Total,        SD\n");
   for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
     statistic const *stat = &theStats[s];
     char tag = timeStat::noUnits(s) ? ' ' : 'T';
 
-    fprintf(statsOut, "%-28s, %s\n", timeStat::name(s),
+    fprintf(statsOut, "%-35s, %s\n", timeStat::name(s),
             stat->format(tag, true).c_str());
   }
   // Also print the Total_ versions of times.
   for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
     char tag = timeStat::noUnits(s) ? ' ' : 'T';
     if (totalStats && !timeStat::noTotal(s))
-      fprintf(statsOut, "Total_%-22s, %s\n", timeStat::name(s),
+      fprintf(statsOut, "Total_%-29s, %s\n", timeStat::name(s),
               totalStats[s].format(tag, true).c_str());
   }
+
+  // Print historgram of statistics
+  if (theStats[0].haveHist()) {
+    fprintf(statsOut, "\nTimer distributions\n");
+    for (int s = 0; s < TIMER_LAST; s++) {
+      statistic const *stat = &theStats[s];
+
+      if (stat->getCount() != 0) {
+        char tag = timeStat::noUnits(timer_e(s)) ? ' ' : 'T';
+
+        fprintf(statsOut, "%s\n", timeStat::name(timer_e(s)));
+        fprintf(statsOut, "%s\n", stat->getHist()->format(tag).c_str());
+      }
+    }
+  }
 }
 
 void kmp_stats_output_module::printCounterStats(FILE *statsOut,
@@ -443,6 +636,18 @@ void kmp_stats_output_module::printCount
     fprintf(statsOut, "%-25s, %s\n", counter::name(counter_e(s)),
             stat->format(' ', true).c_str());
   }
+  // Print histogram of counters
+  if (theStats[0].haveHist()) {
+    fprintf(statsOut, "\nCounter distributions\n");
+    for (int s = 0; s < COUNTER_LAST; s++) {
+      statistic const *stat = &theStats[s];
+
+      if (stat->getCount() != 0) {
+        fprintf(statsOut, "%s\n", counter::name(counter_e(s)));
+        fprintf(statsOut, "%s\n", stat->getHist()->format(' ').c_str());
+      }
+    }
+  }
 }
 
 void kmp_stats_output_module::printCounters(FILE *statsOut,
@@ -480,9 +685,7 @@ void kmp_stats_output_module::windupExpl
   for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) {
     kmp_stats_list *ptr = *it;
     ptr->getPartitionedTimers()->windup();
-    for (int timer = 0; timer < EXPLICIT_TIMER_LAST; timer++) {
-      ptr->getExplicitTimer(explicit_timer_e(timer))->stop((timer_e)timer, ptr);
-    }
+    ptr->endLife();
   }
 }
 
@@ -547,6 +750,11 @@ void kmp_stats_output_module::printPloti
   return;
 }
 
+static void outputEnvVariable(FILE *statsOut, char const *name) {
+  char const *value = getenv(name);
+  fprintf(statsOut, "# %s = %s\n", name, value ? value : "*unspecified*");
+}
+
 /* Print some useful information about
    * the date and time this experiment ran.
    * the machine on which it ran.
@@ -570,6 +778,11 @@ void kmp_stats_output_module::printHeade
   else
     fprintf(statsOut, "# Nominal frequency: %sz\n",
             formatSI(double(__kmp_cpuinfo.frequency), 9, 'H').c_str());
+  outputEnvVariable(statsOut, "KMP_HW_SUBSET");
+  outputEnvVariable(statsOut, "KMP_AFFINITY");
+  outputEnvVariable(statsOut, "KMP_BLOCKTIME");
+  outputEnvVariable(statsOut, "KMP_LIBRARY");
+  fprintf(statsOut, "# Production runtime built " __DATE__ " " __TIME__ "\n");
 #endif
 }
 
@@ -665,7 +878,6 @@ void __kmp_reset_stats() {
   for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) {
     timeStat *timers = (*it)->getTimers();
     counter *counters = (*it)->getCounters();
-    explicitTimer *eTimers = (*it)->getExplicitTimers();
 
     for (int t = 0; t < TIMER_LAST; t++)
       timers[t].reset();
@@ -673,9 +885,6 @@ void __kmp_reset_stats() {
     for (int c = 0; c < COUNTER_LAST; c++)
       counters[c].reset();
 
-    for (int t = 0; t < EXPLICIT_TIMER_LAST; t++)
-      eTimers[t].reset();
-
     // reset the event vector so all previous events are "erased"
     (*it)->resetEventVector();
   }

Modified: openmp/trunk/runtime/src/kmp_stats.h
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_stats.h?rev=338276&r1=338275&r2=338276&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_stats.h (original)
+++ openmp/trunk/runtime/src/kmp_stats.h Mon Jul 30 10:41:08 2018
@@ -15,6 +15,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "kmp_config.h"
+#include "kmp_debug.h"
 
 #if KMP_STATS_ENABLED
 /* Statistics accumulator.
@@ -36,7 +37,10 @@
 /* Enable developer statistics here if you want them. They are more detailed
    than is useful for application characterisation and are intended for the
    runtime library developer. */
-// #define KMP_DEVELOPER_STATS 1
+#define KMP_DEVELOPER_STATS 0
+
+/* Enable/Disable histogram output */
+#define KMP_STATS_HIST 0
 
 /*!
  * @ingroup STATS_GATHERING
@@ -91,9 +95,9 @@ enum stats_state_e {
 #define KMP_FOREACH_COUNTER(macro, arg)                                        \
   macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg)   \
   macro(OMP_NESTED_PARALLEL, 0, arg)                                           \
-  macro(OMP_FOR_static, 0, arg)                                                \
-  macro(OMP_FOR_static_steal, 0, arg)                                          \
-  macro(OMP_FOR_dynamic, 0, arg)                                               \
+  macro(OMP_LOOP_STATIC, 0, arg)                                               \
+  macro(OMP_LOOP_STATIC_STEAL, 0, arg)                                         \
+  macro(OMP_LOOP_DYNAMIC, 0, arg)                                              \
   macro(OMP_DISTRIBUTE, 0, arg)                                                \
   macro(OMP_BARRIER, 0, arg)                                                   \
   macro(OMP_CRITICAL, 0, arg)                                                  \
@@ -119,7 +123,7 @@ enum stats_state_e {
  * @param arg a user defined argument to send to the user defined macro
  *
  * \details A timer collects multiple samples of some count in each thread and
- * then finally aggregates alll of the samples from all of the threads. For most
+ * then finally aggregates all of the samples from all of the threads. For most
  * timers the printing code also provides an aggregation over the thread totals.
  * These are printed as TOTAL_foo. The count is normally a time (in ticks),
  * hence the name "timer". (But can be any value, so we use this for "number of
@@ -131,55 +135,59 @@ enum stats_state_e {
  */
 // clang-format off
 #define KMP_FOREACH_TIMER(macro, arg)                                          \
-    macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)               \
-    macro (FOR_static_scheduling, 0, arg)                                      \
-    macro (FOR_dynamic_scheduling, 0, arg)                                     \
-    macro (OMP_critical, 0, arg)                                               \
-    macro (OMP_critical_wait, 0, arg)                                          \
-    macro (OMP_single, 0, arg)                                                 \
-    macro (OMP_master, 0, arg)                                                 \
-    macro (OMP_idle, stats_flags_e::logEvent, arg)                             \
-    macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                    \
-    macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                     \
-    macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                     \
-    macro (OMP_parallel, stats_flags_e::logEvent, arg)                         \
-    macro (OMP_task_immediate, 0, arg)                                         \
-    macro (OMP_task_taskwait, 0, arg)                                          \
-    macro (OMP_task_taskyield, 0, arg)                                         \
-    macro (OMP_task_taskgroup, 0, arg)                                         \
-    macro (OMP_task_join_bar, 0, arg)                                          \
-    macro (OMP_task_plain_bar, 0, arg)                                         \
-    macro (OMP_serial, stats_flags_e::logEvent, arg)                           \
-    macro (OMP_taskloop_scheduling, 0, arg)                                    \
-    macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,\
-           arg)                                                                \
-    macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \
-           arg)                                                                \
-    macro (FOR_static_iterations,                                              \
-           stats_flags_e::noUnits | stats_flags_e::noTotal, arg)               \
-    macro (FOR_dynamic_iterations,                                             \
-           stats_flags_e::noUnits | stats_flags_e::noTotal, arg)               \
-    macro (FOR_static_steal_stolen,                                            \
-           stats_flags_e::noUnits | stats_flags_e::noTotal, arg)               \
-    macro (FOR_static_steal_chunks,                                            \
-           stats_flags_e::noUnits | stats_flags_e::noTotal, arg)               \
-    KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
+  macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg)                 \
+  macro (OMP_parallel, stats_flags_e::logEvent, arg)                           \
+  macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg)                  \
+  macro (OMP_loop_static, 0, arg)                                              \
+  macro (OMP_loop_static_scheduling, 0, arg)                                   \
+  macro (OMP_loop_dynamic, 0, arg)                                             \
+  macro (OMP_loop_dynamic_scheduling, 0, arg)                                  \
+  macro (OMP_critical, 0, arg)                                                 \
+  macro (OMP_critical_wait, 0, arg)                                            \
+  macro (OMP_single, 0, arg)                                                   \
+  macro (OMP_master, 0, arg)                                                   \
+  macro (OMP_task_immediate, 0, arg)                                           \
+  macro (OMP_task_taskwait, 0, arg)                                            \
+  macro (OMP_task_taskyield, 0, arg)                                           \
+  macro (OMP_task_taskgroup, 0, arg)                                           \
+  macro (OMP_task_join_bar, 0, arg)                                            \
+  macro (OMP_task_plain_bar, 0, arg)                                           \
+  macro (OMP_taskloop_scheduling, 0, arg)                                      \
+  macro (OMP_plain_barrier, stats_flags_e::logEvent, arg)                      \
+  macro (OMP_idle, stats_flags_e::logEvent, arg)                               \
+  macro (OMP_fork_barrier, stats_flags_e::logEvent, arg)                       \
+  macro (OMP_join_barrier, stats_flags_e::logEvent, arg)                       \
+  macro (OMP_serial, stats_flags_e::logEvent, arg)                             \
+  macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal,  \
+         arg)                                                                  \
+  macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal,   \
+         arg)                                                                  \
+  macro (OMP_loop_static_iterations,                                           \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (OMP_loop_dynamic_iterations,                                          \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
 // clang-format on
 
-// OMP_start_end          -- Time from when OpenMP is initialized until the
-//                           stats are printed at exit
-// OMP_serial             -- Thread zero time executing serial code
-// OMP_work               -- Elapsed time in code dispatched by a fork (measured
-//                           in the thread)
-// OMP_barrier            -- Time at "real" barriers (includes task time)
-// FOR_static_scheduling  -- Time spent doing scheduling for a static "for"
-// FOR_dynamic_scheduling -- Time spent doing scheduling for a dynamic "for"
-// OMP_idle               -- Worker threads time spent waiting for inclusion in
-//                           a parallel region
-// OMP_plain_barrier      -- Time spent in a barrier construct
-// OMP_fork_join_barrier  -- Time spent in a the fork-join barrier surrounding a
-//                           parallel region
-// OMP_parallel           -- Time spent inside a parallel construct
+// OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either
+//                           initializing OpenMP or being created by a master)
+//                           until the thread is destroyed
+// OMP_parallel           -- Time thread spends executing work directly
+//                           within a #pragma omp parallel
+// OMP_parallel_overhead  -- Time thread spends setting up a parallel region
+// OMP_loop_static        -- Time thread spends executing loop iterations from
+//                           a statically scheduled loop
+// OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations
+//                               from a statically scheduled loop
+// OMP_loop_dynamic       -- Time thread spends executing loop iterations from
+//                           a dynamically scheduled loop
+// OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations
+//                                from a dynamically scheduled loop
+// OMP_critical           -- Time thread spends executing critical section
+// OMP_critical_wait      -- Time thread spends waiting to enter
+//                           a critcal seciton
+// OMP_single             -- Time spent executing a "single" region
+// OMP_master             -- Time spent executing a "master" region
 // OMP_task_immediate     -- Time spent executing non-deferred tasks
 // OMP_task_taskwait      -- Time spent executing tasks inside a taskwait
 //                           construct
@@ -190,17 +198,24 @@ enum stats_state_e {
 // OMP_task_join_bar      -- Time spent executing tasks inside a join barrier
 // OMP_task_plain_bar     -- Time spent executing tasks inside a barrier
 //                           construct
-// OMP_single             -- Time spent executing a "single" region
-// OMP_master             -- Time spent executing a "master" region
+// OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop
+//                            construct
+// OMP_plain_barrier      -- Time spent in a #pragma omp barrier construct or
+//                           inside implicit barrier at end of worksharing
+//                           construct
+// OMP_idle               -- Time worker threads spend waiting for next
+//                           parallel region
+// OMP_fork_barrier       -- Time spent in a the fork barrier surrounding a
+//                           parallel region
+// OMP_join_barrier       -- Time spent in a the join barrier surrounding a
+//                           parallel region
+// OMP_serial             -- Time thread zero spends executing serial code
 // OMP_set_numthreads     -- Values passed to omp_set_num_threads
 // OMP_PARALLEL_args      -- Number of arguments passed to a parallel region
-// FOR_static_iterations  -- Number of available parallel chunks of work in a
-//                           static for
-// FOR_dynamic_iterations -- Number of available parallel chunks of work in a
-//                           dynamic for
-//                           Both adjust for any chunking, so if there were an
-//                           iteration count of 20 but a chunk size of 10, we'd
-//                           record 2.
+// OMP_loop_static_iterations -- Number of iterations thread is assigned for
+//                               statically scheduled loops
+// OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for
+//                                dynamically scheduled loops
 
 #if (KMP_DEVELOPER_STATS)
 // Timers which are of interest to runtime library developers, not end users.
@@ -235,7 +250,11 @@ enum stats_state_e {
   macro(USER_suspend, 0, arg)                                                  \
   macro(KMP_allocate_team, 0, arg)                                             \
   macro(KMP_setup_icv_copy, 0, arg)                                            \
-  macro(USER_icv_copy, 0, arg)
+  macro(USER_icv_copy, 0, arg)                                                 \
+  macro (FOR_static_steal_stolen,                                              \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)                 \
+  macro (FOR_static_steal_chunks,                                              \
+         stats_flags_e::noUnits | stats_flags_e::noTotal, arg)
 #else
 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg)
 #endif
@@ -272,53 +291,143 @@ enum explicit_timer_e {
 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST };
 #undef ENUMERATE
 
-class timerPair {
-  explicit_timer_e timer_index;
-  timer_e timer;
+/*
+ * A logarithmic histogram. It accumulates the number of values in each power of
+ * ten bin.  So 1<=x<10, 10<=x<100, ...
+ * Mostly useful where we have some big outliers and want to see information
+ * about them.
+ */
+class logHistogram {
+  enum {
+    numBins = 31, /* Number of powers of 10. If this changes you need to change
+                   * the initializer for binMax */
+
+    /*
+     * If you want to use this to analyse values that may be less than 1, (for
+     * instance times in s), then the logOffset gives you negative powers.
+     * In our case here, we're just looking at times in ticks, or counts, so we
+     * can never see values with magnitude < 1 (other than zero), so we can set
+     * it to 0.  As above change the initializer if you change this.
+     */
+    logOffset = 0
+  };
+  uint32_t KMP_ALIGN_CACHE zeroCount;
+  struct {
+    uint32_t count;
+    double total;
+  } bins[numBins];
+
+  static double binMax[numBins];
+
+#ifdef KMP_DEBUG
+  uint64_t _total;
+
+  void check() const {
+    uint64_t t = zeroCount;
+    for (int i = 0; i < numBins; i++)
+      t += bins[i].count;
+    KMP_DEBUG_ASSERT(t == _total);
+  }
+#else
+  void check() const {}
+#endif
 
 public:
-  timerPair(explicit_timer_e ti, timer_e t) : timer_index(ti), timer(t) {}
-  inline explicit_timer_e get_index() const { return timer_index; }
-  inline timer_e get_timer() const { return timer; }
-  bool operator==(const timerPair &rhs) {
-    return this->get_index() == rhs.get_index();
+  logHistogram() { reset(); }
+
+  logHistogram(logHistogram const &o) {
+    for (int i = 0; i < numBins; i++)
+      bins[i] = o.bins[i];
+#ifdef KMP_DEBUG
+    _total = o._total;
+#endif
   }
-  bool operator!=(const timerPair &rhs) { return !(*this == rhs); }
+
+  void reset() {
+    zeroCount = 0;
+    for (int i = 0; i < numBins; i++) {
+      bins[i].count = 0;
+      bins[i].total = 0;
+    }
+
+#ifdef KMP_DEBUG
+    _total = 0;
+#endif
+  }
+  uint32_t count(int b) const { return bins[b + logOffset].count; }
+  double total(int b) const { return bins[b + logOffset].total; }
+  static uint32_t findBin(double sample);
+
+  logHistogram &operator+=(logHistogram const &o) {
+    zeroCount += o.zeroCount;
+    for (int i = 0; i < numBins; i++) {
+      bins[i].count += o.bins[i].count;
+      bins[i].total += o.bins[i].total;
+    }
+#ifdef KMP_DEBUG
+    _total += o._total;
+    check();
+#endif
+
+    return *this;
+  }
+
+  void addSample(double sample);
+  int minBin() const;
+  int maxBin() const;
+
+  std::string format(char) const;
 };
 
 class statistic {
-  double minVal;
+  double KMP_ALIGN_CACHE minVal;
   double maxVal;
   double meanVal;
   double m2;
   uint64_t sampleCount;
+  double offset;
+  bool collectingHist;
+  logHistogram hist;
 
 public:
-  statistic() { reset(); }
+  statistic(bool doHist = bool(KMP_STATS_HIST)) {
+    reset();
+    collectingHist = doHist;
+  }
   statistic(statistic const &o)
       : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2),
-        sampleCount(o.sampleCount) {}
-
+        sampleCount(o.sampleCount), offset(o.offset),
+        collectingHist(o.collectingHist), hist(o.hist) {}
+  statistic(double minv, double maxv, double meanv, uint64_t sc, double sd)
+      : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc),
+        sampleCount(sc), offset(0.0), collectingHist(false) {}
+  bool haveHist() const { return collectingHist; }
   double getMin() const { return minVal; }
   double getMean() const { return meanVal; }
   double getMax() const { return maxVal; }
   uint64_t getCount() const { return sampleCount; }
   double getSD() const { return sqrt(m2 / sampleCount); }
   double getTotal() const { return sampleCount * meanVal; }
+  logHistogram const *getHist() const { return &hist; }
+  void setOffset(double d) { offset = d; }
 
   void reset() {
     minVal = std::numeric_limits<double>::max();
-    maxVal = -std::numeric_limits<double>::max();
+    maxVal = -minVal;
     meanVal = 0.0;
     m2 = 0.0;
     sampleCount = 0;
+    offset = 0.0;
+    hist.reset();
   }
   void addSample(double sample);
   void scale(double factor);
   void scaleDown(double f) { scale(1. / f); }
+  void forceCount(uint64_t count) { sampleCount = count; }
   statistic &operator+=(statistic const &other);
 
   std::string format(char unit, bool total = false) const;
+  std::string formatHist(char unit) const { return hist.format(unit); }
 };
 
 struct statInfo {
@@ -359,39 +468,29 @@ public:
 // to live on the stack of the thread, they're more work to use.
 class explicitTimer {
   timeStat *stat;
+  timer_e timerEnumValue;
   tsc_tick_count startTime;
   tsc_tick_count pauseStartTime;
   tsc_tick_count::tsc_interval_t totalPauseTime;
 
 public:
-  explicitTimer()
-      : stat(0), startTime(0), pauseStartTime(0), totalPauseTime() {}
-  explicitTimer(timeStat *s)
-      : stat(s), startTime(), pauseStartTime(0), totalPauseTime() {}
-
-  void setStat(timeStat *s) { stat = s; }
-  void start(timer_e timerEnumValue);
-  void pause() { pauseStartTime = tsc_tick_count::now(); }
-  void resume() { totalPauseTime += (tsc_tick_count::now() - pauseStartTime); }
-  void stop(timer_e timerEnumValue, kmp_stats_list *stats_ptr = nullptr);
+  explicitTimer(timeStat *s, timer_e te)
+      : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0),
+        totalPauseTime() {}
+
+  // void setStat(timeStat *s) { stat = s; }
+  void start(tsc_tick_count tick);
+  void pause(tsc_tick_count tick) { pauseStartTime = tick; }
+  void resume(tsc_tick_count tick) {
+    totalPauseTime += (tick - pauseStartTime);
+  }
+  void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr);
   void reset() {
     startTime = 0;
     pauseStartTime = 0;
     totalPauseTime = 0;
   }
-};
-
-// Where all you need is to time a block, this is enough.
-// (It avoids the need to have an explicit end, leaving the scope suffices.)
-class blockTimer : public explicitTimer {
-  timer_e timerEnumValue;
-
-public:
-  blockTimer(timeStat *s, timer_e newTimerEnumValue)
-      : timerEnumValue(newTimerEnumValue), explicitTimer(s) {
-    start(timerEnumValue);
-  }
-  ~blockTimer() { stop(timerEnumValue); }
+  timer_e get_type() const { return timerEnumValue; }
 };
 
 // Where you need to partition a threads clock ticks into separate states
@@ -402,14 +501,13 @@ public:
 // versa
 class partitionedTimers {
 private:
-  explicitTimer *timers[EXPLICIT_TIMER_LAST + 1];
-  std::vector<timerPair> timer_stack;
+  std::vector<explicitTimer> timer_stack;
 
 public:
   partitionedTimers();
-  void add_timer(explicit_timer_e timer_index, explicitTimer *timer_pointer);
-  void init(timerPair timer_index);
-  void push(timerPair timer_index);
+  void init(explicitTimer timer);
+  void exchange(explicitTimer timer);
+  void push(explicitTimer timer);
   void pop();
   void windup();
 };
@@ -418,12 +516,11 @@ public:
 // It avoids the need to have an explicit end, leaving the scope suffices.
 class blockPartitionedTimer {
   partitionedTimers *part_timers;
-  timerPair timer_pair;
 
 public:
-  blockPartitionedTimer(partitionedTimers *pt, timerPair tp)
-      : part_timers(pt), timer_pair(tp) {
-    part_timers->push(timer_pair);
+  blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer)
+      : part_timers(pt) {
+    part_timers->push(timer);
   }
   ~blockPartitionedTimer() { part_timers->pop(); }
 };
@@ -609,7 +706,7 @@ class kmp_stats_list {
   int gtid;
   timeStat _timers[TIMER_LAST + 1];
   counter _counters[COUNTER_LAST + 1];
-  explicitTimer _explicitTimers[EXPLICIT_TIMER_LAST + 1];
+  explicitTimer thread_life_timer;
   partitionedTimers _partitionedTimers;
   int _nestLevel; // one per thread
   kmp_stats_event_vector _event_vector;
@@ -620,28 +717,21 @@ class kmp_stats_list {
 
 public:
   kmp_stats_list()
-      : _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
-        thread_is_idle_flag(0) {
-#define doInit(name, ignore1, ignore2)                                         \
-  getExplicitTimer(EXPLICIT_TIMER_##name)->setStat(getTimer(TIMER_##name));    \
-  _partitionedTimers.add_timer(EXPLICIT_TIMER_##name,                          \
-                               getExplicitTimer(EXPLICIT_TIMER_##name));
-    KMP_FOREACH_EXPLICIT_TIMER(doInit, 0);
-#undef doInit
-  }
+      : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life],
+                          TIMER_OMP_worker_thread_life),
+        _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE),
+        thread_is_idle_flag(0) {}
   ~kmp_stats_list() {}
   inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; }
   inline counter *getCounter(counter_e idx) { return &_counters[idx]; }
-  inline explicitTimer *getExplicitTimer(explicit_timer_e idx) {
-    return &_explicitTimers[idx];
-  }
   inline partitionedTimers *getPartitionedTimers() {
     return &_partitionedTimers;
   }
   inline timeStat *getTimers() { return _timers; }
   inline counter *getCounters() { return _counters; }
-  inline explicitTimer *getExplicitTimers() { return _explicitTimers; }
   inline kmp_stats_event_vector &getEventVector() { return _event_vector; }
+  inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); }
+  inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); }
   inline void resetEventVector() { _event_vector.reset(); }
   inline void incrementNestValue() { _nestLevel++; }
   inline int getNestValue() { return _nestLevel; }
@@ -774,22 +864,6 @@ extern kmp_stats_output_module __kmp_sta
 // Simple, standard interfaces that drop out completely if stats aren't enabled
 
 /*!
- * \brief Uses specified timer (name) to time code block.
- *
- * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
- *
- * \details Use KMP_TIME_BLOCK(name) macro to time a code block.  This will
- * record the time taken in the block and use the destructor to stop the timer.
- * Convenient! With this definition you can't have more than one KMP_TIME_BLOCK
- * in the same code block. I don't think that's a problem.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_TIME_BLOCK(name)                                                   \
-  blockTimer __BLOCKTIME__(__kmp_stats_thread_ptr->getTimer(TIMER_##name),     \
-                           TIMER_##name)
-
-/*!
  * \brief Adds value to specified timer (name).
  *
  * @param name timer name as specified under the KMP_FOREACH_TIMER() macro
@@ -817,44 +891,6 @@ extern kmp_stats_output_module __kmp_sta
   __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment()
 
 /*!
- * \brief "Starts" an explicit timer which will need a corresponding
- * KMP_STOP_EXPLICIT_TIMER() macro.
- *
- * @param name explicit timer name as specified under the
- * KMP_FOREACH_EXPLICIT_TIMER() macro
- *
- * \details Use to start a timer.  This will need a corresponding
- * KMP_STOP_EXPLICIT_TIMER() macro to stop the timer unlike the
- * KMP_TIME_BLOCK(name) macro which has an implicit stopping macro at the end
- * of the code block.  All explicit timers are stopped at library exit time
- * before the final statistics are outputted.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_START_EXPLICIT_TIMER(name)                                         \
-  __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name)              \
-      ->start(TIMER_##name)
-
-/*!
- * \brief "Stops" an explicit timer.
- *
- * @param name explicit timer name as specified under the
- * KMP_FOREACH_EXPLICIT_TIMER() macro
- *
- * \details Use KMP_STOP_EXPLICIT_TIMER(name) to stop a timer.  When this is
- * done, the time between the last KMP_START_EXPLICIT_TIMER(name) and this
- * KMP_STOP_EXPLICIT_TIMER(name) will be added to the timer's stat value. The
- * timer will then be reset. After the KMP_STOP_EXPLICIT_TIMER(name) macro is
- * called, another call to KMP_START_EXPLICIT_TIMER(name) will start the timer
- * once again.
- *
- * @ingroup STATS_GATHERING
-*/
-#define KMP_STOP_EXPLICIT_TIMER(name)                                          \
-  __kmp_stats_thread_ptr->getExplicitTimer(EXPLICIT_TIMER_##name)              \
-      ->stop(TIMER_##name)
-
-/*!
  * \brief Outputs the current thread statistics and reset them.
  *
  * @param heading_string heading put above the final stats output
@@ -881,21 +917,26 @@ extern kmp_stats_output_module __kmp_sta
  * @ingroup STATS_GATHERING
 */
 #define KMP_INIT_PARTITIONED_TIMERS(name)                                      \
-  __kmp_stats_thread_ptr->getPartitionedTimers()->init(                        \
-      timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+  __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer(          \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
 
 #define KMP_TIME_PARTITIONED_BLOCK(name)                                       \
   blockPartitionedTimer __PBLOCKTIME__(                                        \
       __kmp_stats_thread_ptr->getPartitionedTimers(),                          \
-      timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+      explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name),            \
+                    TIMER_##name))
 
 #define KMP_PUSH_PARTITIONED_TIMER(name)                                       \
-  __kmp_stats_thread_ptr->getPartitionedTimers()->push(                        \
-      timerPair(EXPLICIT_TIMER_##name, TIMER_##name))
+  __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer(          \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
 
 #define KMP_POP_PARTITIONED_TIMER()                                            \
   __kmp_stats_thread_ptr->getPartitionedTimers()->pop()
 
+#define KMP_EXCHANGE_PARTITIONED_TIMER(name)                                   \
+  __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer(      \
+      __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name))
+
 #define KMP_SET_THREAD_STATE(state_name)                                       \
   __kmp_stats_thread_ptr->setState(state_name)
 

Modified: openmp/trunk/runtime/src/kmp_stats_timing.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/kmp_stats_timing.cpp?rev=338276&r1=338275&r2=338276&view=diff
==============================================================================
--- openmp/trunk/runtime/src/kmp_stats_timing.cpp (original)
+++ openmp/trunk/runtime/src/kmp_stats_timing.cpp Mon Jul 30 10:41:08 2018
@@ -95,10 +95,11 @@ std::string formatSI(double interval, in
     static struct {
       double scale;
       char prefix;
-    } ranges[] = {{1.e12, 'f'},  {1.e9, 'p'},   {1.e6, 'n'},   {1.e3, 'u'},
-                  {1.0, 'm'},    {1.e-3, ' '},  {1.e-6, 'k'},  {1.e-9, 'M'},
-                  {1.e-12, 'G'}, {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'},
-                  {1.e-24, 'Z'}, {1.e-27, 'Y'}};
+    } ranges[] = {{1.e21, 'y'},  {1.e18, 'z'},  {1.e15, 'a'},  {1.e12, 'f'},
+                  {1.e9, 'p'},   {1.e6, 'n'},   {1.e3, 'u'},   {1.0, 'm'},
+                  {1.e-3, ' '},  {1.e-6, 'k'},  {1.e-9, 'M'},  {1.e-12, 'G'},
+                  {1.e-15, 'T'}, {1.e-18, 'P'}, {1.e-21, 'E'}, {1.e-24, 'Z'},
+                  {1.e-27, 'Y'}};
 
     if (interval == 0.0) {
       os << std::setw(width - 3) << std::right << "0.00" << std::setw(3)

Modified: openmp/trunk/runtime/src/z_Linux_util.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/z_Linux_util.cpp?rev=338276&r1=338275&r2=338276&view=diff
==============================================================================
--- openmp/trunk/runtime/src/z_Linux_util.cpp (original)
+++ openmp/trunk/runtime/src/z_Linux_util.cpp Mon Jul 30 10:41:08 2018
@@ -510,7 +510,7 @@ static void *__kmp_launch_worker(void *t
 #if KMP_STATS_ENABLED
   // set thread local index to point to thread-specific stats
   __kmp_stats_thread_ptr = ((kmp_info_t *)thr)->th.th_stats;
-  KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
+  __kmp_stats_thread_ptr->startLife();
   KMP_SET_THREAD_STATE(IDLE);
   KMP_INIT_PARTITIONED_TIMERS(OMP_idle);
 #endif




More information about the Openmp-commits mailing list