[Openmp-commits] [openmp] r357722 - [OpenMP] Fix hang on Windows

Jonathan Peyton via Openmp-commits openmp-commits at lists.llvm.org
Thu Apr 4 13:35:29 PDT 2019


Author: jlpeyton
Date: Thu Apr  4 13:35:29 2019
New Revision: 357722

URL: http://llvm.org/viewvc/llvm-project?rev=357722&view=rev
Log:
[OpenMP] Fix hang on Windows

Debug dump on large machine shows when many OpenMP threads (401 in total)
sleep on a barrier, one of the innermost nesting levels sleeps
on a child's b_arrived flag whose value is equal to 4 and is equal to
checker value. i.e., (1) sleep bit is 0, and (2) done_check() would
return true if called.

It is unclear how this might happen. It could be Windows Server 2016's
error of EnterCriticalSection / LeaveCriticalSection, or
error of WaitForSingleObject / SetEvent / ResetEvent, or
error in the library which is very difficult to find.

As a workaround, change INFINITE wait to timed wait, so that each
thread awakens each 5 seconds (the timeout was chosen arbitrary to not
disturb other threads much), check flag condition under the lock, and
either go to sleep again or stop sleeping as a result of the check.

Patch by Andrey Churbanov

Differential Revision: https://reviews.llvm.org/D59793

Modified:
    openmp/trunk/runtime/src/z_Windows_NT_util.cpp

Modified: openmp/trunk/runtime/src/z_Windows_NT_util.cpp
URL: http://llvm.org/viewvc/llvm-project/openmp/trunk/runtime/src/z_Windows_NT_util.cpp?rev=357722&r1=357721&r2=357722&view=diff
==============================================================================
--- openmp/trunk/runtime/src/z_Windows_NT_util.cpp (original)
+++ openmp/trunk/runtime/src/z_Windows_NT_util.cpp Thu Apr  4 13:35:29 2019
@@ -193,8 +193,9 @@ void __kmp_win32_cond_destroy(kmp_win32_
 /* TODO associate cv with a team instead of a thread so as to optimize
    the case where we wake up a whole team */
 
-void __kmp_win32_cond_wait(kmp_win32_cond_t *cv, kmp_win32_mutex_t *mx,
-                           kmp_info_t *th, int need_decrease_load) {
+template <class C>
+static void __kmp_win32_cond_wait(kmp_win32_cond_t *cv, kmp_win32_mutex_t *mx,
+                                  kmp_info_t *th, C *flag) {
   int my_generation;
   int last_waiter;
 
@@ -211,21 +212,46 @@ void __kmp_win32_cond_wait(kmp_win32_con
   __kmp_win32_mutex_unlock(mx);
 
   for (;;) {
-    int wait_done;
-
+    int wait_done = 0;
+    DWORD res, timeout = 5000; // just tried to quess an appropriate number
     /* Wait until the event is signaled */
-    WaitForSingleObject(cv->event_, INFINITE);
-
-    __kmp_win32_mutex_lock(&cv->waiters_count_lock_);
+    res = WaitForSingleObject(cv->event_, timeout);
 
-    /* Exit the loop when the <cv->event_> is signaled and there are still
-       waiting threads from this <wait_generation> that haven't been released
-       from this wait yet. */
-    wait_done = (cv->release_count_ > 0) &&
-                (cv->wait_generation_count_ != my_generation);
-
-    __kmp_win32_mutex_unlock(&cv->waiters_count_lock_);
+    if (res == WAIT_OBJECT_0) {
+      // event signaled
+      __kmp_win32_mutex_lock(&cv->waiters_count_lock_);
+      /* Exit the loop when the <cv->event_> is signaled and there are still
+         waiting threads from this <wait_generation> that haven't been released
+         from this wait yet. */
+      wait_done = (cv->release_count_ > 0) &&
+                  (cv->wait_generation_count_ != my_generation);
+      __kmp_win32_mutex_unlock(&cv->waiters_count_lock_);
+    } else if (res == WAIT_TIMEOUT || res == WAIT_FAILED) {
+      // check if the flag and cv counters are in consistent state
+      // as MS sent us debug dump whith inconsistent state of data
+      __kmp_win32_mutex_lock(mx);
+      typename C::flag_t old_f = flag->set_sleeping();
+      if (!flag->done_check_val(old_f & ~KMP_BARRIER_SLEEP_STATE)) {
+        __kmp_win32_mutex_unlock(mx);
+        continue;
+      }
+      // condition fulfilled, exiting
+      old_f = flag->unset_sleeping();
+      KMP_DEBUG_ASSERT(old_f & KMP_BARRIER_SLEEP_STATE);
+      TCW_PTR(th->th.th_sleep_loc, NULL);
+      KF_TRACE(50, ("__kmp_win32_cond_wait: exiting, condition "
+                    "fulfilled: flag's loc(%p): %u => %u\n",
+                    flag->get(), old_f, *(flag->get())));
+
+      __kmp_win32_mutex_lock(&cv->waiters_count_lock_);
+      KMP_DEBUG_ASSERT(cv->waiters_count_ > 0);
+      cv->release_count_ = cv->waiters_count_;
+      cv->wait_generation_count_++;
+      wait_done = 1;
+      __kmp_win32_mutex_unlock(&cv->waiters_count_lock_);
 
+      __kmp_win32_mutex_unlock(mx);
+    }
     /* there used to be a semicolon after the if statement, it looked like a
        bug, so i removed it */
     if (wait_done)
@@ -377,12 +403,11 @@ static inline void __kmp_suspend_templat
           KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0);
         }
         deactivated = TRUE;
-
-        __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, 0,
-                              0);
+        __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, th,
+                              flag);
       } else {
-        __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, 0,
-                              0);
+        __kmp_win32_cond_wait(&th->th.th_suspend_cv, &th->th.th_suspend_mx, th,
+                              flag);
       }
 
 #ifdef KMP_DEBUG




More information about the Openmp-commits mailing list