[compiler-rt] cde307e - [scudo] Fine tune busy-waiting in HybridMutex

Thu Sep 21 14:03:41 PDT 2023

Author: Chia-hung Duan
Date: 2023-09-21T21:00:47Z
New Revision: cde307e4657738f1ef5f2eceaa7f896358acb3d6

URL: https://github.com/llvm/llvm-project/commit/cde307e4657738f1ef5f2eceaa7f896358acb3d6
DIFF: https://github.com/llvm/llvm-project/commit/cde307e4657738f1ef5f2eceaa7f896358acb3d6.diff

LOG: [scudo] Fine tune busy-waiting in HybridMutex

Instead of using hardware-specific instructions, using a simple loop over a
volatile variable gives a similar and more predictable waiting time. Also
fine-tune the waiting time to fit the average time spent in malloc/free
operations.

Reviewed By: cferris

Differential Revision: https://reviews.llvm.org/D156951

Added: 
    

Modified: 
    compiler-rt/lib/scudo/standalone/common.h
    compiler-rt/lib/scudo/standalone/mutex.h

Removed: 
    


################################################################################
diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h
index db5f20f7acdc49a..d0f429cfcb7a08e 100644

--- a/compiler-rt/lib/scudo/standalone/common.h
+++ b/compiler-rt/lib/scudo/standalone/common.h
@@ -112,21 +112,6 @@ template <typename T> inline void shuffle(T *A, u32 N, u32 *RandState) {
   *RandState = State;
 }
 
-// Hardware specific inlinable functions.
-
-inline void yieldProcessor(UNUSED u8 Count) {
-#if defined(__i386__) || defined(__x86_64__)
-  __asm__ __volatile__("" ::: "memory");
-  for (u8 I = 0; I < Count; I++)
-    __asm__ __volatile__("pause");
-#elif defined(__aarch64__) || defined(__arm__)
-  __asm__ __volatile__("" ::: "memory");
-  for (u8 I = 0; I < Count; I++)
-    __asm__ __volatile__("yield");
-#endif
-  __asm__ __volatile__("" ::: "memory");
-}
-
 // Platform specific functions.
 
 extern uptr PageSizeCached;

diff --git a/compiler-rt/lib/scudo/standalone/mutex.h b/compiler-rt/lib/scudo/standalone/mutex.h
index 05340de3e12d7bc..38108397b1654bb 100644
--- a/compiler-rt/lib/scudo/standalone/mutex.h
+++ b/compiler-rt/lib/scudo/standalone/mutex.h
@@ -35,7 +35,7 @@ class CAPABILITY("mutex") HybridMutex {
 #pragma nounroll
 #endif
     for (u8 I = 0U; I < NumberOfTries; I++) {
-      yieldProcessor(NumberOfYields);
+      delayLoop();
       if (tryLock())
         return;
     }
@@ -53,10 +53,21 @@ class CAPABILITY("mutex") HybridMutex {
   }
 
 private:
+  void delayLoop() {
+    // The value comes from the average time spent in accessing caches (which
+    // are the fastest operations) so that we are unlikely to wait too long for
+    // fast operations.
+    constexpr u32 SpinTimes = 16;
+    volatile u32 V = 0;
+    for (u32 I = 0; I < SpinTimes; ++I)
+      ++V;
+  }
+
   void assertHeldImpl();
 
-  static constexpr u8 NumberOfTries = 8U;
-  static constexpr u8 NumberOfYields = 8U;
+  // TODO(chiahungduan): Adapt this value based on scenarios. E.g., primary and
+  // secondary allocator have different allocation times.
+  static constexpr u8 NumberOfTries = 32U;
 
 #if SCUDO_LINUX
   atomic_u32 M = {};