[compiler-rt] 46240c3 - [scudo][standalone] Minor optimization & improvements

Kostya Kortchinsky via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 21 10:05:53 PST 2019


Author: Kostya Kortchinsky
Date: 2019-11-21T10:05:39-08:00
New Revision: 46240c38721fe9919f9c63277bec7bbf3e62073b

URL: https://github.com/llvm/llvm-project/commit/46240c38721fe9919f9c63277bec7bbf3e62073b
DIFF: https://github.com/llvm/llvm-project/commit/46240c38721fe9919f9c63277bec7bbf3e62073b.diff

LOG: [scudo][standalone] Minor optimization & improvements

Summary:
A few small improvements and optimizations:
- when refilling the free list, push back the last batch and return
  the front one: this allows to keep the allocations towards the front
  of the region;
- instead of using 48 entries in the shuffle array, use a multiple of
  `MaxNumCached`;
- make the maximum number of batches to create on refill a constant;
  ultimately it should be configurable, but that's for later;
- `initCache` doesn't need to zero out the cache, it's already done.
- it turns out that when using `||` or `&&`, the compiler is adamant
  about adding a short circuit for every part of the expression, which
  ends up making somewhat annoying asm with lots of test and
  conditional jump. I am changing that to bitwise `|` or `&` in two
  places so that the generated code looks better. Added comments since
  it might feel weird to people.

This yields some small performance gains overall, nothing drastic
though.

Reviewers: hctim, morehouse, cferris, eugenis

Subscribers: #sanitizers, llvm-commits

Tags: #sanitizers, #llvm

Differential Revision: https://reviews.llvm.org/D70452

Added: 
    

Modified: 
    compiler-rt/lib/scudo/standalone/combined.h
    compiler-rt/lib/scudo/standalone/primary32.h
    compiler-rt/lib/scudo/standalone/primary64.h

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index f4fa5d4b99ad..0a05857a20d6 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -144,7 +144,10 @@ template <class Params> class Allocator {
 
   TSDRegistryT *getTSDRegistry() { return &TSDRegistry; }
 
-  void initCache(CacheT *Cache) { Cache->init(&Stats, &Primary); }
+  // The Cache must be provided zero-initialized.
+  void initCache(CacheT *Cache) {
+    Cache->initLinkerInitialized(&Stats, &Primary);
+  }
 
   // Release the resources used by a TSD, which involves:
   // - draining the local quarantine cache to the global quarantine;
@@ -161,7 +164,7 @@ template <class Params> class Allocator {
                           uptr Alignment = MinAlignment,
                           bool ZeroContents = false) {
     initThreadMaybe();
-    ZeroContents = ZeroContents || Options.ZeroContents;
+    ZeroContents |= static_cast<bool>(Options.ZeroContents);
 
     if (UNLIKELY(Alignment > MaxAlignment)) {
       if (Options.MayReturnNull)
@@ -181,12 +184,13 @@ template <class Params> class Allocator {
         ((Alignment > MinAlignment) ? Alignment : Chunk::getHeaderSize());
 
     // Takes care of extravagantly large sizes as well as integer overflows.
-    if (UNLIKELY(Size >= MaxAllowedMallocSize ||
-                 NeededSize >= MaxAllowedMallocSize)) {
+    COMPILER_CHECK(MaxAllowedMallocSize < UINTPTR_MAX - MaxAlignment);
+    if (UNLIKELY(Size >= MaxAllowedMallocSize)) {
       if (Options.MayReturnNull)
         return nullptr;
       reportAllocationSizeTooBig(Size, NeededSize, MaxAllowedMallocSize);
     }
+    DCHECK_LE(Size, NeededSize);
 
     void *Block;
     uptr ClassId;
@@ -541,7 +545,9 @@ template <class Params> class Allocator {
     Chunk::UnpackedHeader NewHeader = *Header;
     // If the quarantine is disabled, the actual size of a chunk is 0 or larger
     // than the maximum allowed, we return a chunk directly to the backend.
-    const bool BypassQuarantine = !Quarantine.getCacheSize() || !Size ||
+    // Logical Or can be short-circuited, which introduces unnecessary
+    // conditional jumps, so use bitwise Or and let the compiler be clever.
+    const bool BypassQuarantine = !Quarantine.getCacheSize() | !Size |
                                   (Size > Options.QuarantineMaxChunkSize);
     if (BypassQuarantine) {
       NewHeader.State = Chunk::State::Available;

diff  --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 453b06ee5549..a0d8560c3f6c 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -300,10 +300,10 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
     const uptr NumberOfBlocks = RegionSize / Size;
     DCHECK_GT(NumberOfBlocks, 0);
     TransferBatch *B = nullptr;
-    constexpr uptr ShuffleArraySize = 48;
+    constexpr u32 ShuffleArraySize = 8U * TransferBatch::MaxNumCached;
     void *ShuffleArray[ShuffleArraySize];
     u32 Count = 0;
-    const uptr AllocatedUser = NumberOfBlocks * Size;
+    const uptr AllocatedUser = Size * NumberOfBlocks;
     for (uptr I = Region; I < Region + AllocatedUser; I += Size) {
       ShuffleArray[Count++] = reinterpret_cast<void *>(I);
       if (Count == ShuffleArraySize) {
@@ -319,6 +319,11 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
         return nullptr;
     }
     DCHECK(B);
+    if (!Sci->FreeList.empty()) {
+      Sci->FreeList.push_back(B);
+      B = Sci->FreeList.front();
+      Sci->FreeList.pop_front();
+    }
     DCHECK_GT(B->getCount(), 0);
 
     C->getStats().add(StatFree, AllocatedUser);

diff  --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index 409472c87776..559742d05ad9 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -187,6 +187,8 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator64 {
 
   // Call map for user memory with at least this size.
   static const uptr MapSizeIncrement = 1UL << 17;
+  // Fill at most this number of batches from the newly map'd memory.
+  static const u32 MaxNumBatches = 8U;
 
   struct RegionStats {
     uptr PoppedBlocks;
@@ -289,16 +291,18 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator64 {
       C->getStats().add(StatMapped, UserMapSize);
     }
 
-    const uptr NumberOfBlocks = Min(
-        8UL * MaxCount, (Region->MappedUser - Region->AllocatedUser) / Size);
+    const u32 NumberOfBlocks = Min(
+        MaxNumBatches * MaxCount,
+        static_cast<u32>((Region->MappedUser - Region->AllocatedUser) / Size));
     DCHECK_GT(NumberOfBlocks, 0);
 
     TransferBatch *B = nullptr;
-    constexpr uptr ShuffleArraySize = 48;
+    constexpr u32 ShuffleArraySize =
+        MaxNumBatches * TransferBatch::MaxNumCached;
     void *ShuffleArray[ShuffleArraySize];
     u32 Count = 0;
     const uptr P = RegionBeg + Region->AllocatedUser;
-    const uptr AllocatedUser = NumberOfBlocks * Size;
+    const uptr AllocatedUser = Size * NumberOfBlocks;
     for (uptr I = P; I < P + AllocatedUser; I += Size) {
       ShuffleArray[Count++] = reinterpret_cast<void *>(I);
       if (Count == ShuffleArraySize) {
@@ -314,6 +318,11 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator64 {
         return nullptr;
     }
     DCHECK(B);
+    if (!Region->FreeList.empty()) {
+      Region->FreeList.push_back(B);
+      B = Region->FreeList.front();
+      Region->FreeList.pop_front();
+    }
     DCHECK_GT(B->getCount(), 0);
 
     C->getStats().add(StatFree, AllocatedUser);


        


More information about the llvm-commits mailing list