[compiler-rt] [scudo] Store more blocks in each TransferBatch (PR #70390)

Thu Oct 26 16:01:26 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-compiler-rt-sanitizer

Author: None (ChiaHungDuan)

<details>
<summary>Changes</summary>

Instead of always storing the same number of blocks as cached, we prefer increasing the utilization by saving more blocks in a single TransferBatch. This may slightly impact the performance, but it will save a lot of memory used by BatchClassId (especially for larger blocks).

---

Patch is 22.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70390.diff


4 Files Affected:

- (modified) compiler-rt/lib/scudo/standalone/allocator_common.h (+7) 
- (modified) compiler-rt/lib/scudo/standalone/primary32.h (+55-51) 
- (modified) compiler-rt/lib/scudo/standalone/primary64.h (+81-69) 
- (modified) compiler-rt/lib/scudo/standalone/tests/primary_test.cpp (+15-19) 


``````````diff

diff --git a/compiler-rt/lib/scudo/standalone/allocator_common.h b/compiler-rt/lib/scudo/standalone/allocator_common.h
index 95f4776ac596dc0..46dc7c0f3b914ec 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_common.h
+++ b/compiler-rt/lib/scudo/standalone/allocator_common.h
@@ -40,6 +40,7 @@ template <class SizeClassAllocator> struct TransferBatch {
     B->Count = static_cast<u16>(B->Count - N);
   }
   void clear() { Count = 0; }
+  bool empty() { return Count == 0; }
   void add(CompactPtrT P) {
     DCHECK_LT(Count, MaxNumCached);
     Batch[Count++] = P;
@@ -48,6 +49,12 @@ template <class SizeClassAllocator> struct TransferBatch {
     memcpy(Array, Batch, sizeof(Batch[0]) * Count);
     clear();
   }
+
+  void moveNToArray(CompactPtrT *Array, u16 N) {
+    DCHECK_LE(N, Count);
+    memcpy(Array, Batch + Count - N, sizeof(Batch[0]) * Count);
+    Count -= N;
+  }
   u16 getCount() const { return Count; }
   bool isEmpty() const { return Count == 0U; }
   CompactPtrT get(u16 I) const {
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 8281e02ba164c13..d7cf09a376d5042 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -191,38 +191,21 @@ template <typename Config> class SizeClassAllocator32 {
     return BlockSize > PageSize;
   }
 
-  // Note that the `MaxBlockCount` will be used when we support arbitrary blocks
-  // count. Now it's the same as the number of blocks stored in the
-  // `TransferBatch`.
   u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray,
-                UNUSED const u16 MaxBlockCount) {
-    TransferBatchT *B = popBatch(C, ClassId);
-    if (!B)
-      return 0;
-
-    const u16 Count = B->getCount();
-    DCHECK_GT(Count, 0U);
-    B->moveToArray(ToArray);
-
-    if (ClassId != SizeClassMap::BatchClassId)
-      C->deallocate(SizeClassMap::BatchClassId, B);
-
-    return Count;
-  }
-
-  TransferBatchT *popBatch(CacheT *C, uptr ClassId) {
+                const u16 MaxBlockCount) {
     DCHECK_LT(ClassId, NumClasses);
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
     ScopedLock L(Sci->Mutex);
-    TransferBatchT *B = popBatchImpl(C, ClassId, Sci);
-    if (UNLIKELY(!B)) {
+
+    u16 PopCount = popBlocksImpl(C, ClassId, Sci, ToArray, MaxBlockCount);
+    if (UNLIKELY(PopCount == 0)) {
       if (UNLIKELY(!populateFreeList(C, ClassId, Sci)))
-        return nullptr;
-      B = popBatchImpl(C, ClassId, Sci);
-      // if `populateFreeList` succeeded, we are supposed to get free blocks.
-      DCHECK_NE(B, nullptr);
+        return 0U;
+      PopCount = popBlocksImpl(C, ClassId, Sci, ToArray, MaxBlockCount);
+      DCHECK_NE(PopCount, 0U);
     }
-    return B;
+
+    return PopCount;
   }
 
   // Push the array of free blocks to the designated batch group.
@@ -510,7 +493,7 @@ template <typename Config> class SizeClassAllocator32 {
     // by TransferBatch is also free for use. We don't need to recycle the
     // TransferBatch. Note that the correctness is maintained by the invariant,
     //
-    //   The unit of each popBatch() request is entire TransferBatch. Return
+    //   Each popBlocks() request returns the entire TransferBatch. Return
     //   part of the blocks in a TransferBatch is invalid.
     //
     // This ensures that TransferBatch won't leak the address itself while it's
@@ -634,7 +617,7 @@ template <typename Config> class SizeClassAllocator32 {
       BG->Batches.push_front(TB);
       BG->PushedBlocks = 0;
       BG->BytesInBGAtLastCheckpoint = 0;
-      BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId));
+      BG->MaxCachedPerBatch = TransferBatchT::MaxNumCached;
 
       return BG;
     };
@@ -726,14 +709,11 @@ template <typename Config> class SizeClassAllocator32 {
     InsertBlocks(Cur, Array + Size - Count, Count);
   }
 
-  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
-  // group id will be considered first.
-  //
-  // The region mutex needs to be held while calling this method.
-  TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci)
+  u16 popBlocksImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci,
+                    CompactPtrT *ToArray, const u16 MaxBlockCount)
       REQUIRES(Sci->Mutex) {
     if (Sci->FreeListInfo.BlockList.empty())
-      return nullptr;
+      return 0U;
 
     SinglyLinkedList<TransferBatchT> &Batches =
         Sci->FreeListInfo.BlockList.front()->Batches;
@@ -746,33 +726,57 @@ template <typename Config> class SizeClassAllocator32 {
       // Block used by `BatchGroup` is from BatchClassId. Turn the block into
       // `TransferBatch` with single block.
       TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(BG);
-      TB->clear();
-      TB->add(
-          compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB)));
+      ToArray[0] =
+          compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB));
       Sci->FreeListInfo.PoppedBlocks += 1;
-      return TB;
+      return 1U;
     }
 
+    // So far, instead of always fill blocks to `MaxBlockCount`, we only exmaine
+    // single `TransferBatch` to minimize the time spent in the primary
+    // allocator. Besides, the sizes of `TransferBatch` and
+    // `CacheT::getMaxCached()` may also impact the times of accessing the
+    // primary allocator.
+    // TODO(chiahungduan): Evaluate if we want to always prepare `MaxBlockCount`
+    // blocks and/or adjust the size of `TransferBatch` according to
+    // `CacheT::getMaxCached()`.
     TransferBatchT *B = Batches.front();
-    Batches.pop_front();
     DCHECK_NE(B, nullptr);
     DCHECK_GT(B->getCount(), 0U);
 
-    if (Batches.empty()) {
-      BatchGroupT *BG = Sci->FreeListInfo.BlockList.front();
-      Sci->FreeListInfo.BlockList.pop_front();
-
-      // We don't keep BatchGroup with zero blocks to avoid empty-checking while
-      // allocating. Note that block used by constructing BatchGroup is recorded
-      // as free blocks in the last element of BatchGroup::Batches. Which means,
-      // once we pop the last TransferBatch, the block is implicitly
-      // deallocated.
+    // BachClassId should always take all blocks in the TransferBatch. Read the
+    // comment in `pushBatchClassBlocks()` for more details.
+    const u16 PopCount = ClassId == SizeClassMap::BatchClassId
+                             ? B->getCount()
+                             : Min(MaxBlockCount, B->getCount());
+    B->moveNToArray(ToArray, PopCount);
+
+    // TODO(chiahungduan): The deallocation of unused BatchClassId blocks can be
+    // done without holding `Mutex`.
+    if (B->empty()) {
+      Batches.pop_front();
+      // `TransferBatch` of BatchClassId is self-contained, no need to
+      // deallocate. Read the comment in `pushBatchClassBlocks()` for more
+      // details.
       if (ClassId != SizeClassMap::BatchClassId)
-        C->deallocate(SizeClassMap::BatchClassId, BG);
+        C->deallocate(SizeClassMap::BatchClassId, B);
+
+      if (Batches.empty()) {
+        BatchGroupT *BG = Sci->FreeListInfo.BlockList.front();
+        Sci->FreeListInfo.BlockList.pop_front();
+
+        // We don't keep BatchGroup with zero blocks to avoid empty-checking
+        // while allocating. Note that block used by constructing BatchGroup is
+        // recorded as free blocks in the last element of BatchGroup::Batches.
+        // Which means, once we pop the last TransferBatch, the block is
+        // implicitly deallocated.
+        if (ClassId != SizeClassMap::BatchClassId)
+          C->deallocate(SizeClassMap::BatchClassId, BG);
+      }
     }
 
-    Sci->FreeListInfo.PoppedBlocks += B->getCount();
-    return B;
+    Sci->FreeListInfo.PoppedBlocks += PopCount;
+    return PopCount;
   }
 
   NOINLINE bool populateFreeList(CacheT *C, uptr ClassId, SizeClassInfo *Sci)
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index d1929ff7212f478..5fd49ee60e2824e 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -221,41 +221,24 @@ template <typename Config> class SizeClassAllocator64 {
     DCHECK_EQ(BlocksInUse, BatchClassUsedInFreeLists);
   }
 
-  // Note that the `MaxBlockCount` will be used when we support arbitrary blocks
-  // count. Now it's the same as the number of blocks stored in the
-  // `TransferBatch`.
   u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray,
-                UNUSED const u16 MaxBlockCount) {
-    TransferBatchT *B = popBatch(C, ClassId);
-    if (!B)
-      return 0;
-
-    const u16 Count = B->getCount();
-    DCHECK_GT(Count, 0U);
-    B->moveToArray(ToArray);
-
-    if (ClassId != SizeClassMap::BatchClassId)
-      C->deallocate(SizeClassMap::BatchClassId, B);
-
-    return Count;
-  }
-
-  TransferBatchT *popBatch(CacheT *C, uptr ClassId) {
+                const u16 MaxBlockCount) {
     DCHECK_LT(ClassId, NumClasses);
     RegionInfo *Region = getRegionInfo(ClassId);
+    u16 PopCount = 0;
 
     {
       ScopedLock L(Region->FLLock);
-      TransferBatchT *B = popBatchImpl(C, ClassId, Region);
-      if (LIKELY(B))
-        return B;
+      PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount);
+      if (PopCount != 0U)
+        return PopCount;
     }
 
     bool ReportRegionExhausted = false;
-    TransferBatchT *B = nullptr;
 
     if (conditionVariableEnabled()) {
-      B = popBatchWithCV(C, ClassId, Region, ReportRegionExhausted);
+      PopCount = popBlocksWithCV(C, ClassId, Region, ToArray, MaxBlockCount,
+                                 ReportRegionExhausted);
     } else {
       while (true) {
         // When two threads compete for `Region->MMLock`, we only want one of
@@ -264,13 +247,15 @@ template <typename Config> class SizeClassAllocator64 {
         ScopedLock ML(Region->MMLock);
         {
           ScopedLock FL(Region->FLLock);
-          if ((B = popBatchImpl(C, ClassId, Region)))
-            break;
+          PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount);
+          if (PopCount != 0U)
+            return PopCount;
         }
 
         const bool RegionIsExhausted = Region->Exhausted;
         if (!RegionIsExhausted)
-          B = populateFreeListAndPopBatch(C, ClassId, Region);
+          PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray,
+                                                  MaxBlockCount);
         ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted;
         break;
       }
@@ -286,7 +271,7 @@ template <typename Config> class SizeClassAllocator64 {
         reportOutOfBatchClass();
     }
 
-    return B;
+    return PopCount;
   }
 
   // Push the array of free blocks to the designated batch group.
@@ -640,7 +625,7 @@ template <typename Config> class SizeClassAllocator64 {
     // by TransferBatch is also free for use. We don't need to recycle the
     // TransferBatch. Note that the correctness is maintained by the invariant,
     //
-    //   The unit of each popBatch() request is entire TransferBatch. Return
+    //   Each popBlocks() request returns the entire TransferBatch. Return
     //   part of the blocks in a TransferBatch is invalid.
     //
     // This ensures that TransferBatch won't leak the address itself while it's
@@ -763,7 +748,7 @@ template <typename Config> class SizeClassAllocator64 {
       BG->Batches.push_front(TB);
       BG->PushedBlocks = 0;
       BG->BytesInBGAtLastCheckpoint = 0;
-      BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId));
+      BG->MaxCachedPerBatch = TransferBatchT::MaxNumCached;
 
       return BG;
     };
@@ -855,9 +840,10 @@ template <typename Config> class SizeClassAllocator64 {
     InsertBlocks(Cur, Array + Size - Count, Count);
   }
 
-  TransferBatchT *popBatchWithCV(CacheT *C, uptr ClassId, RegionInfo *Region,
-                                 bool &ReportRegionExhausted) {
-    TransferBatchT *B = nullptr;
+  u16 popBlocksWithCV(CacheT *C, uptr ClassId, RegionInfo *Region,
+                      CompactPtrT *ToArray, const u16 MaxBlockCount,
+                      bool &ReportRegionExhausted) {
+    u16 PopCount = 0;
 
     while (true) {
       // We only expect one thread doing the freelist refillment and other
@@ -878,7 +864,8 @@ template <typename Config> class SizeClassAllocator64 {
 
         const bool RegionIsExhausted = Region->Exhausted;
         if (!RegionIsExhausted)
-          B = populateFreeListAndPopBatch(C, ClassId, Region);
+          PopCount = populateFreeListAndPopBlocks(C, ClassId, Region, ToArray,
+                                                  MaxBlockCount);
         ReportRegionExhausted = !RegionIsExhausted && Region->Exhausted;
 
         {
@@ -905,7 +892,8 @@ template <typename Config> class SizeClassAllocator64 {
       // blocks were used up right after the refillment. Therefore, we have to
       // check if someone is still populating the freelist.
       ScopedLock FL(Region->FLLock);
-      if (LIKELY(B = popBatchImpl(C, ClassId, Region)))
+      PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount);
+      if (PopCount != 0U)
         break;
 
       if (!Region->isPopulatingFreeList)
@@ -918,21 +906,19 @@ template <typename Config> class SizeClassAllocator64 {
       // `pushBatchClassBlocks()` and `mergeGroupsToReleaseBack()`.
       Region->FLLockCV.wait(Region->FLLock);
 
-      if (LIKELY(B = popBatchImpl(C, ClassId, Region)))
+      PopCount = popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount);
+      if (PopCount != 0U)
         break;
     }
 
-    return B;
+    return PopCount;
   }
 
-  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
-  // group id will be considered first.
-  //
-  // The region mutex needs to be held while calling this method.
-  TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region)
+  u16 popBlocksImpl(CacheT *C, uptr ClassId, RegionInfo *Region,
+                    CompactPtrT *ToArray, const u16 MaxBlockCount)
       REQUIRES(Region->FLLock) {
     if (Region->FreeListInfo.BlockList.empty())
-      return nullptr;
+      return 0U;
 
     SinglyLinkedList<TransferBatchT> &Batches =
         Region->FreeListInfo.BlockList.front()->Batches;
@@ -945,39 +931,64 @@ template <typename Config> class SizeClassAllocator64 {
       // Block used by `BatchGroup` is from BatchClassId. Turn the block into
       // `TransferBatch` with single block.
       TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(BG);
-      TB->clear();
-      TB->add(
-          compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB)));
+      ToArray[0] =
+          compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB));
       Region->FreeListInfo.PoppedBlocks += 1;
-      return TB;
+      return 1U;
     }
 
+    // So far, instead of always fill blocks to `MaxBlockCount`, we only exmaine
+    // single `TransferBatch` to minimize the time spent in the primary
+    // allocator. Besides, the sizes of `TransferBatch` and
+    // `CacheT::getMaxCached()` may also impact the times of accessing the
+    // primary allocator.
+    // TODO(chiahungduan): Evaluate if we want to always prepare `MaxBlockCount`
+    // blocks and/or adjust the size of `TransferBatch` according to
+    // `CacheT::getMaxCached()`.
     TransferBatchT *B = Batches.front();
-    Batches.pop_front();
     DCHECK_NE(B, nullptr);
     DCHECK_GT(B->getCount(), 0U);
 
-    if (Batches.empty()) {
-      BatchGroupT *BG = Region->FreeListInfo.BlockList.front();
-      Region->FreeListInfo.BlockList.pop_front();
-
-      // We don't keep BatchGroup with zero blocks to avoid empty-checking while
-      // allocating. Note that block used by constructing BatchGroup is recorded
-      // as free blocks in the last element of BatchGroup::Batches. Which means,
-      // once we pop the last TransferBatch, the block is implicitly
-      // deallocated.
+    // BachClassId should always take all blocks in the TransferBatch. Read the
+    // comment in `pushBatchClassBlocks()` for more details.
+    const u16 PopCount = ClassId == SizeClassMap::BatchClassId
+                             ? B->getCount()
+                             : Min(MaxBlockCount, B->getCount());
+    B->moveNToArray(ToArray, PopCount);
+
+    // TODO(chiahungduan): The deallocation of unused BatchClassId blocks can be
+    // done without holding `FLLock`.
+    if (B->empty()) {
+      Batches.pop_front();
+      // `TransferBatch` of BatchClassId is self-contained, no need to
+      // deallocate. Read the comment in `pushBatchClassBlocks()` for more
+      // details.
       if (ClassId != SizeClassMap::BatchClassId)
-        C->deallocate(SizeClassMap::BatchClassId, BG);
+        C->deallocate(SizeClassMap::BatchClassId, B);
+
+      if (Batches.empty()) {
+        BatchGroupT *BG = Region->FreeListInfo.BlockList.front();
+        Region->FreeListInfo.BlockList.pop_front();
+
+        // We don't keep BatchGroup with zero blocks to avoid empty-checking
+        // while allocating. Note that block used by constructing BatchGroup is
+        // recorded as free blocks in the last element of BatchGroup::Batches.
+        // Which means, once we pop the last TransferBatch, the block is
+        // implicitly deallocated.
+        if (ClassId != SizeClassMap::BatchClassId)
+          C->deallocate(SizeClassMap::BatchClassId, BG);
+      }
     }
 
-    Region->FreeListInfo.PoppedBlocks += B->getCount();
+    Region->FreeListInfo.PoppedBlocks += PopCount;
 
-    return B;
+    return PopCount;
   }
 
-  // Refill the freelist and return one batch.
-  NOINLINE TransferBatchT *populateFreeListAndPopBatch(CacheT *C, uptr ClassId,
-                                                       RegionInfo *Region)
+  NOINLINE u16 populateFreeListAndPopBlocks(CacheT *C, uptr ClassId,
+                                            RegionInfo *Region,
+                                            CompactPtrT *ToArray,
+                                            const u16 MaxBlockCount)
       REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
     const uptr Size = getSizeByClassId(ClassId);
     const u16 MaxCount = CacheT::getMaxCached(Size);
@@ -994,7 +1005,7 @@ template <typename Config> class SizeClassAllocator64 {
       const uptr RegionBase = RegionBeg - getRegionBaseByClassId(ClassId);
       if (UNLIKELY(RegionBase + MappedUser + MapSize > RegionSize)) {
         Region->Exhausted = true;
-        return nullptr;
+        return 0U;
       }
 
       if (UNLIKELY(!Region->MemMapInfo.MemMap.remap(
@@ -1002,7 +1013,7 @@ template <typename Config> class SizeClassAllocator64 {
               MAP_ALLOWNOMEM | MAP_RESIZABLE |
                   (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG
                                                             : 0)))) {
-        return nullptr;
+        return 0U;
       }
       Region->MemMapInfo.MappedUser += MapSize;
       C->getStats().add(StatMapped, MapSize);
@@ -1049,8 +1060,9 @@ template <typename Config> class SizeClassAllocator64 {
       pushBatchClassBlocks(Region, ShuffleArray, NumberOfBlocks);
     }
 
-    TransferBatchT *B = popBatchImpl(C, ClassId, Region);
-    DCHECK_NE(B, nullptr);
+    const u16 PopCount =
+        popBlocksImpl(C, ClassId, Region, ToArray, MaxBlockCount);
+    DCHECK_NE(PopCount, 0U);
 
     // Note that `PushedBlocks` and `PoppedBlocks` are supposed to only record
     // the requests from `PushBlocks` and `PopBatch` which are external
@@ -1062,7 +1074,7 @@ template <typename Config> class SizeClassAllocator64 {
     C->getStats().add(StatFree, AllocatedUser);
     Region->MemMapInfo.AllocatedUser += AllocatedUser;
 
-    return B;
+    return PopCount;
   }
 
   void getStats(ScopedString *Str, uptr ClassId, RegionInfo *Region)
@@ -1182,7 +1194,7 @@ template <typename Config> class SizeClassAllocator64 {
     }
 
     // Note that we have extracted the `GroupsToRelease` from region freelist.
-    // It's safe to let pushBlocks()/popBatches() access the remaining region
+    // It's safe to let pushBlocks()/popBlocks() access the remaining region
     // freelist. In the steps 3 and 4, we will temporarily release the FLLock
     // and lock it again before step 5.
 
diff --git a/com...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/70390