[compiler-rt] 24b0c43 - Reapply "[scudo] Make local cache be agnostic to the type of node in … (#68633)

via llvm-commits <llvm-commits at lists.llvm.org>
Mon Oct 9 15:17:47 PDT 2023


Author: ChiaHungDuan
Date: 2023-10-09T15:17:42-07:00
New Revision: 24b0c43c991689f62b40ee14042cc769b21b88f5

URL: https://github.com/llvm/llvm-project/commit/24b0c43c991689f62b40ee14042cc769b21b88f5
DIFF: https://github.com/llvm/llvm-project/commit/24b0c43c991689f62b40ee14042cc769b21b88f5.diff

LOG: Reapply "[scudo] Make local cache be agnostic to the type of node in … (#68633)

…f… (#68626)

This reverts commit 8dd9615dfbd148993964ea6f2de6c6a94f49660c.

1. Fixed the ambiguous aliasing
2. Fixed the type conversion warning
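
For context, the "type conversion warning" comes from C++ integer
promotion: arithmetic on two `u16` operands is performed in `int`, so
assigning the result back to a `u16` without a cast draws an implicit
conversion warning. A minimal sketch of the pattern (`u16` is assumed
here to alias `std::uint16_t`, as in scudo's internal headers):

    #include <cstdint>
    using u16 = std::uint16_t;

    u16 append(u16 Count, u16 N) {
      // `Count + N` is computed in `int` after integer promotion;
      // narrowing it back to `u16` implicitly is what the compiler
      // warned about, hence the explicit cast.
      return static_cast<u16>(Count + N);
    }

The "ambiguous aliasing" appears to refer to the name `TransferBatch`
existing both as a namespace-scope template and as a nested cache type,
which is why the primaries below alias the templates as `TransferBatchT`
and `BatchGroupT`.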

Added: 
    compiler-rt/lib/scudo/standalone/allocator_common.h

Modified: 
    compiler-rt/lib/scudo/standalone/CMakeLists.txt
    compiler-rt/lib/scudo/standalone/local_cache.h
    compiler-rt/lib/scudo/standalone/primary32.h
    compiler-rt/lib/scudo/standalone/primary64.h
    compiler-rt/lib/scudo/standalone/tests/primary_test.cpp

Removed: 
    


################################################################################
diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
index 2b7a613066b83b3..c4d3ea1e4f05ba8 100644
--- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
@@ -56,6 +56,7 @@ if(ANDROID)
 endif()
 
 set(SCUDO_HEADERS
+  allocator_common.h
   allocator_config.h
   atomic_helpers.h
   bytemap.h

diff --git a/compiler-rt/lib/scudo/standalone/allocator_common.h b/compiler-rt/lib/scudo/standalone/allocator_common.h
new file mode 100644
index 000000000000000..95f4776ac596dc0
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/allocator_common.h
@@ -0,0 +1,85 @@
+//===-- allocator_common.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_ALLOCATOR_COMMON_H_
+#define SCUDO_ALLOCATOR_COMMON_H_
+
+#include "common.h"
+#include "list.h"
+
+namespace scudo {
+
+template <class SizeClassAllocator> struct TransferBatch {
+  typedef typename SizeClassAllocator::SizeClassMap SizeClassMap;
+  typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;
+
+  static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint;
+  void setFromArray(CompactPtrT *Array, u16 N) {
+    DCHECK_LE(N, MaxNumCached);
+    Count = N;
+    memcpy(Batch, Array, sizeof(Batch[0]) * Count);
+  }
+  void appendFromArray(CompactPtrT *Array, u16 N) {
+    DCHECK_LE(N, MaxNumCached - Count);
+    memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
+    // u16 will be promoted to int by arithmetic type conversion.
+    Count = static_cast<u16>(Count + N);
+  }
+  void appendFromTransferBatch(TransferBatch *B, u16 N) {
+    DCHECK_LE(N, MaxNumCached - Count);
+    DCHECK_GE(B->Count, N);
+    // Append from the back of `B`.
+    memcpy(Batch + Count, B->Batch + (B->Count - N), sizeof(Batch[0]) * N);
+    // u16 will be promoted to int by arithmetic type conversion.
+    Count = static_cast<u16>(Count + N);
+    B->Count = static_cast<u16>(B->Count - N);
+  }
+  void clear() { Count = 0; }
+  void add(CompactPtrT P) {
+    DCHECK_LT(Count, MaxNumCached);
+    Batch[Count++] = P;
+  }
+  void moveToArray(CompactPtrT *Array) {
+    memcpy(Array, Batch, sizeof(Batch[0]) * Count);
+    clear();
+  }
+  u16 getCount() const { return Count; }
+  bool isEmpty() const { return Count == 0U; }
+  CompactPtrT get(u16 I) const {
+    DCHECK_LE(I, Count);
+    return Batch[I];
+  }
+  TransferBatch *Next;
+
+private:
+  CompactPtrT Batch[MaxNumCached];
+  u16 Count;
+};
+
+// A BatchGroup is used to collect blocks. Each group has a group id to
+// identify the group kind of contained blocks.
+template <class SizeClassAllocator> struct BatchGroup {
+  // `Next` is used by IntrusiveList.
+  BatchGroup *Next;
+  // The compact base address of each group
+  uptr CompactPtrGroupBase;
+  // Cache value of SizeClassAllocatorLocalCache::getMaxCached()
+  u16 MaxCachedPerBatch;
+  // Number of blocks pushed into this group. This is an increment-only
+  // counter.
+  uptr PushedBlocks;
+  // This is used to track how many bytes are not in-use since last time we
+  // tried to release pages.
+  uptr BytesInBGAtLastCheckpoint;
+  // Blocks are managed by TransferBatch in a list.
+  SinglyLinkedList<TransferBatch<SizeClassAllocator>> Batches;
+};
+
+} // namespace scudo
+
+#endif // SCUDO_ALLOCATOR_COMMON_H_
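
The new header turns `TransferBatch` and `BatchGroup` into free-standing
templates keyed on the allocator type rather than nested types of the
local cache. Any type exposing the two nested typedefs can instantiate
them; a minimal sketch, where `ToySizeClassMap` and `ToyAllocator` are
made-up stand-ins (the real instantiations are `SizeClassAllocator32/64`
in the primaries below):

    #include "allocator_common.h"

    struct ToySizeClassMap {
      static const scudo::u16 MaxNumCachedHint = 8;
    };
    struct ToyAllocator {
      typedef ToySizeClassMap SizeClassMap;
      typedef scudo::uptr CompactPtrT;
    };

    // A batch of up to 8 compact pointers; moveToArray() copies them
    // out and clears the batch in a single step.
    using ToyBatch = scudo::TransferBatch<ToyAllocator>;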

diff --git a/compiler-rt/lib/scudo/standalone/local_cache.h b/compiler-rt/lib/scudo/standalone/local_cache.h
index 1095eb5f186d1e1..1814272277ff436 100644
--- a/compiler-rt/lib/scudo/standalone/local_cache.h
+++ b/compiler-rt/lib/scudo/standalone/local_cache.h
@@ -22,74 +22,6 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
   typedef typename SizeClassAllocator::SizeClassMap SizeClassMap;
   typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;
 
-  struct TransferBatch {
-    static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint;
-    void setFromArray(CompactPtrT *Array, u16 N) {
-      DCHECK_LE(N, MaxNumCached);
-      Count = N;
-      memcpy(Batch, Array, sizeof(Batch[0]) * Count);
-    }
-    void appendFromArray(CompactPtrT *Array, u16 N) {
-      DCHECK_LE(N, MaxNumCached - Count);
-      memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
-      // u16 will be promoted to int by arithmetic type conversion.
-      Count = static_cast<u16>(Count + N);
-    }
-    void appendFromTransferBatch(TransferBatch *B, u16 N) {
-      DCHECK_LE(N, MaxNumCached - Count);
-      DCHECK_GE(B->Count, N);
-      // Append from the back of `B`.
-      memcpy(Batch + Count, B->Batch + (B->Count - N), sizeof(Batch[0]) * N);
-      // u16 will be promoted to int by arithmetic type conversion.
-      Count = static_cast<u16>(Count + N);
-      B->Count = static_cast<u16>(B->Count - N);
-    }
-    void clear() { Count = 0; }
-    void add(CompactPtrT P) {
-      DCHECK_LT(Count, MaxNumCached);
-      Batch[Count++] = P;
-    }
-    void copyToArray(CompactPtrT *Array) const {
-      memcpy(Array, Batch, sizeof(Batch[0]) * Count);
-    }
-    u16 getCount() const { return Count; }
-    bool isEmpty() const { return Count == 0U; }
-    CompactPtrT get(u16 I) const {
-      DCHECK_LE(I, Count);
-      return Batch[I];
-    }
-    static u16 getMaxCached(uptr Size) {
-      return Min(MaxNumCached, SizeClassMap::getMaxCachedHint(Size));
-    }
-    TransferBatch *Next;
-
-  private:
-    CompactPtrT Batch[MaxNumCached];
-    u16 Count;
-  };
-
-  // A BatchGroup is used to collect blocks. Each group has a group id to
-  // identify the group kind of contained blocks.
-  struct BatchGroup {
-    // `Next` is used by IntrusiveList.
-    BatchGroup *Next;
-    // The compact base address of each group
-    uptr CompactPtrGroupBase;
-    // Cache value of TransferBatch::getMaxCached()
-    u16 MaxCachedPerBatch;
-    // Number of blocks pushed into this group. This is an increment-only
-    // counter.
-    uptr PushedBlocks;
-    // This is used to track how many bytes are not in-use since last time we
-    // tried to release pages.
-    uptr BytesInBGAtLastCheckpoint;
-    // Blocks are managed by TransferBatch in a list.
-    SinglyLinkedList<TransferBatch> Batches;
-  };
-
-  static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
-                "BatchGroup uses the same class size as TransferBatch");
-
   void init(GlobalStats *S, SizeClassAllocator *A) {
     DCHECK(isEmpty());
     Stats.init();
@@ -151,7 +83,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
   }
 
   void drain() {
-    // Drain BatchClassId last as createBatch can refill it.
+    // Drain BatchClassId last as it may be needed while draining normal blocks.
     for (uptr I = 0; I < NumClasses; ++I) {
       if (I == BatchClassId)
         continue;
@@ -163,19 +95,11 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
     DCHECK(isEmpty());
   }
 
-  TransferBatch *createBatch(uptr ClassId, void *B) {
-    if (ClassId != BatchClassId)
-      B = allocate(BatchClassId);
+  void *getBatchClassBlock() {
+    void *B = allocate(BatchClassId);
     if (UNLIKELY(!B))
       reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
-    return reinterpret_cast<TransferBatch *>(B);
-  }
-
-  BatchGroup *createGroup() {
-    void *Ptr = allocate(BatchClassId);
-    if (UNLIKELY(!Ptr))
-      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
-    return reinterpret_cast<BatchGroup *>(Ptr);
+    return B;
   }
 
   LocalStats &getStats() { return Stats; }
@@ -203,6 +127,11 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
       Str->append("    No block is cached.\n");
   }
 
+  static u16 getMaxCached(uptr Size) {
+    return Min(SizeClassMap::MaxNumCachedHint,
+               SizeClassMap::getMaxCachedHint(Size));
+  }
+
 private:
   static const uptr NumClasses = SizeClassMap::NumClasses;
   static const uptr BatchClassId = SizeClassMap::BatchClassId;
@@ -211,7 +140,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
     u16 MaxCount;
     // Note: ClassSize is zero for the transfer batch.
     uptr ClassSize;
-    CompactPtrT Chunks[2 * TransferBatch::MaxNumCached];
+    CompactPtrT Chunks[2 * SizeClassMap::MaxNumCachedHint];
   };
   PerClass PerClassArray[NumClasses] = {};
   LocalStats Stats;
@@ -228,7 +157,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
     for (uptr I = 0; I < NumClasses; I++) {
       PerClass *P = &PerClassArray[I];
       const uptr Size = SizeClassAllocator::getSizeByClassId(I);
-      P->MaxCount = static_cast<u16>(2 * TransferBatch::getMaxCached(Size));
+      P->MaxCount = static_cast<u16>(2 * getMaxCached(Size));
       if (I != BatchClassId) {
         P->ClassSize = Size;
       } else {
@@ -246,15 +175,14 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
 
   NOINLINE bool refill(PerClass *C, uptr ClassId) {
     initCacheMaybe(C);
-    TransferBatch *B = Allocator->popBatch(this, ClassId);
-    if (UNLIKELY(!B))
-      return false;
-    DCHECK_GT(B->getCount(), 0);
-    C->Count = B->getCount();
-    B->copyToArray(C->Chunks);
-    B->clear();
-    destroyBatch(ClassId, B);
-    return true;
+
+    // TODO(chiahungduan): Pass the max number cached for each size class.
+    const u16 NumBlocksRefilled =
+        Allocator->popBlocks(this, ClassId, C->Chunks);
+    DCHECK_LE(NumBlocksRefilled,
+              getMaxCached(SizeClassAllocator::getSizeByClassId(ClassId)));
+    C->Count = static_cast<u16>(C->Count + NumBlocksRefilled);
+    return NumBlocksRefilled != 0;
   }
 
   NOINLINE void drain(PerClass *C, uptr ClassId) {
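
The net effect on the cache is that a refill no longer round-trips
through a `TransferBatch` object: the primary copies blocks straight
into the per-class chunk array and reports how many it delivered. In
outline, using the names from the diff above:

    // Old: pop a batch, copy its contents out, destroy the batch.
    //   TransferBatch *B = Allocator->popBatch(this, ClassId);
    //   B->copyToArray(C->Chunks);
    //   destroyBatch(ClassId, B);
    // New: the primary fills the array directly and returns the count.
    const scudo::u16 NumBlocksRefilled =
        Allocator->popBlocks(this, ClassId, C->Chunks);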

diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 533615ad3816d09..c900550ac675e47 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -9,6 +9,7 @@
 #ifndef SCUDO_PRIMARY32_H_
 #define SCUDO_PRIMARY32_H_
 
+#include "allocator_common.h"
 #include "bytemap.h"
 #include "common.h"
 #include "list.h"
@@ -53,12 +54,15 @@ template <typename Config> class SizeClassAllocator32 {
                 "");
   typedef SizeClassAllocator32<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
-  typedef typename CacheT::TransferBatch TransferBatch;
-  typedef typename CacheT::BatchGroup BatchGroup;
+  typedef TransferBatch<ThisT> TransferBatchT;
+  typedef BatchGroup<ThisT> BatchGroupT;
+
+  static_assert(sizeof(BatchGroupT) <= sizeof(TransferBatchT),
+                "BatchGroupT uses the same class size as TransferBatchT");
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
-               ? sizeof(TransferBatch)
+               ? sizeof(TransferBatchT)
                : SizeClassMap::getSizeByClassId(ClassId);
   }
 
@@ -126,7 +130,7 @@ template <typename Config> class SizeClassAllocator32 {
       SizeClassInfo *Sci = getSizeClassInfo(I);
       ScopedLock L1(Sci->Mutex);
       uptr TotalBlocks = 0;
-      for (BatchGroup &BG : Sci->FreeListInfo.BlockList) {
+      for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) {
         // `BG::Batches` are `TransferBatches`. +1 for `BatchGroup`.
         BatchClassUsedInFreeLists += BG.Batches.size() + 1;
         for (const auto &It : BG.Batches)
@@ -141,7 +145,7 @@ template <typename Config> class SizeClassAllocator32 {
     SizeClassInfo *Sci = getSizeClassInfo(SizeClassMap::BatchClassId);
     ScopedLock L1(Sci->Mutex);
     uptr TotalBlocks = 0;
-    for (BatchGroup &BG : Sci->FreeListInfo.BlockList) {
+    for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) {
       if (LIKELY(!BG.Batches.empty())) {
         for (const auto &It : BG.Batches)
           TotalBlocks += It.getCount();
@@ -187,11 +191,26 @@ template <typename Config> class SizeClassAllocator32 {
     return BlockSize > PageSize;
   }
 
-  TransferBatch *popBatch(CacheT *C, uptr ClassId) {
+  u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray) {
+    TransferBatchT *B = popBatch(C, ClassId);
+    if (!B)
+      return 0;
+
+    const u16 Count = B->getCount();
+    DCHECK_GT(Count, 0U);
+    B->moveToArray(ToArray);
+
+    if (ClassId != SizeClassMap::BatchClassId)
+      C->deallocate(SizeClassMap::BatchClassId, B);
+
+    return Count;
+  }
+
+  TransferBatchT *popBatch(CacheT *C, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
     ScopedLock L(Sci->Mutex);
-    TransferBatch *B = popBatchImpl(C, ClassId, Sci);
+    TransferBatchT *B = popBatchImpl(C, ClassId, Sci);
     if (UNLIKELY(!B)) {
       if (UNLIKELY(!populateFreeList(C, ClassId, Sci)))
         return nullptr;
@@ -381,7 +400,7 @@ template <typename Config> class SizeClassAllocator32 {
   };
 
   struct BlocksInfo {
-    SinglyLinkedList<BatchGroup> BlockList = {};
+    SinglyLinkedList<BatchGroupT> BlockList = {};
     uptr PoppedBlocks = 0;
     uptr PushedBlocks = 0;
   };
@@ -505,11 +524,11 @@ template <typename Config> class SizeClassAllocator32 {
     // reusable and don't need additional space for them.
 
     Sci->FreeListInfo.PushedBlocks += Size;
-    BatchGroup *BG = Sci->FreeListInfo.BlockList.front();
+    BatchGroupT *BG = Sci->FreeListInfo.BlockList.front();
 
     if (BG == nullptr) {
       // Construct `BatchGroup` on the last element.
-      BG = reinterpret_cast<BatchGroup *>(
+      BG = reinterpret_cast<BatchGroupT *>(
           decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1]));
       --Size;
       BG->Batches.clear();
@@ -520,8 +539,8 @@ template <typename Config> class SizeClassAllocator32 {
       // from `CreateGroup` in `pushBlocksImpl`
       BG->PushedBlocks = 1;
       BG->BytesInBGAtLastCheckpoint = 0;
-      BG->MaxCachedPerBatch = TransferBatch::getMaxCached(
-          getSizeByClassId(SizeClassMap::BatchClassId));
+      BG->MaxCachedPerBatch =
+          CacheT::getMaxCached(getSizeByClassId(SizeClassMap::BatchClassId));
 
       Sci->FreeListInfo.BlockList.push_front(BG);
     }
@@ -534,7 +553,7 @@ template <typename Config> class SizeClassAllocator32 {
     //   2. Only 1 block is pushed when the freelist is empty.
     if (BG->Batches.empty()) {
       // Construct the `TransferBatch` on the last element.
-      TransferBatch *TB = reinterpret_cast<TransferBatch *>(
+      TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(
           decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1]));
       TB->clear();
       // As mentioned above, addresses of `TransferBatch` and `BatchGroup` are
@@ -549,14 +568,14 @@ template <typename Config> class SizeClassAllocator32 {
       BG->Batches.push_front(TB);
     }
 
-    TransferBatch *CurBatch = BG->Batches.front();
+    TransferBatchT *CurBatch = BG->Batches.front();
     DCHECK_NE(CurBatch, nullptr);
 
     for (u32 I = 0; I < Size;) {
       u16 UnusedSlots =
           static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
       if (UnusedSlots == 0) {
-        CurBatch = reinterpret_cast<TransferBatch *>(
+        CurBatch = reinterpret_cast<TransferBatchT *>(
             decompactPtr(SizeClassMap::BatchClassId, Array[I]));
         CurBatch->clear();
         // Self-contained
@@ -600,24 +619,25 @@ template <typename Config> class SizeClassAllocator32 {
     DCHECK_GT(Size, 0U);
 
     auto CreateGroup = [&](uptr CompactPtrGroupBase) {
-      BatchGroup *BG = C->createGroup();
+      BatchGroupT *BG =
+          reinterpret_cast<BatchGroupT *>(C->getBatchClassBlock());
       BG->Batches.clear();
-      TransferBatch *TB = C->createBatch(ClassId, nullptr);
+      TransferBatchT *TB =
+          reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock());
       TB->clear();
 
       BG->CompactPtrGroupBase = CompactPtrGroupBase;
       BG->Batches.push_front(TB);
       BG->PushedBlocks = 0;
       BG->BytesInBGAtLastCheckpoint = 0;
-      BG->MaxCachedPerBatch =
-          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+      BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId));
 
       return BG;
     };
 
-    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
-      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
-      TransferBatch *CurBatch = Batches.front();
+    auto InsertBlocks = [&](BatchGroupT *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatchT> &Batches = BG->Batches;
+      TransferBatchT *CurBatch = Batches.front();
       DCHECK_NE(CurBatch, nullptr);
 
       for (u32 I = 0; I < Size;) {
@@ -625,9 +645,8 @@ template <typename Config> class SizeClassAllocator32 {
         u16 UnusedSlots =
             static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
         if (UnusedSlots == 0) {
-          CurBatch = C->createBatch(
-              ClassId,
-              reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+          CurBatch =
+              reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock());
           CurBatch->clear();
           Batches.push_front(CurBatch);
           UnusedSlots = BG->MaxCachedPerBatch;
@@ -642,11 +661,11 @@ template <typename Config> class SizeClassAllocator32 {
     };
 
     Sci->FreeListInfo.PushedBlocks += Size;
-    BatchGroup *Cur = Sci->FreeListInfo.BlockList.front();
+    BatchGroupT *Cur = Sci->FreeListInfo.BlockList.front();
 
     // In the following, `Cur` always points to the BatchGroup for blocks that
     // will be pushed next. `Prev` is the element right before `Cur`.
-    BatchGroup *Prev = nullptr;
+    BatchGroupT *Prev = nullptr;
 
     while (Cur != nullptr &&
            compactPtrGroupBase(Array[0]) > Cur->CompactPtrGroupBase) {
@@ -707,22 +726,22 @@ template <typename Config> class SizeClassAllocator32 {
   // group id will be considered first.
   //
   // The region mutex needs to be held while calling this method.
-  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci)
+  TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, SizeClassInfo *Sci)
       REQUIRES(Sci->Mutex) {
     if (Sci->FreeListInfo.BlockList.empty())
       return nullptr;
 
-    SinglyLinkedList<TransferBatch> &Batches =
+    SinglyLinkedList<TransferBatchT> &Batches =
         Sci->FreeListInfo.BlockList.front()->Batches;
 
     if (Batches.empty()) {
       DCHECK_EQ(ClassId, SizeClassMap::BatchClassId);
-      BatchGroup *BG = Sci->FreeListInfo.BlockList.front();
+      BatchGroupT *BG = Sci->FreeListInfo.BlockList.front();
       Sci->FreeListInfo.BlockList.pop_front();
 
       // Block used by `BatchGroup` is from BatchClassId. Turn the block into
       // `TransferBatch` with single block.
-      TransferBatch *TB = reinterpret_cast<TransferBatch *>(BG);
+      TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(BG);
       TB->clear();
       TB->add(
           compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB)));
@@ -730,13 +749,13 @@ template <typename Config> class SizeClassAllocator32 {
       return TB;
     }
 
-    TransferBatch *B = Batches.front();
+    TransferBatchT *B = Batches.front();
     Batches.pop_front();
     DCHECK_NE(B, nullptr);
     DCHECK_GT(B->getCount(), 0U);
 
     if (Batches.empty()) {
-      BatchGroup *BG = Sci->FreeListInfo.BlockList.front();
+      BatchGroupT *BG = Sci->FreeListInfo.BlockList.front();
       Sci->FreeListInfo.BlockList.pop_front();
 
       // We don't keep BatchGroup with zero blocks to avoid empty-checking while
@@ -775,7 +794,7 @@ template <typename Config> class SizeClassAllocator32 {
     }
 
     const uptr Size = getSizeByClassId(ClassId);
-    const u16 MaxCount = TransferBatch::getMaxCached(Size);
+    const u16 MaxCount = CacheT::getMaxCached(Size);
     DCHECK_GT(MaxCount, 0U);
     // The maximum number of blocks we should carve in the region is dictated
     // by the maximum number of batches we want to fill, and the amount of
@@ -788,7 +807,7 @@ template <typename Config> class SizeClassAllocator32 {
     DCHECK_GT(NumberOfBlocks, 0U);
 
     constexpr u32 ShuffleArraySize =
-        MaxNumBatches * TransferBatch::MaxNumCached;
+        MaxNumBatches * TransferBatchT::MaxNumCached;
     // Fill the transfer batches and put them in the size-class freelist. We
     // need to randomize the blocks for security purposes, so we first fill a
     // local array that we then shuffle before populating the batches.
@@ -1053,7 +1072,7 @@ template <typename Config> class SizeClassAllocator32 {
     auto DecompactPtr = [](CompactPtrT CompactPtr) {
       return reinterpret_cast<uptr>(CompactPtr);
     };
-    for (BatchGroup &BG : Sci->FreeListInfo.BlockList) {
+    for (BatchGroupT &BG : Sci->FreeListInfo.BlockList) {
       const uptr GroupBase = decompactGroupBase(BG.CompactPtrGroupBase);
       // The `GroupSize` may not be divided by `BlockSize`, which means there is
       // an unused space at the end of Region. Exclude that space to avoid
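
Both primaries now carry the `static_assert` that previously lived in
the local cache. The invariant matters because `BatchGroup` objects are
carved out of BatchClassId blocks, and `getSizeByClassId` sizes those
blocks by `sizeof(TransferBatchT)`, so a `BatchGroupT` must not be
larger:

    // getBatchClassBlock() hands out blocks sized for a TransferBatchT
    // (per getSizeByClassId above), yet the same blocks are also
    // reinterpret_cast to BatchGroupT *, so the group must fit inside.
    static_assert(sizeof(BatchGroupT) <= sizeof(TransferBatchT),
                  "BatchGroupT uses the same class size as TransferBatchT");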

diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index 6d160b4c64d75fc..6e5ab7e3ba7965d 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -9,6 +9,7 @@
 #ifndef SCUDO_PRIMARY64_H_
 #define SCUDO_PRIMARY64_H_
 
+#include "allocator_common.h"
 #include "bytemap.h"
 #include "common.h"
 #include "list.h"
@@ -55,12 +56,15 @@ template <typename Config> class SizeClassAllocator64 {
   static const uptr GroupScale = GroupSizeLog - CompactPtrScale;
   typedef SizeClassAllocator64<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
-  typedef typename CacheT::TransferBatch TransferBatch;
-  typedef typename CacheT::BatchGroup BatchGroup;
+  typedef TransferBatch<ThisT> TransferBatchT;
+  typedef BatchGroup<ThisT> BatchGroupT;
+
+  static_assert(sizeof(BatchGroupT) <= sizeof(TransferBatchT),
+                "BatchGroupT uses the same class size as TransferBatchT");
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
-               ? roundUp(sizeof(TransferBatch), 1U << CompactPtrScale)
+               ? roundUp(sizeof(TransferBatchT), 1U << CompactPtrScale)
                : SizeClassMap::getSizeByClassId(ClassId);
   }
 
@@ -167,7 +171,7 @@ template <typename Config> class SizeClassAllocator64 {
       ScopedLock FL(Region->FLLock);
       const uptr BlockSize = getSizeByClassId(I);
       uptr TotalBlocks = 0;
-      for (BatchGroup &BG : Region->FreeListInfo.BlockList) {
+      for (BatchGroupT &BG : Region->FreeListInfo.BlockList) {
         // `BG::Batches` are `TransferBatches`. +1 for `BatchGroup`.
         BatchClassUsedInFreeLists += BG.Batches.size() + 1;
         for (const auto &It : BG.Batches)
@@ -184,7 +188,7 @@ template <typename Config> class SizeClassAllocator64 {
     ScopedLock FL(Region->FLLock);
     const uptr BlockSize = getSizeByClassId(SizeClassMap::BatchClassId);
     uptr TotalBlocks = 0;
-    for (BatchGroup &BG : Region->FreeListInfo.BlockList) {
+    for (BatchGroupT &BG : Region->FreeListInfo.BlockList) {
       if (LIKELY(!BG.Batches.empty())) {
         for (const auto &It : BG.Batches)
           TotalBlocks += It.getCount();
@@ -203,19 +207,34 @@ template <typename Config> class SizeClassAllocator64 {
     DCHECK_EQ(BlocksInUse, BatchClassUsedInFreeLists);
   }
 
-  TransferBatch *popBatch(CacheT *C, uptr ClassId) {
+  u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray) {
+    TransferBatchT *B = popBatch(C, ClassId);
+    if (!B)
+      return 0;
+
+    const u16 Count = B->getCount();
+    DCHECK_GT(Count, 0U);
+    B->moveToArray(ToArray);
+
+    if (ClassId != SizeClassMap::BatchClassId)
+      C->deallocate(SizeClassMap::BatchClassId, B);
+
+    return Count;
+  }
+
+  TransferBatchT *popBatch(CacheT *C, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     RegionInfo *Region = getRegionInfo(ClassId);
 
     {
       ScopedLock L(Region->FLLock);
-      TransferBatch *B = popBatchImpl(C, ClassId, Region);
+      TransferBatchT *B = popBatchImpl(C, ClassId, Region);
       if (LIKELY(B))
         return B;
     }
 
     bool ReportRegionExhausted = false;
-    TransferBatch *B = nullptr;
+    TransferBatchT *B = nullptr;
 
     while (true) {
       // When two threads compete for `Region->MMLock`, we only want one of them
@@ -503,7 +522,7 @@ template <typename Config> class SizeClassAllocator64 {
   };
 
   struct BlocksInfo {
-    SinglyLinkedList<BatchGroup> BlockList = {};
+    SinglyLinkedList<BatchGroupT> BlockList = {};
     uptr PoppedBlocks = 0;
     uptr PushedBlocks = 0;
   };
@@ -615,11 +634,11 @@ template <typename Config> class SizeClassAllocator64 {
     // reusable and don't need additional space for them.
 
     Region->FreeListInfo.PushedBlocks += Size;
-    BatchGroup *BG = Region->FreeListInfo.BlockList.front();
+    BatchGroupT *BG = Region->FreeListInfo.BlockList.front();
 
     if (BG == nullptr) {
       // Construct `BatchGroup` on the last element.
-      BG = reinterpret_cast<BatchGroup *>(
+      BG = reinterpret_cast<BatchGroupT *>(
           decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1]));
       --Size;
       BG->Batches.clear();
@@ -630,8 +649,8 @@ template <typename Config> class SizeClassAllocator64 {
       // from `CreateGroup` in `pushBlocksImpl`
       BG->PushedBlocks = 1;
       BG->BytesInBGAtLastCheckpoint = 0;
-      BG->MaxCachedPerBatch = TransferBatch::getMaxCached(
-          getSizeByClassId(SizeClassMap::BatchClassId));
+      BG->MaxCachedPerBatch =
+          CacheT::getMaxCached(getSizeByClassId(SizeClassMap::BatchClassId));
 
       Region->FreeListInfo.BlockList.push_front(BG);
     }
@@ -644,7 +663,7 @@ template <typename Config> class SizeClassAllocator64 {
     //   2. Only 1 block is pushed when the freelist is empty.
     if (BG->Batches.empty()) {
       // Construct the `TransferBatch` on the last element.
-      TransferBatch *TB = reinterpret_cast<TransferBatch *>(
+      TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(
           decompactPtr(SizeClassMap::BatchClassId, Array[Size - 1]));
       TB->clear();
       // As mentioned above, addresses of `TransferBatch` and `BatchGroup` are
@@ -659,14 +678,14 @@ template <typename Config> class SizeClassAllocator64 {
       BG->Batches.push_front(TB);
     }
 
-    TransferBatch *CurBatch = BG->Batches.front();
+    TransferBatchT *CurBatch = BG->Batches.front();
     DCHECK_NE(CurBatch, nullptr);
 
     for (u32 I = 0; I < Size;) {
       u16 UnusedSlots =
           static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
       if (UnusedSlots == 0) {
-        CurBatch = reinterpret_cast<TransferBatch *>(
+        CurBatch = reinterpret_cast<TransferBatchT *>(
             decompactPtr(SizeClassMap::BatchClassId, Array[I]));
         CurBatch->clear();
         // Self-contained
@@ -709,24 +728,25 @@ template <typename Config> class SizeClassAllocator64 {
     DCHECK_GT(Size, 0U);
 
     auto CreateGroup = [&](uptr CompactPtrGroupBase) {
-      BatchGroup *BG = C->createGroup();
+      BatchGroupT *BG =
+          reinterpret_cast<BatchGroupT *>(C->getBatchClassBlock());
       BG->Batches.clear();
-      TransferBatch *TB = C->createBatch(ClassId, nullptr);
+      TransferBatchT *TB =
+          reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock());
       TB->clear();
 
       BG->CompactPtrGroupBase = CompactPtrGroupBase;
       BG->Batches.push_front(TB);
       BG->PushedBlocks = 0;
       BG->BytesInBGAtLastCheckpoint = 0;
-      BG->MaxCachedPerBatch =
-          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+      BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId));
 
       return BG;
     };
 
-    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
-      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
-      TransferBatch *CurBatch = Batches.front();
+    auto InsertBlocks = [&](BatchGroupT *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatchT> &Batches = BG->Batches;
+      TransferBatchT *CurBatch = Batches.front();
       DCHECK_NE(CurBatch, nullptr);
 
       for (u32 I = 0; I < Size;) {
@@ -734,9 +754,8 @@ template <typename Config> class SizeClassAllocator64 {
         u16 UnusedSlots =
             static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
         if (UnusedSlots == 0) {
-          CurBatch = C->createBatch(
-              ClassId,
-              reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+          CurBatch =
+              reinterpret_cast<TransferBatchT *>(C->getBatchClassBlock());
           CurBatch->clear();
           Batches.push_front(CurBatch);
           UnusedSlots = BG->MaxCachedPerBatch;
@@ -751,11 +770,11 @@ template <typename Config> class SizeClassAllocator64 {
     };
 
     Region->FreeListInfo.PushedBlocks += Size;
-    BatchGroup *Cur = Region->FreeListInfo.BlockList.front();
+    BatchGroupT *Cur = Region->FreeListInfo.BlockList.front();
 
     // In the following, `Cur` always points to the BatchGroup for blocks that
     // will be pushed next. `Prev` is the element right before `Cur`.
-    BatchGroup *Prev = nullptr;
+    BatchGroupT *Prev = nullptr;
 
     while (Cur != nullptr &&
            compactPtrGroup(Array[0]) > Cur->CompactPtrGroupBase) {
@@ -816,22 +835,22 @@ template <typename Config> class SizeClassAllocator64 {
   // group id will be considered first.
   //
   // The region mutex needs to be held while calling this method.
-  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region)
+  TransferBatchT *popBatchImpl(CacheT *C, uptr ClassId, RegionInfo *Region)
       REQUIRES(Region->FLLock) {
     if (Region->FreeListInfo.BlockList.empty())
       return nullptr;
 
-    SinglyLinkedList<TransferBatch> &Batches =
+    SinglyLinkedList<TransferBatchT> &Batches =
         Region->FreeListInfo.BlockList.front()->Batches;
 
     if (Batches.empty()) {
       DCHECK_EQ(ClassId, SizeClassMap::BatchClassId);
-      BatchGroup *BG = Region->FreeListInfo.BlockList.front();
+      BatchGroupT *BG = Region->FreeListInfo.BlockList.front();
       Region->FreeListInfo.BlockList.pop_front();
 
       // Block used by `BatchGroup` is from BatchClassId. Turn the block into
       // `TransferBatch` with single block.
-      TransferBatch *TB = reinterpret_cast<TransferBatch *>(BG);
+      TransferBatchT *TB = reinterpret_cast<TransferBatchT *>(BG);
       TB->clear();
       TB->add(
           compactPtr(SizeClassMap::BatchClassId, reinterpret_cast<uptr>(TB)));
@@ -839,13 +858,13 @@ template <typename Config> class SizeClassAllocator64 {
       return TB;
     }
 
-    TransferBatch *B = Batches.front();
+    TransferBatchT *B = Batches.front();
     Batches.pop_front();
     DCHECK_NE(B, nullptr);
     DCHECK_GT(B->getCount(), 0U);
 
     if (Batches.empty()) {
-      BatchGroup *BG = Region->FreeListInfo.BlockList.front();
+      BatchGroupT *BG = Region->FreeListInfo.BlockList.front();
       Region->FreeListInfo.BlockList.pop_front();
 
       // We don't keep BatchGroup with zero blocks to avoid empty-checking while
@@ -863,11 +882,11 @@ template <typename Config> class SizeClassAllocator64 {
   }
 
   // Refill the freelist and return one batch.
-  NOINLINE TransferBatch *populateFreeListAndPopBatch(CacheT *C, uptr ClassId,
-                                                      RegionInfo *Region)
+  NOINLINE TransferBatchT *populateFreeListAndPopBatch(CacheT *C, uptr ClassId,
+                                                       RegionInfo *Region)
       REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
     const uptr Size = getSizeByClassId(ClassId);
-    const u16 MaxCount = TransferBatch::getMaxCached(Size);
+    const u16 MaxCount = CacheT::getMaxCached(Size);
 
     const uptr RegionBeg = Region->RegionBeg;
     const uptr MappedUser = Region->MemMapInfo.MappedUser;
@@ -903,7 +922,7 @@ template <typename Config> class SizeClassAllocator64 {
     DCHECK_GT(NumberOfBlocks, 0);
 
     constexpr u32 ShuffleArraySize =
-        MaxNumBatches * TransferBatch::MaxNumCached;
+        MaxNumBatches * TransferBatchT::MaxNumCached;
     CompactPtrT ShuffleArray[ShuffleArraySize];
     DCHECK_LE(NumberOfBlocks, ShuffleArraySize);
 
@@ -936,7 +955,7 @@ template <typename Config> class SizeClassAllocator64 {
       pushBatchClassBlocks(Region, ShuffleArray, NumberOfBlocks);
     }
 
-    TransferBatch *B = popBatchImpl(C, ClassId, Region);
+    TransferBatchT *B = popBatchImpl(C, ClassId, Region);
     DCHECK_NE(B, nullptr);
 
     // Note that `PushedBlocks` and `PoppedBlocks` are supposed to only record
@@ -987,7 +1006,7 @@ template <typename Config> class SizeClassAllocator64 {
     const uptr AllocatedUserEnd =
         Region->MemMapInfo.AllocatedUser + Region->RegionBeg;
 
-    SinglyLinkedList<BatchGroup> GroupsToRelease;
+    SinglyLinkedList<BatchGroupT> GroupsToRelease;
     {
       ScopedLock L(Region->FLLock);
       GroupsToRelease = Region->FreeListInfo.BlockList;
@@ -1030,7 +1049,7 @@ template <typename Config> class SizeClassAllocator64 {
     uptr BytesInFreeList;
     const uptr AllocatedUserEnd =
         Region->MemMapInfo.AllocatedUser + Region->RegionBeg;
-    SinglyLinkedList<BatchGroup> GroupsToRelease;
+    SinglyLinkedList<BatchGroupT> GroupsToRelease;
 
     {
       ScopedLock L(Region->FLLock);
@@ -1173,13 +1192,13 @@ template <typename Config> class SizeClassAllocator64 {
     return true;
   }
 
-  SinglyLinkedList<BatchGroup>
+  SinglyLinkedList<BatchGroupT>
   collectGroupsToRelease(RegionInfo *Region, const uptr BlockSize,
                          const uptr AllocatedUserEnd, const uptr CompactPtrBase)
       REQUIRES(Region->MMLock, Region->FLLock) {
     const uptr GroupSize = (1UL << GroupSizeLog);
     const uptr PageSize = getPageSizeCached();
-    SinglyLinkedList<BatchGroup> GroupsToRelease;
+    SinglyLinkedList<BatchGroupT> GroupsToRelease;
 
     // We are examining each group and will take the minimum distance to the
     // release threshold as the next Region::TryReleaseThreshold(). Note that if
@@ -1188,8 +1207,8 @@ template <typename Config> class SizeClassAllocator64 {
     // the comment on `SmallerBlockReleasePageDelta` for more details.
     uptr MinDistToThreshold = GroupSize;
 
-    for (BatchGroup *BG = Region->FreeListInfo.BlockList.front(),
-                    *Prev = nullptr;
+    for (BatchGroupT *BG = Region->FreeListInfo.BlockList.front(),
+                     *Prev = nullptr;
          BG != nullptr;) {
       // Group boundary is always GroupSize-aligned from CompactPtr base. The
       // layout of memory groups is like,
@@ -1273,7 +1292,7 @@ template <typename Config> class SizeClassAllocator64 {
         }
       }
 
-      // If `BG` is the first BatchGroup in the list, we only need to advance
+      // If `BG` is the first BatchGroupT in the list, we only need to advance
       // `BG` and call FreeListInfo.BlockList::pop_front(). No update is needed
       // for `Prev`.
       //
@@ -1309,7 +1328,7 @@ template <typename Config> class SizeClassAllocator64 {
       // Note that we need to advance before pushing this BatchGroup to
       // GroupsToRelease because it's a destructive operation.
 
-      BatchGroup *Cur = BG;
+      BatchGroupT *Cur = BG;
       BG = BG->Next;
 
       // Ideally, we may want to update this only after successful release.
@@ -1342,7 +1361,7 @@ template <typename Config> class SizeClassAllocator64 {
   PageReleaseContext
   markFreeBlocks(RegionInfo *Region, const uptr BlockSize,
                  const uptr AllocatedUserEnd, const uptr CompactPtrBase,
-                 SinglyLinkedList<BatchGroup> &GroupsToRelease)
+                 SinglyLinkedList<BatchGroupT> &GroupsToRelease)
       REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
     const uptr GroupSize = (1UL << GroupSizeLog);
     auto DecompactPtr = [CompactPtrBase](CompactPtrT CompactPtr) {
@@ -1371,7 +1390,7 @@ template <typename Config> class SizeClassAllocator64 {
     if (UNLIKELY(!Context.ensurePageMapAllocated()))
       return Context;
 
-    for (BatchGroup &BG : GroupsToRelease) {
+    for (BatchGroupT &BG : GroupsToRelease) {
       const uptr BatchGroupBase =
           decompactGroupBase(CompactPtrBase, BG.CompactPtrGroupBase);
       const uptr BatchGroupEnd = BatchGroupBase + GroupSize;
@@ -1419,7 +1438,7 @@ template <typename Config> class SizeClassAllocator64 {
   }
 
   void mergeGroupsToReleaseBack(RegionInfo *Region,
-                                SinglyLinkedList<BatchGroup> &GroupsToRelease)
+                                SinglyLinkedList<BatchGroupT> &GroupsToRelease)
       REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
     ScopedLock L(Region->FLLock);
 
@@ -1440,8 +1459,8 @@ template <typename Config> class SizeClassAllocator64 {
     // Merge GroupsToRelease back to the Region::FreeListInfo.BlockList. Note
     // that both `Region->FreeListInfo.BlockList` and `GroupsToRelease` are
     // sorted.
-    for (BatchGroup *BG = Region->FreeListInfo.BlockList.front(),
-                    *Prev = nullptr;
+    for (BatchGroupT *BG = Region->FreeListInfo.BlockList.front(),
+                     *Prev = nullptr;
          ;) {
       if (BG == nullptr || GroupsToRelease.empty()) {
         if (!GroupsToRelease.empty())
@@ -1458,8 +1477,8 @@ template <typename Config> class SizeClassAllocator64 {
         continue;
       }
 
-      BatchGroup *Cur = GroupsToRelease.front();
-      TransferBatch *UnusedTransferBatch = nullptr;
+      BatchGroupT *Cur = GroupsToRelease.front();
+      TransferBatchT *UnusedTransferBatch = nullptr;
       GroupsToRelease.pop_front();
 
       if (BG->CompactPtrGroupBase == Cur->CompactPtrGroupBase) {
@@ -1475,7 +1494,7 @@ template <typename Config> class SizeClassAllocator64 {
         if (Cur->Batches.front()->getCount() == MaxCachedPerBatch) {
           BG->Batches.append_back(&Cur->Batches);
         } else {
-          TransferBatch *NonFullBatch = Cur->Batches.front();
+          TransferBatchT *NonFullBatch = Cur->Batches.front();
           Cur->Batches.pop_front();
           const u16 NonFullBatchCount = NonFullBatch->getCount();
           // The remaining Batches in `Cur` are full.
@@ -1540,8 +1559,8 @@ template <typename Config> class SizeClassAllocator64 {
     }
 
     if (SCUDO_DEBUG) {
-      BatchGroup *Prev = Region->FreeListInfo.BlockList.front();
-      for (BatchGroup *Cur = Prev->Next; Cur != nullptr;
+      BatchGroupT *Prev = Region->FreeListInfo.BlockList.front();
+      for (BatchGroupT *Cur = Prev->Next; Cur != nullptr;
            Prev = Cur, Cur = Cur->Next) {
         CHECK_LT(Prev->CompactPtrGroupBase, Cur->CompactPtrGroupBase);
       }

diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
index 074977ff27e65f6..4db05b76241134c 100644
--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
@@ -207,7 +207,7 @@ struct SmallRegionsConfig {
 // For the 32-bit one, it requires actually exhausting memory, so we skip it.
 TEST(ScudoPrimaryTest, Primary64OOM) {
   using Primary = scudo::SizeClassAllocator64<SmallRegionsConfig>;
-  using TransferBatch = Primary::CacheT::TransferBatch;
+  using TransferBatch = Primary::TransferBatchT;
   Primary Allocator;
   Allocator.init(/*ReleaseToOsInterval=*/-1);
   typename Primary::CacheT Cache;
@@ -233,8 +233,9 @@ TEST(ScudoPrimaryTest, Primary64OOM) {
   while (!Batches.empty()) {
     TransferBatch *B = Batches.back();
     Batches.pop_back();
-    B->copyToArray(Blocks);
-    Allocator.pushBlocks(&Cache, ClassId, Blocks, B->getCount());
+    const scudo::u16 Count = B->getCount();
+    B->moveToArray(Blocks);
+    Allocator.pushBlocks(&Cache, ClassId, Blocks, Count);
     Cache.deallocate(Primary::SizeClassMap::BatchClassId, B);
   }
   Cache.destroy(nullptr);
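
The test change follows from `moveToArray` replacing the old
`copyToArray`: the move variant clears the batch as a side effect, so
the count has to be captured before the blocks are moved out. The
pattern in isolation:

    const scudo::u16 Count = B->getCount();
    B->moveToArray(Blocks); // copies Count pointers out, then clears B
    Allocator.pushBlocks(&Cache, ClassId, Blocks, Count);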
