[compiler-rt] cf9d7f5 - [scudo] Manage free blocks in BatchGroup.

Chia-hung Duan via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 13 16:35:22 PDT 2022


Author: Chia-hung Duan
Date: 2022-10-13T23:35:06Z
New Revision: cf9d7f55d3bec7640fa8b2f8ec1d9c1268233caa

URL: https://github.com/llvm/llvm-project/commit/cf9d7f55d3bec7640fa8b2f8ec1d9c1268233caa
DIFF: https://github.com/llvm/llvm-project/commit/cf9d7f55d3bec7640fa8b2f8ec1d9c1268233caa.diff

LOG: [scudo] Manage free blocks in BatchGroup.

Scudo is supposed to allocate blocks across all of the mapped pages,
with each page equally likely to be selected, which means Scudo tends
to touch as many pages as possible. This brings better security, but it
also reduces the chance of releasing dirty pages.

To keep the footprint growth manageable, this CL introduces the
BatchGroup concept. Each block is classified into a BatchGroup
according to its address. On allocation, we prefer to hand out blocks
from the same group first. Note that the blocks selected within a group
are still randomized over several pages. At the same time, we get a
better prediction of how fast dirty pages grow, and we are able to do
partial page releasing by examining only a subset of the BatchGroups.

Reviewed By: cryptoad, cferris

Differential Revision: https://reviews.llvm.org/D133897
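
To illustrate the idea, here is a minimal standalone sketch (the
constant and names are illustrative, not Scudo's actual types) of how a
block's group id is derived from its address with a power-of-two shift,
so that all blocks within the same 2^GroupSizeLog-byte range land in
the same group:

    #include <cstdint>
    #include <cstdio>

    // Illustrative value; the real one comes from Config::PrimaryGroupSizeLog
    // (e.g. 21, i.e. 2 MB groups, for the 64-bit DefaultConfig below).
    constexpr unsigned GroupSizeLog = 20; // 1 MB groups

    // Blocks whose addresses fall into the same 2^GroupSizeLog-aligned range
    // share a group id.
    constexpr uint64_t groupId(uint64_t BlockAddr) {
      return BlockAddr >> GroupSizeLog;
    }

    int main() {
      const uint64_t Base = 0x7f0000000000;
      // Two nearby blocks map to the same group ...
      printf("%llu %llu\n", (unsigned long long)groupId(Base),
             (unsigned long long)groupId(Base + 0x40));
      // ... and a block one group size away maps to the next group.
      printf("%llu\n",
             (unsigned long long)groupId(Base + (1ULL << GroupSizeLog)));
      return 0;
    }

The 64-bit DefaultConfig below uses 2 MB groups, while the 32-bit
configurations simply reuse the region size as the group size.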

Added: 
    

Modified: 
    compiler-rt/lib/scudo/standalone/allocator_config.h
    compiler-rt/lib/scudo/standalone/list.h
    compiler-rt/lib/scudo/standalone/local_cache.h
    compiler-rt/lib/scudo/standalone/primary32.h
    compiler-rt/lib/scudo/standalone/primary64.h
    compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
    compiler-rt/lib/scudo/standalone/tests/list_test.cpp
    compiler-rt/lib/scudo/standalone/tests/primary_test.cpp

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h
index e6f46b511dbfa..2933714ea1545 100644
--- a/compiler-rt/lib/scudo/standalone/allocator_config.h
+++ b/compiler-rt/lib/scudo/standalone/allocator_config.h
@@ -34,6 +34,14 @@ namespace scudo {
 //   typedef SizeClassAllocator64<ExampleConfig> Primary;
 //   // Log2 of the size of a size class region, as used by the Primary.
 //   static const uptr PrimaryRegionSizeLog = 30U;
+//   // Log2 of the size of block group, as used by the Primary. Each group
+//   // contains a range of memory addresses, blocks in the range will belong to
+//   // the same group. In general, single region may have 1 or 2MB group size.
+//   // Multiple regions will have the group size equal to the region size
+//   // because the region size is usually smaller than 1 MB.
+//   // Smaller value gives fine-grained control of memory usage but the trade
+//   // off is that it may take longer time of deallocation.
+//   static const uptr PrimaryGroupSizeLog = 20U;
 //   // Defines the type and scale of a compact pointer. A compact pointer can
 //   // be understood as the offset of a pointer within the region it belongs
 //   // to, in increments of a power-of-2 scale.
@@ -65,6 +73,7 @@ struct DefaultConfig {
 #if SCUDO_CAN_USE_PRIMARY64
   typedef SizeClassAllocator64<DefaultConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 32U;
+  static const uptr PrimaryGroupSizeLog = 21U;
   typedef uptr PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
@@ -72,6 +81,7 @@ struct DefaultConfig {
 #else
   typedef SizeClassAllocator32<DefaultConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 19U;
+  static const uptr PrimaryGroupSizeLog = 19U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
@@ -96,11 +106,13 @@ struct AndroidConfig {
   static const uptr PrimaryRegionSizeLog = 28U;
   typedef u32 PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const uptr PrimaryGroupSizeLog = 20U;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
 #else
   typedef SizeClassAllocator32<AndroidConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 18U;
+  static const uptr PrimaryGroupSizeLog = 18U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@@ -127,11 +139,13 @@ struct AndroidSvelteConfig {
   static const uptr PrimaryRegionSizeLog = 27U;
   typedef u32 PrimaryCompactPtrT;
   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+  static const uptr PrimaryGroupSizeLog = 18U;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
 #else
   typedef SizeClassAllocator32<AndroidSvelteConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 16U;
+  static const uptr PrimaryGroupSizeLog = 16U;
   typedef uptr PrimaryCompactPtrT;
 #endif
   static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
@@ -156,6 +170,7 @@ struct FuchsiaConfig {
 
   typedef SizeClassAllocator64<FuchsiaConfig> Primary;
   static const uptr PrimaryRegionSizeLog = 30U;
+  static const uptr PrimaryGroupSizeLog = 30U;
   typedef u32 PrimaryCompactPtrT;
   static const bool PrimaryEnableRandomOffset = true;
   static const uptr PrimaryMapSizeIncrement = 1UL << 18;
@@ -175,6 +190,7 @@ struct TrustyConfig {
   typedef SizeClassAllocator64<TrustyConfig> Primary;
   // Some apps have 1 page of heap total so small regions are necessary.
   static const uptr PrimaryRegionSizeLog = 10U;
+  static const uptr PrimaryGroupSizeLog = 10U;
   typedef u32 PrimaryCompactPtrT;
   static const bool PrimaryEnableRandomOffset = false;
   // Trusty is extremely memory-constrained so minimally round up map calls.
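
As the hunks above show, every configuration that defines a Primary now
also provides a PrimaryGroupSizeLog, and every config in this patch
keeps the group log at or below the region log (the 32-bit configs set
them equal). A compilable sketch of just that relationship; the struct
name is illustrative and the remaining Primary fields are omitted:

    #include <cstdint>

    struct ExampleGroupConfig {
      static constexpr uintptr_t PrimaryRegionSizeLog = 30U; // 1 GB regions
      static constexpr uintptr_t PrimaryGroupSizeLog = 21U;  // 2 MB groups
    };

    // A smaller group log gives finer-grained release decisions at the cost
    // of more time spent on deallocation, per the comment above.
    static_assert(ExampleGroupConfig::PrimaryGroupSizeLog <=
                      ExampleGroupConfig::PrimaryRegionSizeLog,
                  "group size should not exceed the region size");

    int main() { return 0; }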

diff  --git a/compiler-rt/lib/scudo/standalone/list.h b/compiler-rt/lib/scudo/standalone/list.h
index 1ac93c2f65d7f..0137667d1dcf3 100644
--- a/compiler-rt/lib/scudo/standalone/list.h
+++ b/compiler-rt/lib/scudo/standalone/list.h
@@ -110,6 +110,18 @@ template <class T> struct SinglyLinkedList : public IntrusiveList<T> {
     Size--;
   }
 
+  // Insert X next to Prev
+  void insert(T *Prev, T *X) {
+    DCHECK(!empty());
+    DCHECK_NE(Prev, nullptr);
+    DCHECK_NE(X, nullptr);
+    X->Next = Prev->Next;
+    Prev->Next = X;
+    if (Last == Prev)
+      Last = X;
+    ++Size;
+  }
+
   void extract(T *Prev, T *X) {
     DCHECK(!empty());
     DCHECK_NE(Prev, nullptr);
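
SinglyLinkedList::insert(Prev, X) splices X in right after Prev and
moves the Last pointer when Prev was the tail; pushBlocksImpl() in the
primaries relies on it to keep the BatchGroup list sorted by group id.
A standalone sketch of the same splice, using simplified node and list
types rather than the scudo ones:

    #include <cassert>
    #include <cstdio>

    struct Node {
      Node *Next;
      int Value;
    };

    struct MiniList {
      Node *First = nullptr;
      Node *Last = nullptr;

      void push_back(Node *N) {
        N->Next = nullptr;
        if (!First) {
          First = Last = N;
        } else {
          Last->Next = N;
          Last = N;
        }
      }

      // Same idea as SinglyLinkedList::insert(Prev, X): link X right after
      // Prev and update Last if Prev was the tail.
      void insert(Node *Prev, Node *X) {
        assert(Prev && X);
        X->Next = Prev->Next;
        Prev->Next = X;
        if (Last == Prev)
          Last = X;
      }
    };

    int main() {
      Node A{nullptr, 1}, B{nullptr, 3}, C{nullptr, 2};
      MiniList L;
      L.push_back(&A);
      L.push_back(&B);
      L.insert(&A, &C); // 1 -> 2 -> 3, Last still points at B
      for (Node *N = L.First; N; N = N->Next)
        printf("%d ", N->Value);
      printf("\n");
      return 0;
    }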

diff  --git a/compiler-rt/lib/scudo/standalone/local_cache.h b/compiler-rt/lib/scudo/standalone/local_cache.h
index dfdea00529c7d..7f270472d70ff 100644
--- a/compiler-rt/lib/scudo/standalone/local_cache.h
+++ b/compiler-rt/lib/scudo/standalone/local_cache.h
@@ -10,6 +10,7 @@
 #define SCUDO_LOCAL_CACHE_H_
 
 #include "internal_defs.h"
+#include "list.h"
 #include "platform.h"
 #include "report.h"
 #include "stats.h"
@@ -27,6 +28,11 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
       Count = N;
       memcpy(Batch, Array, sizeof(Batch[0]) * Count);
     }
+    void appendFromArray(CompactPtrT *Array, u16 N) {
+      DCHECK_LE(N, MaxNumCached - Count);
+      memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
+      Count += N;
+    }
     void clear() { Count = 0; }
     void add(CompactPtrT P) {
       DCHECK_LT(Count, MaxNumCached);
@@ -50,6 +56,22 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
     u16 Count;
   };
 
+  // A BatchGroup is used to collect blocks. Each group has a group id to
+  // identify the group kind of contained blocks.
+  struct BatchGroup {
+    // `Next` is used by IntrusiveList.
+    BatchGroup *Next;
+    // The identifier of each group
+    uptr GroupId;
+    // Cache value of TransferBatch::getMaxCached()
+    u16 MaxCachedPerBatch;
+    // Blocks are managed by TransferBatch in a list.
+    SinglyLinkedList<TransferBatch> Batches;
+  };
+
+  static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
+                "BatchGroup uses the same class size as TransferBatch");
+
   void init(GlobalStats *S, SizeClassAllocator *A) {
     DCHECK(isEmpty());
     Stats.init();
@@ -121,9 +143,18 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
   TransferBatch *createBatch(uptr ClassId, void *B) {
     if (ClassId != BatchClassId)
       B = allocate(BatchClassId);
+    if (UNLIKELY(!B))
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
     return reinterpret_cast<TransferBatch *>(B);
   }
 
+  BatchGroup *createGroup() {
+    void *Ptr = allocate(BatchClassId);
+    if (UNLIKELY(!Ptr))
+      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
+    return reinterpret_cast<BatchGroup *>(Ptr);
+  }
+
   LocalStats &getStats() { return Stats; }
 
 private:
@@ -182,15 +213,10 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
 
   NOINLINE void drain(PerClass *C, uptr ClassId) {
     const u16 Count = Min(static_cast<u16>(C->MaxCount / 2), C->Count);
-    TransferBatch *B =
-        createBatch(ClassId, Allocator->decompactPtr(ClassId, C->Chunks[0]));
-    if (UNLIKELY(!B))
-      reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
-    B->setFromArray(&C->Chunks[0], Count);
+    Allocator->pushBlocks(this, ClassId, &C->Chunks[0], Count);
     C->Count -= Count;
-    for (uptr I = 0; I < C->Count; I++)
+    for (u16 I = 0; I < C->Count; I++)
       C->Chunks[I] = C->Chunks[I + Count];
-    Allocator->pushBatch(ClassId, B);
   }
 };
 

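With the grouping handled by the allocator, drain() above no longer
builds a TransferBatch itself; it hands the first half of the cached
chunks to the allocator through pushBlocks() and then shifts the
surviving chunks down. A standalone sketch of that compaction step,
with plain arrays and a stubbed pushBlocks() standing in for the scudo
types:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for Allocator->pushBlocks(): just report what
    // would be handed back to the primary allocator.
    static void pushBlocks(const unsigned *Array, unsigned Size) {
      printf("pushing %u block(s) starting at %u\n", Size, Array[0]);
    }

    int main() {
      // Chunks cached for one size class; MaxCount would normally come from
      // TransferBatch::getMaxCached().
      std::vector<unsigned> Chunks = {10, 11, 12, 13, 14, 15};
      unsigned Count = static_cast<unsigned>(Chunks.size());
      const unsigned MaxCount = 6;

      // Same policy as drain(): push Min(MaxCount / 2, Count) chunks ...
      const unsigned ToDrain = std::min(MaxCount / 2, Count);
      pushBlocks(Chunks.data(), ToDrain);
      Count -= ToDrain;

      // ... then shift the remaining chunks down so the cache stays
      // contiguous, exactly like the loop at the end of drain().
      for (unsigned I = 0; I < Count; I++)
        Chunks[I] = Chunks[I + ToDrain];

      for (unsigned I = 0; I < Count; I++)
        printf("%u ", Chunks[I]); // prints: 13 14 15
      printf("\n");
      return 0;
    }
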
diff  --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index d9905a0571747..b411a3bd837a0 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -43,6 +43,7 @@ template <typename Config> class SizeClassAllocator32 {
 public:
   typedef typename Config::PrimaryCompactPtrT CompactPtrT;
   typedef typename Config::SizeClassMap SizeClassMap;
+  static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
   // The bytemap can only track UINT8_MAX - 1 classes.
   static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), "");
   // Regions should be large enough to hold the largest Block.
@@ -51,6 +52,7 @@ template <typename Config> class SizeClassAllocator32 {
   typedef SizeClassAllocator32<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
+  typedef typename CacheT::BatchGroup BatchGroup;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
@@ -111,30 +113,68 @@ template <typename Config> class SizeClassAllocator32 {
     return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr));
   }
 
+  uptr compactPtrGroup(CompactPtrT CompactPtr) {
+    return CompactPtr >> GroupSizeLog;
+  }
+
   TransferBatch *popBatch(CacheT *C, uptr ClassId) {
     DCHECK_LT(ClassId, NumClasses);
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
     ScopedLock L(Sci->Mutex);
-    TransferBatch *B = Sci->FreeList.front();
-    if (B) {
-      Sci->FreeList.pop_front();
-    } else {
-      B = populateFreeList(C, ClassId, Sci);
-      if (UNLIKELY(!B))
+    TransferBatch *B = popBatchImpl(C, ClassId);
+    if (UNLIKELY(!B)) {
+      if (UNLIKELY(!populateFreeList(C, ClassId, Sci)))
         return nullptr;
+      B = popBatchImpl(C, ClassId);
+      // if `populateFreeList` succeeded, we are supposed to get free blocks.
+      DCHECK_NE(B, nullptr);
     }
-    DCHECK_GT(B->getCount(), 0);
     Sci->Stats.PoppedBlocks += B->getCount();
     return B;
   }
 
-  void pushBatch(uptr ClassId, TransferBatch *B) {
+  // Push the array of free blocks to the designated batch group.
+  void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
     DCHECK_LT(ClassId, NumClasses);
-    DCHECK_GT(B->getCount(), 0);
+    DCHECK_GT(Size, 0);
+
     SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+    if (ClassId == SizeClassMap::BatchClassId) {
+      ScopedLock L(Sci->Mutex);
+      // Constructing a batch group in the free list will use two blocks in
+      // BatchClassId. If we are pushing BatchClassId blocks, we will use the
+      // blocks in the array directly (can't delegate local cache which will
+      // cause a recursive allocation). However, The number of free blocks may
+      // be less than two. Therefore, populate the free list before inserting
+      // the blocks.
+      if (Size == 1 && !populateFreeList(C, ClassId, Sci))
+          return;
+      pushBlocksImpl(C, ClassId, Array, Size);
+      Sci->Stats.PushedBlocks += Size;
+      return;
+    }
+
+    // TODO(chiahungduan): Consider not doing grouping if the group size is not
+    // greater than the block size with a certain scale.
+
+    // Sort the blocks so that blocks belonging to the same group can be pushed
+    // together.
+    bool SameGroup = true;
+    for (u32 I = 1; I < Size; ++I) {
+      CompactPtrT Cur = Array[I];
+      u32 J = I;
+      while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
+        SameGroup = false;
+        Array[J] = Array[J - 1];
+        --J;
+      }
+      Array[J] = Cur;
+    }
+
     ScopedLock L(Sci->Mutex);
-    Sci->FreeList.push_front(B);
-    Sci->Stats.PushedBlocks += B->getCount();
+    pushBlocksImpl(C, ClassId, Array, Size, SameGroup);
+
+    Sci->Stats.PushedBlocks += Size;
     if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Sci, ClassId);
   }
@@ -256,7 +296,7 @@ template <typename Config> class SizeClassAllocator32 {
 
   struct alignas(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
     HybridMutex Mutex;
-    SinglyLinkedList<TransferBatch> FreeList;
+    SinglyLinkedList<BatchGroup> FreeList;
     uptr CurrentRegion;
     uptr CurrentRegionAllocated;
     SizeClassStats Stats;
@@ -328,8 +368,185 @@ template <typename Config> class SizeClassAllocator32 {
     return &SizeClassInfoArray[ClassId];
   }
 
-  NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
-                                           SizeClassInfo *Sci) {
+  // Push the blocks to their batch group. The layout will be like,
+  //
+  // FreeList - > BG -> BG -> BG
+  //              |     |     |
+  //              v     v     v
+  //              TB    TB    TB
+  //              |
+  //              v
+  //              TB
+  //
+  // Each BlockGroup(BG) will associate with unique group id and the free blocks
+  // are managed by a list of TransferBatch(TB). To reduce the time of inserting
+  // blocks, BGs are sorted and the input `Array` are supposed to be sorted so
+  // that we can get better performance of maintaining sorted property.
+  // Use `SameGroup=true` to indicate that all blocks in the array are from the
+  // same group then we will skip checking the group id of each block.
+  //
+  // Note that this aims to have a better management of dirty pages, i.e., the
+  // RSS usage won't grow indefinitely. There's an exception that we may not put
+  // a block to its associated group. While populating new blocks, we may have
+  // blocks cross different groups. However, most cases will fall into same
+  // group and they are supposed to be popped soon. In that case, it's not worth
+  // sorting the array with the almost-sorted property. Therefore, we use
+  // `SameGroup=true` instead.
+  //
+  // The region mutex needs to be held while calling this method.
+  void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size,
+                      bool SameGroup = false) {
+    DCHECK_GT(Size, 0U);
+    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+
+    auto CreateGroup = [&](uptr GroupId) {
+      BatchGroup *BG = nullptr;
+      TransferBatch *TB = nullptr;
+      if (ClassId == SizeClassMap::BatchClassId) {
+        DCHECK_GE(Size, 2U);
+        BG = reinterpret_cast<BatchGroup *>(
+            decompactPtr(ClassId, Array[Size - 1]));
+        BG->Batches.clear();
+
+        TB = reinterpret_cast<TransferBatch *>(
+            decompactPtr(ClassId, Array[Size - 2]));
+        TB->clear();
+      } else {
+        BG = C->createGroup();
+        BG->Batches.clear();
+
+        TB = C->createBatch(ClassId, nullptr);
+        TB->clear();
+      }
+
+      BG->GroupId = GroupId;
+      BG->Batches.push_front(TB);
+      BG->MaxCachedPerBatch =
+          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+
+      return BG;
+    };
+
+    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
+      TransferBatch *CurBatch = Batches.front();
+      DCHECK_NE(CurBatch, nullptr);
+
+      for (u32 I = 0; I < Size;) {
+        DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
+        u16 UnusedSlots = BG->MaxCachedPerBatch - CurBatch->getCount();
+        if (UnusedSlots == 0) {
+          CurBatch = C->createBatch(ClassId, reinterpret_cast<void *>(
+              decompactPtr(ClassId, Array[I])));
+          CurBatch->clear();
+          Batches.push_front(CurBatch);
+          UnusedSlots = BG->MaxCachedPerBatch;
+        }
+        u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
+        CurBatch->appendFromArray(&Array[I], AppendSize);
+        I += AppendSize;
+      }
+    };
+
+    BatchGroup *Cur = Sci->FreeList.front();
+
+    if (ClassId == SizeClassMap::BatchClassId) {
+      if (Cur == nullptr) {
+        // Don't need to classify BatchClassId.
+        Cur = CreateGroup(/*GroupId=*/0);
+        Sci->FreeList.push_front(Cur);
+      }
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // In the following, `Cur` always points to the BatchGroup for blocks that
+    // will be pushed next. `Prev` is the element right before `Cur`.
+    BatchGroup *Prev = nullptr;
+
+    while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) {
+      Prev = Cur;
+      Cur = Cur->Next;
+    }
+
+    if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) {
+      Cur = CreateGroup(compactPtrGroup(Array[0]));
+      if (Prev == nullptr)
+        Sci->FreeList.push_front(Cur);
+      else
+        Sci->FreeList.insert(Prev, Cur);
+    }
+
+    // All the blocks are from the same group, just push without checking group
+    // id.
+    if (SameGroup) {
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // The blocks are sorted by group id. Determine the segment of group and
+    // push them to their group together.
+    u32 Count = 1;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
+        DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId);
+        InsertBlocks(Cur, Array + I - Count, Count);
+
+        while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) {
+          Prev = Cur;
+          Cur = Cur->Next;
+        }
+
+        if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) {
+          Cur = CreateGroup(compactPtrGroup(Array[I]));
+          DCHECK_NE(Prev, nullptr);
+          Sci->FreeList.insert(Prev, Cur);
+        }
+
+        Count = 1;
+      } else {
+        ++Count;
+      }
+    }
+
+    InsertBlocks(Cur, Array + Size - Count, Count);
+  }
+
+  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
+  // group id will be considered first.
+  //
+  // The region mutex needs to be held while calling this method.
+  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) {
+    SizeClassInfo *Sci = getSizeClassInfo(ClassId);
+    if (Sci->FreeList.empty())
+      return nullptr;
+
+    SinglyLinkedList<TransferBatch> &Batches = Sci->FreeList.front()->Batches;
+    DCHECK(!Batches.empty());
+
+    TransferBatch *B = Batches.front();
+    Batches.pop_front();
+    DCHECK_NE(B, nullptr);
+    DCHECK_GT(B->getCount(), 0U);
+
+    if (Batches.empty()) {
+      BatchGroup *BG = Sci->FreeList.front();
+      Sci->FreeList.pop_front();
+
+      // We don't keep BatchGroup with zero blocks to avoid empty-checking while
+      // allocating. Note that block used by constructing BatchGroup is recorded
+      // as free blocks in the last element of BatchGroup::Batches. Which means,
+      // once we pop the last TransferBatch, the block is implicitly
+      // deallocated.
+      if (ClassId != SizeClassMap::BatchClassId)
+        C->deallocate(SizeClassMap::BatchClassId, BG);
+    }
+
+    return B;
+  }
+
+  NOINLINE bool populateFreeList(CacheT *C, uptr ClassId,
+                                 SizeClassInfo *Sci) {
     uptr Region;
     uptr Offset;
     // If the size-class currently has a region associated to it, use it. The
@@ -344,7 +561,7 @@ template <typename Config> class SizeClassAllocator32 {
       DCHECK_EQ(Sci->CurrentRegionAllocated, 0U);
       Region = allocateRegion(Sci, ClassId);
       if (UNLIKELY(!Region))
-        return nullptr;
+        return false;
       C->getStats().add(StatMapped, RegionSize);
       Sci->CurrentRegion = Region;
       Offset = 0;
@@ -378,20 +595,15 @@ template <typename Config> class SizeClassAllocator32 {
     if (ClassId != SizeClassMap::BatchClassId)
       shuffle(ShuffleArray, NumberOfBlocks, &Sci->RandState);
     for (u32 I = 0; I < NumberOfBlocks;) {
-      TransferBatch *B =
-          C->createBatch(ClassId, reinterpret_cast<void *>(ShuffleArray[I]));
-      if (UNLIKELY(!B))
-        return nullptr;
       // `MaxCount` is u16 so the result will also fit in u16.
       const u16 N = static_cast<u16>(Min<u32>(MaxCount, NumberOfBlocks - I));
-      B->setFromArray(&ShuffleArray[I], N);
-      Sci->FreeList.push_back(B);
+      // Note that the N blocks here may have different group ids. Given that
+      // it only happens when it crosses the group size boundary. Instead of
+      // sorting them, treat them as same group here to avoid sorting the
+      // almost-sorted blocks.
+      pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true);
       I += N;
     }
-    TransferBatch *B = Sci->FreeList.front();
-    Sci->FreeList.pop_front();
-    DCHECK(B);
-    DCHECK_GT(B->getCount(), 0);
 
     const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
@@ -407,7 +619,7 @@ template <typename Config> class SizeClassAllocator32 {
     }
     Sci->AllocatedUser += AllocatedUser;
 
-    return B;
+    return true;
   }
 
   void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
@@ -477,8 +689,12 @@ template <typename Config> class SizeClassAllocator32 {
     auto DecompactPtr = [](CompactPtrT CompactPtr) {
       return reinterpret_cast<uptr>(CompactPtr);
     };
-    releaseFreeMemoryToOS(Sci->FreeList, RegionSize, NumberOfRegions, BlockSize,
-                          Recorder, DecompactPtr, SkipRegion);
+    PageReleaseContext Context(BlockSize, RegionSize, NumberOfRegions);
+    for (BatchGroup &BG : Sci->FreeList)
+      Context.markFreeBlocks(BG.Batches, DecompactPtr, Base);
+
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
+
     if (Recorder.getReleasedRangesCount() > 0) {
       Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
       Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
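
The heart of the change is in pushBlocks()/pushBlocksImpl(): the
incoming array is insertion-sorted by group id (it is usually almost
sorted already), and the sorted array is then walked segment by segment
so that each run of same-group blocks is inserted into its BatchGroup
in one pass. A self-contained sketch of that sort-and-segment flow,
with small integers standing in for compact pointers and a stubbed
insertBlocks() instead of the real InsertBlocks lambda:

    #include <cstdio>
    #include <vector>

    constexpr unsigned GroupSizeLog = 4; // tiny 16-byte groups for the demo

    static unsigned compactPtrGroup(unsigned CompactPtr) {
      return CompactPtr >> GroupSizeLog;
    }

    // Stand-in for InsertBlocks(): just report one run of same-group blocks.
    static void insertBlocks(unsigned GroupId, const unsigned *Array,
                             unsigned Size) {
      printf("group %u gets %u block(s) starting with %u\n", GroupId, Size,
             Array[0]);
    }

    int main() {
      std::vector<unsigned> Array = {35, 3, 18, 7, 40, 21};
      const unsigned Size = static_cast<unsigned>(Array.size());

      // Insertion sort by group id, as in pushBlocks(); SameGroup stays true
      // only if nothing had to move.
      bool SameGroup = true;
      for (unsigned I = 1; I < Size; ++I) {
        unsigned Cur = Array[I];
        unsigned J = I;
        while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
          SameGroup = false;
          Array[J] = Array[J - 1];
          --J;
        }
        Array[J] = Cur;
      }

      if (SameGroup) {
        insertBlocks(compactPtrGroup(Array[0]), Array.data(), Size);
        return 0;
      }

      // Walk the sorted array and push each run of same-group blocks
      // together, mirroring the segment loop in pushBlocksImpl().
      unsigned Count = 1;
      for (unsigned I = 1; I < Size; ++I) {
        if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
          insertBlocks(compactPtrGroup(Array[I - 1]),
                       Array.data() + I - Count, Count);
          Count = 1;
        } else {
          ++Count;
        }
      }
      insertBlocks(compactPtrGroup(Array[Size - 1]),
                   Array.data() + Size - Count, Count);
      return 0;
    }

In the real code each run is appended to the TransferBatches owned by
the matching BatchGroup, creating the group and its first batch on
demand, and popBatchImpl() later pops batches from the group with the
smallest id first.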

diff  --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index d52a1d9a6c4e3..84b811a8b399d 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -45,10 +45,12 @@ template <typename Config> class SizeClassAllocator64 {
 public:
   typedef typename Config::PrimaryCompactPtrT CompactPtrT;
   static const uptr CompactPtrScale = Config::PrimaryCompactPtrScale;
+  static const uptr GroupSizeLog = Config::PrimaryGroupSizeLog;
   typedef typename Config::SizeClassMap SizeClassMap;
   typedef SizeClassAllocator64<Config> ThisT;
   typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
   typedef typename CacheT::TransferBatch TransferBatch;
+  typedef typename CacheT::BatchGroup BatchGroup;
 
   static uptr getSizeByClassId(uptr ClassId) {
     return (ClassId == SizeClassMap::BatchClassId)
@@ -99,25 +101,60 @@ template <typename Config> class SizeClassAllocator64 {
     DCHECK_LT(ClassId, NumClasses);
     RegionInfo *Region = getRegionInfo(ClassId);
     ScopedLock L(Region->Mutex);
-    TransferBatch *B = Region->FreeList.front();
-    if (B) {
-      Region->FreeList.pop_front();
-    } else {
-      B = populateFreeList(C, ClassId, Region);
-      if (UNLIKELY(!B))
+    TransferBatch *B = popBatchImpl(C, ClassId);
+    if (UNLIKELY(!B)) {
+      if (UNLIKELY(!populateFreeList(C, ClassId, Region)))
         return nullptr;
+      B = popBatchImpl(C, ClassId);
+      // if `populateFreeList` succeeded, we are supposed to get free blocks.
+      DCHECK_NE(B, nullptr);
     }
-    DCHECK_GT(B->getCount(), 0);
     Region->Stats.PoppedBlocks += B->getCount();
     return B;
   }
 
-  void pushBatch(uptr ClassId, TransferBatch *B) {
-    DCHECK_GT(B->getCount(), 0);
+  // Push the array of free blocks to the designated batch group.
+  void pushBlocks(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size) {
+    DCHECK_LT(ClassId, NumClasses);
+    DCHECK_GT(Size, 0);
+
     RegionInfo *Region = getRegionInfo(ClassId);
+    if (ClassId == SizeClassMap::BatchClassId) {
+      ScopedLock L(Region->Mutex);
+      // Constructing a batch group in the free list will use two blocks in
+      // BatchClassId. If we are pushing BatchClassId blocks, we will use the
+      // blocks in the array directly (can't delegate local cache which will
+      // cause a recursive allocation). However, The number of free blocks may
+      // be less than two. Therefore, populate the free list before inserting
+      // the blocks.
+      if (Size == 1 && UNLIKELY(!populateFreeList(C, ClassId, Region)))
+          return;
+      pushBlocksImpl(C, ClassId, Array, Size);
+      Region->Stats.PushedBlocks += Size;
+      return;
+    }
+
+    // TODO(chiahungduan): Consider not doing grouping if the group size is not
+    // greater than the block size with a certain scale.
+
+    // Sort the blocks so that blocks belonging to the same group can be pushed
+    // together.
+    bool SameGroup = true;
+    for (u32 I = 1; I < Size; ++I) {
+      CompactPtrT Cur = Array[I];
+      u32 J = I;
+      while (J > 0 && compactPtrGroup(Cur) < compactPtrGroup(Array[J - 1])) {
+        SameGroup = false;
+        Array[J] = Array[J - 1];
+        --J;
+      }
+      Array[J] = Cur;
+    }
+
     ScopedLock L(Region->Mutex);
-    Region->FreeList.push_front(B);
-    Region->Stats.PushedBlocks += B->getCount();
+    pushBlocksImpl(C, ClassId, Array, Size, SameGroup);
+
+    Region->Stats.PushedBlocks += Size;
     if (ClassId != SizeClassMap::BatchClassId)
       releaseToOSMaybe(Region, ClassId);
   }
@@ -292,7 +329,7 @@ template <typename Config> class SizeClassAllocator64 {
 
   struct UnpaddedRegionInfo {
     HybridMutex Mutex;
-    SinglyLinkedList<TransferBatch> FreeList;
+    SinglyLinkedList<BatchGroup> FreeList;
     uptr RegionBeg = 0;
     RegionStats Stats = {};
     u32 RandState = 0;
@@ -330,7 +367,192 @@ template <typename Config> class SizeClassAllocator64 {
     return Base + (static_cast<uptr>(CompactPtr) << CompactPtrScale);
   }
 
-  NOINLINE TransferBatch *populateFreeList(CacheT *C, uptr ClassId,
+  static uptr compactPtrGroup(CompactPtrT CompactPtr) {
+    return CompactPtr >> (GroupSizeLog - CompactPtrScale);
+  }
+
+  // Push the blocks to their batch group. The layout will be like,
+  //
+  // FreeList - > BG -> BG -> BG
+  //              |     |     |
+  //              v     v     v
+  //              TB    TB    TB
+  //              |
+  //              v
+  //              TB
+  //
+  // Each BlockGroup(BG) will associate with unique group id and the free blocks
+  // are managed by a list of TransferBatch(TB). To reduce the time of inserting
+  // blocks, BGs are sorted and the input `Array` are supposed to be sorted so
+  // that we can get better performance of maintaining sorted property.
+  // Use `SameGroup=true` to indicate that all blocks in the array are from the
+  // same group then we will skip checking the group id of each block.
+  //
+  // Note that this aims to have a better management of dirty pages, i.e., the
+  // RSS usage won't grow indefinitely. There's an exception that we may not put
+  // a block to its associated group. While populating new blocks, we may have
+  // blocks cross different groups. However, most cases will fall into same
+  // group and they are supposed to be popped soon. In that case, it's not worth
+  // sorting the array with the almost-sorted property. Therefore, we use
+  // `SameGroup=true` instead.
+  //
+  // The region mutex needs to be held while calling this method.
+  void pushBlocksImpl(CacheT *C, uptr ClassId, CompactPtrT *Array, u32 Size,
+                      bool SameGroup = false) {
+    DCHECK_GT(Size, 0U);
+    RegionInfo *Region = getRegionInfo(ClassId);
+
+    auto CreateGroup = [&](uptr GroupId) {
+      BatchGroup *BG = nullptr;
+      TransferBatch *TB = nullptr;
+      if (ClassId == SizeClassMap::BatchClassId) {
+        DCHECK_GE(Size, 2U);
+        BG = reinterpret_cast<BatchGroup *>(
+            decompactPtr(ClassId, Array[Size - 1]));
+        BG->Batches.clear();
+
+        TB = reinterpret_cast<TransferBatch *>(
+            decompactPtr(ClassId, Array[Size - 2]));
+        TB->clear();
+      } else {
+        BG = C->createGroup();
+        BG->Batches.clear();
+
+        TB = C->createBatch(ClassId, nullptr);
+        TB->clear();
+      }
+
+      BG->GroupId = GroupId;
+      BG->Batches.push_front(TB);
+      BG->MaxCachedPerBatch =
+          TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+
+      return BG;
+    };
+
+    auto InsertBlocks = [&](BatchGroup *BG, CompactPtrT *Array, u32 Size) {
+      SinglyLinkedList<TransferBatch> &Batches = BG->Batches;
+      TransferBatch *CurBatch = Batches.front();
+      DCHECK_NE(CurBatch, nullptr);
+
+      for (u32 I = 0; I < Size;) {
+        DCHECK_GE(BG->MaxCachedPerBatch, CurBatch->getCount());
+        u16 UnusedSlots = BG->MaxCachedPerBatch - CurBatch->getCount();
+        if (UnusedSlots == 0) {
+          CurBatch = C->createBatch(ClassId, reinterpret_cast<void *>(
+              decompactPtr(ClassId, Array[I])));
+          CurBatch->clear();
+          Batches.push_front(CurBatch);
+          UnusedSlots = BG->MaxCachedPerBatch;
+        }
+        // `UnusedSlots` is u16 so the result will be also fit in u16.
+        u16 AppendSize = static_cast<u16>(Min<u32>(UnusedSlots, Size - I));
+        CurBatch->appendFromArray(&Array[I], AppendSize);
+        I += AppendSize;
+      }
+    };
+
+    BatchGroup *Cur = Region->FreeList.front();
+
+    if (ClassId == SizeClassMap::BatchClassId) {
+      if (Cur == nullptr) {
+        // Don't need to classify BatchClassId.
+        Cur = CreateGroup(/*GroupId=*/0);
+        Region->FreeList.push_front(Cur);
+      }
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // In the following, `Cur` always points to the BatchGroup for blocks that
+    // will be pushed next. `Prev` is the element right before `Cur`.
+    BatchGroup *Prev = nullptr;
+
+    while (Cur != nullptr && compactPtrGroup(Array[0]) > Cur->GroupId) {
+      Prev = Cur;
+      Cur = Cur->Next;
+    }
+
+    if (Cur == nullptr || compactPtrGroup(Array[0]) != Cur->GroupId) {
+      Cur = CreateGroup(compactPtrGroup(Array[0]));
+      if (Prev == nullptr)
+        Region->FreeList.push_front(Cur);
+      else
+        Region->FreeList.insert(Prev, Cur);
+    }
+
+
+    // All the blocks are from the same group, just push without checking group
+    // id.
+    if (SameGroup) {
+      InsertBlocks(Cur, Array, Size);
+      return;
+    }
+
+    // The blocks are sorted by group id. Determine the segment of group and
+    // push them to their group together.
+    u32 Count = 1;
+    for (u32 I = 1; I < Size; ++I) {
+      if (compactPtrGroup(Array[I - 1]) != compactPtrGroup(Array[I])) {
+        DCHECK_EQ(compactPtrGroup(Array[I - 1]), Cur->GroupId);
+        InsertBlocks(Cur, Array + I - Count, Count);
+
+        while (Cur != nullptr && compactPtrGroup(Array[I]) > Cur->GroupId) {
+          Prev = Cur;
+          Cur = Cur->Next;
+        }
+
+        if (Cur == nullptr || compactPtrGroup(Array[I]) != Cur->GroupId) {
+          Cur = CreateGroup(compactPtrGroup(Array[I]));
+          DCHECK_NE(Prev, nullptr);
+          Region->FreeList.insert(Prev, Cur);
+        }
+
+        Count = 1;
+      } else {
+        ++Count;
+      }
+    }
+
+    InsertBlocks(Cur, Array + Size - Count, Count);
+  }
+
+  // Pop one TransferBatch from a BatchGroup. The BatchGroup with the smallest
+  // group id will be considered first.
+  //
+  // The region mutex needs to be held while calling this method.
+  TransferBatch *popBatchImpl(CacheT *C, uptr ClassId) {
+    RegionInfo *Region = getRegionInfo(ClassId);
+    if (Region->FreeList.empty())
+      return nullptr;
+
+    SinglyLinkedList<TransferBatch> &Batches =
+        Region->FreeList.front()->Batches;
+    DCHECK(!Batches.empty());
+
+    TransferBatch *B = Batches.front();
+    Batches.pop_front();
+    DCHECK_NE(B, nullptr);
+    DCHECK_GT(B->getCount(), 0U);
+
+    if (Batches.empty()) {
+      BatchGroup *BG = Region->FreeList.front();
+      Region->FreeList.pop_front();
+
+      // We don't keep BatchGroup with zero blocks to avoid empty-checking while
+      // allocating. Note that block used by constructing BatchGroup is recorded
+      // as free blocks in the last element of BatchGroup::Batches. Which means,
+      // once we pop the last TransferBatch, the block is implicitly
+      // deallocated.
+      if (ClassId != SizeClassMap::BatchClassId)
+        C->deallocate(SizeClassMap::BatchClassId, BG);
+    }
+
+    return B;
+  }
+
+
+  NOINLINE bool populateFreeList(CacheT *C, uptr ClassId,
                                            RegionInfo *Region) {
     const uptr Size = getSizeByClassId(ClassId);
     const u16 MaxCount = TransferBatch::getMaxCached(Size);
@@ -354,7 +576,7 @@ template <typename Config> class SizeClassAllocator64 {
               RegionSize >> 20, Size);
           Str.output();
         }
-        return nullptr;
+        return false;
       }
       if (MappedUser == 0)
         Region->Data = Data;
@@ -363,8 +585,9 @@ template <typename Config> class SizeClassAllocator64 {
               "scudo:primary",
               MAP_ALLOWNOMEM | MAP_RESIZABLE |
                   (useMemoryTagging<Config>(Options.load()) ? MAP_MEMTAG : 0),
-              &Region->Data)))
-        return nullptr;
+              &Region->Data))) {
+        return false;
+      }
       Region->MappedUser += MapSize;
       C->getStats().add(StatMapped, MapSize);
     }
@@ -387,27 +610,21 @@ template <typename Config> class SizeClassAllocator64 {
     if (ClassId != SizeClassMap::BatchClassId)
       shuffle(ShuffleArray, NumberOfBlocks, &Region->RandState);
     for (u32 I = 0; I < NumberOfBlocks;) {
-      TransferBatch *B =
-          C->createBatch(ClassId, reinterpret_cast<void *>(decompactPtrInternal(
-                                      CompactPtrBase, ShuffleArray[I])));
-      if (UNLIKELY(!B))
-        return nullptr;
       // `MaxCount` is u16 so the result will also fit in u16.
       const u16 N = static_cast<u16>(Min<u32>(MaxCount, NumberOfBlocks - I));
-      B->setFromArray(&ShuffleArray[I], N);
-      Region->FreeList.push_back(B);
+      // Note that the N blocks here may have different group ids. Given that
+      // it only happens when it crosses the group size boundary. Instead of
+      // sorting them, treat them as same group here to avoid sorting the
+      // almost-sorted blocks.
+      pushBlocksImpl(C, ClassId, &ShuffleArray[I], N, /*SameGroup=*/true);
       I += N;
     }
-    TransferBatch *B = Region->FreeList.front();
-    Region->FreeList.pop_front();
-    DCHECK(B);
-    DCHECK_GT(B->getCount(), 0);
 
     const uptr AllocatedUser = Size * NumberOfBlocks;
     C->getStats().add(StatFree, AllocatedUser);
     Region->AllocatedUser += AllocatedUser;
 
-    return B;
+    return true;
   }
 
   void getStats(ScopedString *Str, uptr ClassId, uptr Rss) {
@@ -473,8 +690,11 @@ template <typename Config> class SizeClassAllocator64 {
       return decompactPtrInternal(CompactPtrBase, CompactPtr);
     };
     auto SkipRegion = [](UNUSED uptr RegionIndex) { return false; };
-    releaseFreeMemoryToOS(Region->FreeList, Region->AllocatedUser, 1U,
-                          BlockSize, Recorder, DecompactPtr, SkipRegion);
+    PageReleaseContext Context(BlockSize, RegionSize, /*NumberOfRegions=*/1U);
+    for (BatchGroup &BG : Region->FreeList)
+      Context.markFreeBlocks(BG.Batches, DecompactPtr, Region->RegionBeg);
+
+    releaseFreeMemoryToOS(Context, Recorder, SkipRegion);
 
     if (Recorder.getReleasedRangesCount() > 0) {
       Region->ReleaseInfo.PushedBlocksAtLastRelease =
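
The 64-bit primary stores compact pointers that are already scaled down
by CompactPtrScale, so its compactPtrGroup() shifts by GroupSizeLog -
CompactPtrScale rather than by GroupSizeLog. A small sketch of that
round trip with illustrative values (the real ones come from the
config):

    #include <cstdint>
    #include <cstdio>

    // Illustrative values; e.g. the 64-bit DefaultConfig uses GroupSizeLog = 21
    // with a scale of 0, and AndroidConfig scales by SCUDO_MIN_ALIGNMENT_LOG.
    constexpr uint64_t GroupSizeLog = 20;   // 1 MB groups
    constexpr uint64_t CompactPtrScale = 4; // compact ptr = offset >> 4

    static uint64_t compactPtr(uint64_t RegionBeg, uint64_t Ptr) {
      return (Ptr - RegionBeg) >> CompactPtrScale;
    }

    // Same shift as SizeClassAllocator64::compactPtrGroup(): the scale has
    // already been applied, so shift by the difference.
    static uint64_t compactPtrGroup(uint64_t CompactPtr) {
      return CompactPtr >> (GroupSizeLog - CompactPtrScale);
    }

    int main() {
      const uint64_t RegionBeg = 0x7f0000000000;
      // Blocks inside the same 1 MB range share a group id ...
      printf("%llu %llu\n",
             (unsigned long long)compactPtrGroup(
                 compactPtr(RegionBeg, RegionBeg + 0x100)),
             (unsigned long long)compactPtrGroup(
                 compactPtr(RegionBeg, RegionBeg + 0xFFFF0)));
      // ... and the next 1 MB range maps to the next group.
      printf("%llu\n",
             (unsigned long long)compactPtrGroup(
                 compactPtr(RegionBeg, RegionBeg + (1ULL << GroupSizeLog))));
      return 0;
    }

Because the compact-pointer base is not necessarily group-aligned, the
blocks handed out for one logical group can straddle two adjacent
group-sized address ranges, which is why the MemoryGroup test below
only asserts a spread of at most twice the group size.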

diff  --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 49c1ba9f520c0..5b9d47151c7f5 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -525,6 +525,7 @@ struct DeathConfig {
   static const scudo::uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+  static const scudo::uptr PrimaryGroupSizeLog = 18;
 
   typedef scudo::MapAllocatorNoCache SecondaryCache;
   template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>;

diff  --git a/compiler-rt/lib/scudo/standalone/tests/list_test.cpp b/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
index 8e139916d0588..140ca027ae928 100644
--- a/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/list_test.cpp
@@ -161,6 +161,10 @@ TEST(ScudoListTest, SinglyLinkedList) {
   setList(&L1, X);
   checkList(&L1, X);
 
+  setList(&L1, X, Y);
+  L1.insert(X, Z);
+  checkList(&L1, X, Z, Y);
+
   setList(&L1, X, Y, Z);
   setList(&L2, A, B, C);
   L1.append_back(&L2);

diff  --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
index d70dcc9e3db37..a6262d2620c5a 100644
--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
@@ -12,8 +12,11 @@
 #include "primary64.h"
 #include "size_class_map.h"
 
+#include <algorithm>
+#include <chrono>
 #include <condition_variable>
 #include <mutex>
+#include <random>
 #include <stdlib.h>
 #include <thread>
 #include <vector>
@@ -24,6 +27,7 @@
 
 struct TestConfig1 {
   static const scudo::uptr PrimaryRegionSizeLog = 18U;
+  static const scudo::uptr PrimaryGroupSizeLog = 18U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -40,6 +44,7 @@ struct TestConfig2 {
 #else
   static const scudo::uptr PrimaryRegionSizeLog = 24U;
 #endif
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = false;
@@ -56,6 +61,7 @@ struct TestConfig3 {
 #else
   static const scudo::uptr PrimaryRegionSizeLog = 24U;
 #endif
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
   static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
   static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
   static const bool MaySupportMemoryTagging = true;
@@ -65,6 +71,23 @@ struct TestConfig3 {
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
 };
 
+struct TestConfig4 {
+#if defined(__mips__)
+  // Unable to allocate greater size on QEMU-user.
+  static const scudo::uptr PrimaryRegionSizeLog = 23U;
+#else
+  static const scudo::uptr PrimaryRegionSizeLog = 24U;
+#endif
+  static const scudo::s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+  static const scudo::s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+  static const bool MaySupportMemoryTagging = true;
+  static const scudo::uptr PrimaryCompactPtrScale = 3U;
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
+  typedef scudo::u32 PrimaryCompactPtrT;
+  static const bool PrimaryEnableRandomOffset = true;
+  static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+};
+
 template <typename BaseConfig, typename SizeClassMapT>
 struct Config : public BaseConfig {
   using SizeClassMap = SizeClassMapT;
@@ -100,7 +123,8 @@ template <class BaseConfig> struct ScudoPrimaryTest : public Test {};
 #define SCUDO_TYPED_TEST_ALL_TYPES(FIXTURE, NAME)                              \
   SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig1)                            \
   SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig2)                            \
-  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig3)                            \
+  SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TestConfig4)
 #endif
 
 #define SCUDO_TYPED_TEST_TYPE(FIXTURE, NAME, TYPE)                             \
@@ -153,6 +177,7 @@ struct SmallRegionsConfig {
   static const scudo::uptr PrimaryCompactPtrScale = 0;
   static const bool PrimaryEnableRandomOffset = true;
   static const scudo::uptr PrimaryMapSizeIncrement = 1UL << 18;
+  static const scudo::uptr PrimaryGroupSizeLog = 20U;
 };
 
 // The 64-bit SizeClassAllocator can be easily OOM'd with small region sizes.
@@ -170,6 +195,8 @@ TEST(ScudoPrimaryTest, Primary64OOM) {
   std::vector<TransferBatch *> Batches;
   const scudo::uptr ClassId = Primary::SizeClassMap::LargestClassId;
   const scudo::uptr Size = Primary::getSizeByClassId(ClassId);
+  typename Primary::CacheT::CompactPtrT Blocks[TransferBatch::MaxNumCached];
+
   for (scudo::uptr I = 0; I < 10000U; I++) {
     TransferBatch *B = Allocator.popBatch(&Cache, ClassId);
     if (!B) {
@@ -181,8 +208,11 @@ TEST(ScudoPrimaryTest, Primary64OOM) {
     Batches.push_back(B);
   }
   while (!Batches.empty()) {
-    Allocator.pushBatch(ClassId, Batches.back());
+    TransferBatch *B = Batches.back();
     Batches.pop_back();
+    B->copyToArray(Blocks);
+    Allocator.pushBlocks(&Cache, ClassId, Blocks, B->getCount());
+    Cache.deallocate(Primary::SizeClassMap::BatchClassId, B);
   }
   Cache.destroy(nullptr);
   Allocator.releaseToOS();
@@ -294,3 +324,46 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, ReleaseToOS) {
   Cache.destroy(nullptr);
   EXPECT_GT(Allocator->releaseToOS(), 0U);
 }
+
+SCUDO_TYPED_TEST(ScudoPrimaryTest, MemoryGroup) {
+  using Primary = TestAllocator<TypeParam, scudo::DefaultSizeClassMap>;
+  std::unique_ptr<Primary> Allocator(new Primary);
+  Allocator->init(/*ReleaseToOsInterval=*/-1);
+  typename Primary::CacheT Cache;
+  Cache.init(nullptr, Allocator.get());
+  const scudo::uptr Size = 32U;
+  const scudo::uptr ClassId = Primary::SizeClassMap::getClassIdBySize(Size);
+
+  // We will allocate 4 times the group size memory and release all of them. We
+  // expect the free blocks will be classified with groups. Then we will
+  // allocate the same amount of memory as group size and expect the blocks will
+  // have the max address difference smaller or equal to 2 times the group size.
+  // Note that it isn't necessary to be in the range of single group size
+  // because the way we get the group id is doing compact pointer shifting.
+  // According to configuration, the compact pointer may not align to group
+  // size. As a result, the blocks can cross two groups at most.
+  const scudo::uptr GroupSizeMem = (1ULL << Primary::GroupSizeLog);
+  const scudo::uptr PeakAllocationMem = 4 * GroupSizeMem;
+  const scudo::uptr PeakNumberOfAllocations = PeakAllocationMem / Size;
+  const scudo::uptr FinalNumberOfAllocations = GroupSizeMem / Size;
+  std::vector<scudo::uptr> Blocks;
+  std::mt19937 R;
+
+  for (scudo::uptr I = 0; I < PeakNumberOfAllocations; ++I)
+    Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
+
+  std::shuffle(Blocks.begin(), Blocks.end(), R);
+
+  // Release all the allocated blocks, including those held by local cache.
+  while (!Blocks.empty()) {
+    Cache.deallocate(ClassId, reinterpret_cast<void *>(Blocks.back()));
+    Blocks.pop_back();
+  }
+  Cache.drain();
+
+  for (scudo::uptr I = 0; I < FinalNumberOfAllocations; ++I)
+    Blocks.push_back(reinterpret_cast<scudo::uptr>(Cache.allocate(ClassId)));
+
+  EXPECT_LE(*std::max_element(Blocks.begin(), Blocks.end()) -
+            *std::min_element(Blocks.begin(), Blocks.end()), GroupSizeMem * 2);
+}


        

