[compiler-rt] 998334d - [scudo][standalone] Change the release loop for efficiency purposes

Kostya Kortchinsky via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 24 10:36:31 PDT 2020


Author: Kostya Kortchinsky
Date: 2020-07-24T10:35:49-07:00
New Revision: 998334da2b1536e7c8f11c560770c8d4cfacb354

URL: https://github.com/llvm/llvm-project/commit/998334da2b1536e7c8f11c560770c8d4cfacb354
DIFF: https://github.com/llvm/llvm-project/commit/998334da2b1536e7c8f11c560770c8d4cfacb354.diff

LOG: [scudo][standalone] Change the release loop for efficiency purposes

Summary:
On 32-bit, the release algorithm loops multiple times over the freelist for a
size class, which led to a decrease in performance when there were a lot of
free blocks.

This changes the release functions to loop only once over the freelist, at the
cost of using a little bit more memory for the release process: instead of
working on one region at a time, we pass the whole memory area covered by all
the regions for a given size class, and work on sub-areas of `RegionSize`
within that large area. On 64-bit, we just have one sub-area encompassing the
whole region. Of course, not all the sub-areas within that large memory area
will belong to the class ID we are working on, but those will simply be left
untouched (which will not add to the RSS during the release process).
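
To illustrate the indexing at the heart of the single pass, here is a
minimal stand-alone sketch (not the committed code; the page size, region
size, and offset are made-up values for the example) of how a free block's
offset within the combined area maps to a (region, page) counter, as the
release.h changes below do:

#include <cstdint>
#include <cstdio>
#include <vector>

using uptr = uintptr_t;

int main() {
  const uptr PageSizeLog = 12;        // assume 4K pages
  const uptr RegionSize = 1UL << 20;  // assume 1M regions (32-bit layout)
  const uptr NumberOfRegions = 4;     // regions spanned for this class ID
  const uptr PagesPerRegion = RegionSize >> PageSizeLog;

  // Flat per-region page counters: the unpacked equivalent of
  // PackedCounterArray's Buffer[Region * SizePerRegion + Index].
  std::vector<uptr> Counters(NumberOfRegions * PagesPerRegion, 0);

  // Count one hypothetical free block at offset P from the area base.
  const uptr P = 3 * RegionSize + (5UL << PageSizeLog) + 128;
  const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
  const uptr PInRegion = P - RegionIndex * RegionSize;
  Counters[RegionIndex * PagesPerRegion + (PInRegion >> PageSizeLog)]++;

  printf("offset %zu -> region %zu, page %zu\n", (size_t)P,
         (size_t)RegionIndex, (size_t)(PInRegion >> PageSizeLog));
  return 0;
}

The committed PackedCounterArray packs these counters into bit fields rather
than spending a full word per page; see the release.h changes.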

Reviewers: pcc, cferris, hctim, eugenis

Subscribers: llvm-commits, #sanitizers

Tags: #sanitizers

Differential Revision: https://reviews.llvm.org/D83993

Added: 
    

Modified: 
    compiler-rt/lib/scudo/standalone/primary32.h
    compiler-rt/lib/scudo/standalone/primary64.h
    compiler-rt/lib/scudo/standalone/release.h
    compiler-rt/lib/scudo/standalone/tests/release_test.cpp

Removed: 
    


################################################################################
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 2ee0f6c600ab..321cf92fae30 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -467,28 +467,33 @@ class SizeClassAllocator32 {
       }
     }
 
-    // TODO(kostyak): currently not ideal as we loop over all regions and
-    // iterate multiple times over the same freelist if a ClassId spans multiple
-    // regions. But it will have to do for now.
-    uptr TotalReleasedBytes = 0;
-    const uptr MaxSize = (RegionSize / BlockSize) * BlockSize;
+    DCHECK_GT(MinRegionIndex, 0U);
+    uptr First = 0;
     for (uptr I = MinRegionIndex; I <= MaxRegionIndex; I++) {
       if (PossibleRegions[I] - 1U == ClassId) {
-        const uptr Region = I * RegionSize;
-        // If the region is the one currently associated to the size-class, we
-        // only need to release up to CurrentRegionAllocated, MaxSize otherwise.
-        const uptr Size = (Region == Sci->CurrentRegion)
-                              ? Sci->CurrentRegionAllocated
-                              : MaxSize;
-        ReleaseRecorder Recorder(Region);
-        releaseFreeMemoryToOS(Sci->FreeList, Region, Size, BlockSize,
-                              &Recorder);
-        if (Recorder.getReleasedRangesCount() > 0) {
-          Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
-          Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
-          Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
-          TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
-        }
+        First = I;
+        break;
+      }
+    }
+    uptr Last = 0;
+    for (uptr I = MaxRegionIndex; I >= MinRegionIndex; I--) {
+      if (PossibleRegions[I] - 1U == ClassId) {
+        Last = I;
+        break;
+      }
+    }
+    uptr TotalReleasedBytes = 0;
+    if (First && Last) {
+      const uptr Base = First * RegionSize;
+      const uptr NumberOfRegions = Last - First + 1U;
+      ReleaseRecorder Recorder(Base);
+      releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions,
+                            BlockSize, &Recorder);
+      if (Recorder.getReleasedRangesCount() > 0) {
+        Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
+        Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
+        Sci->ReleaseInfo.LastReleasedBytes = Recorder.getReleasedBytes();
+        TotalReleasedBytes += Sci->ReleaseInfo.LastReleasedBytes;
       }
     }
     Sci->ReleaseInfo.LastReleaseAtNs = getMonotonicTime();

diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index 01e674bf3fba..e37dc4951f23 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -481,7 +481,7 @@ class SizeClassAllocator64 {
 
     ReleaseRecorder Recorder(Region->RegionBeg, &Region->Data);
     releaseFreeMemoryToOS(Region->FreeList, Region->RegionBeg,
-                          Region->AllocatedUser, BlockSize, &Recorder);
+                          Region->AllocatedUser, 1U, BlockSize, &Recorder);
 
     if (Recorder.getReleasedRangesCount() > 0) {
       Region->ReleaseInfo.PushedBlocksAtLastRelease =

diff --git a/compiler-rt/lib/scudo/standalone/release.h b/compiler-rt/lib/scudo/standalone/release.h
index fd55ea24132e..748e1c001153 100644
--- a/compiler-rt/lib/scudo/standalone/release.h
+++ b/compiler-rt/lib/scudo/standalone/release.h
@@ -49,7 +49,10 @@ class ReleaseRecorder {
 // incremented past MaxValue.
 class PackedCounterArray {
 public:
-  PackedCounterArray(uptr NumCounters, uptr MaxValue) : N(NumCounters) {
+  PackedCounterArray(uptr NumberOfRegions, uptr CountersPerRegion,
+                     uptr MaxValue)
+      : Regions(NumberOfRegions), NumCounters(CountersPerRegion) {
+    CHECK_GT(Regions, 0);
     CHECK_GT(NumCounters, 0);
     CHECK_GT(MaxValue, 0);
     constexpr uptr MaxCounterBits = sizeof(*Buffer) * 8UL;
@@ -66,9 +69,10 @@ class PackedCounterArray {
     PackingRatioLog = getLog2(PackingRatio);
     BitOffsetMask = PackingRatio - 1;
 
-    BufferSize = (roundUpTo(N, static_cast<uptr>(1U) << PackingRatioLog) >>
-                  PackingRatioLog) *
-                 sizeof(*Buffer);
+    SizePerRegion =
+        roundUpTo(NumCounters, static_cast<uptr>(1U) << PackingRatioLog) >>
+        PackingRatioLog;
+    BufferSize = SizePerRegion * sizeof(*Buffer) * Regions;
     if (BufferSize <= (StaticBufferCount * sizeof(Buffer[0])) &&
         Mutex.tryLock()) {
       Buffer = &StaticBuffer[0];
@@ -89,41 +93,45 @@ class PackedCounterArray {
 
   bool isAllocated() const { return !!Buffer; }
 
-  uptr getCount() const { return N; }
+  uptr getCount() const { return NumCounters; }
 
-  uptr get(uptr I) const {
-    DCHECK_LT(I, N);
+  uptr get(uptr Region, uptr I) const {
+    DCHECK_LT(Region, Regions);
+    DCHECK_LT(I, NumCounters);
     const uptr Index = I >> PackingRatioLog;
     const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
-    return (Buffer[Index] >> BitOffset) & CounterMask;
+    return (Buffer[Region * SizePerRegion + Index] >> BitOffset) & CounterMask;
   }
 
-  void inc(uptr I) const {
-    DCHECK_LT(get(I), CounterMask);
+  void inc(uptr Region, uptr I) const {
+    DCHECK_LT(get(Region, I), CounterMask);
     const uptr Index = I >> PackingRatioLog;
     const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
     DCHECK_LT(BitOffset, SCUDO_WORDSIZE);
-    Buffer[Index] += static_cast<uptr>(1U) << BitOffset;
+    Buffer[Region * SizePerRegion + Index] += static_cast<uptr>(1U)
+                                              << BitOffset;
   }
 
-  void incRange(uptr From, uptr To) const {
+  void incRange(uptr Region, uptr From, uptr To) const {
     DCHECK_LE(From, To);
-    const uptr Top = Min(To + 1, N);
+    const uptr Top = Min(To + 1, NumCounters);
     for (uptr I = From; I < Top; I++)
-      inc(I);
+      inc(Region, I);
   }
 
   uptr getBufferSize() const { return BufferSize; }
 
-  static const uptr StaticBufferCount = 1024U;
+  static const uptr StaticBufferCount = 2048U;
 
 private:
-  const uptr N;
+  const uptr Regions;
+  const uptr NumCounters;
   uptr CounterSizeBitsLog;
   uptr CounterMask;
   uptr PackingRatioLog;
   uptr BitOffsetMask;
 
+  uptr SizePerRegion;
   uptr BufferSize;
   uptr *Buffer;
 
@@ -169,7 +177,8 @@ template <class ReleaseRecorderT> class FreePagesRangeTracker {
 template <class TransferBatchT, class ReleaseRecorderT>
 NOINLINE void
 releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
-                      uptr Size, uptr BlockSize, ReleaseRecorderT *Recorder) {
+                      uptr RegionSize, uptr NumberOfRegions, uptr BlockSize,
+                      ReleaseRecorderT *Recorder) {
   const uptr PageSize = getPageSizeCached();
 
   // Figure out the number of chunks per page and whether we can take a fast
@@ -206,13 +215,15 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
     }
   }
 
-  const uptr PagesCount = roundUpTo(Size, PageSize) / PageSize;
-  PackedCounterArray Counters(PagesCount, FullPagesBlockCountMax);
+  const uptr PagesCount = roundUpTo(RegionSize, PageSize) / PageSize;
+  PackedCounterArray Counters(NumberOfRegions, PagesCount,
+                              FullPagesBlockCountMax);
   if (!Counters.isAllocated())
     return;
 
   const uptr PageSizeLog = getLog2(PageSize);
-  const uptr RoundedSize = PagesCount << PageSizeLog;
+  const uptr RoundedRegionSize = PagesCount << PageSizeLog;
+  const uptr RoundedSize = NumberOfRegions * RoundedRegionSize;
 
   // Iterate over free chunks and count how many free chunks affect each
   // allocated page.
@@ -228,14 +239,17 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
       for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
         const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
         // This takes care of P < Base and P >= Base + RoundedSize.
-        if (P < RoundedSize)
-          Counters.inc(P >> PageSizeLog);
+        if (P < RoundedSize) {
+          const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
+          const uptr PInRegion = P - RegionIndex * RegionSize;
+          Counters.inc(RegionIndex, PInRegion >> PageSizeLog);
+        }
       }
     }
-    for (uptr P = Size; P < RoundedSize; P += BlockSize)
-      Counters.inc(P >> PageSizeLog);
   } else {
     // In all other cases chunks might affect more than one page.
+    DCHECK_GE(RegionSize, BlockSize);
+    const uptr LastBlockInRegion = ((RegionSize / BlockSize) - 1U) * BlockSize;
     for (const auto &It : FreeList) {
       // See TransferBatch comment above.
       const bool IsTransferBatch =
@@ -244,13 +258,24 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
       for (u32 I = IsTransferBatch ? 1 : 0; I < It.getCount(); I++) {
         const uptr P = reinterpret_cast<uptr>(It.get(I)) - Base;
         // This takes care of P < Base and P >= Base + RoundedSize.
-        if (P < RoundedSize)
-          Counters.incRange(P >> PageSizeLog,
-                            (P + BlockSize - 1) >> PageSizeLog);
+        if (P < RoundedSize) {
+          const uptr RegionIndex = NumberOfRegions == 1U ? 0 : P / RegionSize;
+          uptr PInRegion = P - RegionIndex * RegionSize;
+          Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
+                            (PInRegion + BlockSize - 1) >> PageSizeLog);
+          // The last block in a region might straddle a page, so if it's
+          // free, we mark the following "pretend" memory block(s) as free.
+          if (PInRegion == LastBlockInRegion) {
+            PInRegion += BlockSize;
+            while (PInRegion < RoundedRegionSize) {
+              Counters.incRange(RegionIndex, PInRegion >> PageSizeLog,
+                                (PInRegion + BlockSize - 1) >> PageSizeLog);
+              PInRegion += BlockSize;
+            }
+          }
+        }
       }
     }
-    for (uptr P = Size; P < RoundedSize; P += BlockSize)
-      Counters.incRange(P >> PageSizeLog, (P + BlockSize - 1) >> PageSizeLog);
   }
 
   // Iterate over pages detecting ranges of pages with chunk Counters equal
@@ -258,8 +283,10 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
   FreePagesRangeTracker<ReleaseRecorderT> RangeTracker(Recorder);
   if (SameBlockCountPerPage) {
     // Fast path, every page has the same number of chunks affecting it.
-    for (uptr I = 0; I < Counters.getCount(); I++)
-      RangeTracker.processNextPage(Counters.get(I) == FullPagesBlockCountMax);
+    for (uptr I = 0; I < NumberOfRegions; I++)
+      for (uptr J = 0; J < PagesCount; J++)
+        RangeTracker.processNextPage(Counters.get(I, J) ==
+                                     FullPagesBlockCountMax);
   } else {
     // Slow path, go through the pages keeping count how many chunks affect
     // each page.
@@ -270,23 +297,25 @@ releaseFreeMemoryToOS(const IntrusiveList<TransferBatchT> &FreeList, uptr Base,
     // except the first and the last one) and then the last chunk size, adding
     // up the number of chunks on the current page and checking on every step
     // whether the page boundary was crossed.
-    uptr PrevPageBoundary = 0;
-    uptr CurrentBoundary = 0;
-    for (uptr I = 0; I < Counters.getCount(); I++) {
-      const uptr PageBoundary = PrevPageBoundary + PageSize;
-      uptr BlocksPerPage = Pn;
-      if (CurrentBoundary < PageBoundary) {
-        if (CurrentBoundary > PrevPageBoundary)
-          BlocksPerPage++;
-        CurrentBoundary += Pnc;
+    for (uptr I = 0; I < NumberOfRegions; I++) {
+      uptr PrevPageBoundary = 0;
+      uptr CurrentBoundary = 0;
+      for (uptr J = 0; J < PagesCount; J++) {
+        const uptr PageBoundary = PrevPageBoundary + PageSize;
+        uptr BlocksPerPage = Pn;
         if (CurrentBoundary < PageBoundary) {
-          BlocksPerPage++;
-          CurrentBoundary += BlockSize;
+          if (CurrentBoundary > PrevPageBoundary)
+            BlocksPerPage++;
+          CurrentBoundary += Pnc;
+          if (CurrentBoundary < PageBoundary) {
+            BlocksPerPage++;
+            CurrentBoundary += BlockSize;
+          }
         }
-      }
-      PrevPageBoundary = PageBoundary;
+        PrevPageBoundary = PageBoundary;
 
-      RangeTracker.processNextPage(Counters.get(I) == BlocksPerPage);
+        RangeTracker.processNextPage(Counters.get(I, J) == BlocksPerPage);
+      }
     }
   }
   RangeTracker.finish();

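To make the new two-dimensional counter addressing above concrete, here is a
minimal model (assuming 4-bit counters and 64-bit words; the committed class
additionally derives the counter width from MaxValue and manages the static
buffer) of how get() and inc() locate a counter:

#include <cstdint>
#include <cstdio>
#include <vector>

using uptr = uintptr_t;

int main() {
  // 4-bit counters packed 16 to a 64-bit word.
  const uptr CounterSizeBits = 4;
  const uptr CounterSizeBitsLog = 2;  // log2(CounterSizeBits)
  const uptr CounterMask = (1UL << CounterSizeBits) - 1;
  const uptr PackingRatio = (sizeof(uptr) * 8) / CounterSizeBits;
  const uptr PackingRatioLog = 4;     // log2(PackingRatio)
  const uptr BitOffsetMask = PackingRatio - 1;

  // Two regions of 100 counters each, stored back to back.
  const uptr Regions = 2, NumCounters = 100;
  const uptr SizePerRegion =
      (NumCounters + PackingRatio - 1) >> PackingRatioLog;  // words/region
  std::vector<uptr> Buffer(SizePerRegion * Regions, 0);

  // inc(Region=1, I=37): pick the word, then the bit field within it.
  const uptr Region = 1, I = 37;
  const uptr Index = I >> PackingRatioLog;                   // word 2
  const uptr BitOffset = (I & BitOffsetMask) << CounterSizeBitsLog;
  Buffer[Region * SizePerRegion + Index] += 1UL << BitOffset;

  // get(Region=1, I=37)
  const uptr V =
      (Buffer[Region * SizePerRegion + Index] >> BitOffset) & CounterMask;
  printf("counter value: %zu\n", (size_t)V);  // prints 1
  return 0;
}
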
diff --git a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp
index a7478f47479d..a693b97f80da 100644
--- a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp
@@ -21,14 +21,14 @@
 TEST(ScudoReleaseTest, PackedCounterArray) {
   for (scudo::uptr I = 0; I < SCUDO_WORDSIZE; I++) {
     // Various valid counter's max values packed into one word.
-    scudo::PackedCounterArray Counters2N(1, 1UL << I);
+    scudo::PackedCounterArray Counters2N(1U, 1U, 1UL << I);
     EXPECT_EQ(sizeof(scudo::uptr), Counters2N.getBufferSize());
     // Check the "all bit set" values too.
-    scudo::PackedCounterArray Counters2N1_1(1, ~0UL >> I);
+    scudo::PackedCounterArray Counters2N1_1(1U, 1U, ~0UL >> I);
     EXPECT_EQ(sizeof(scudo::uptr), Counters2N1_1.getBufferSize());
     // Verify the packing ratio, the counter is Expected to be packed into the
     // closest power of 2 bits.
-    scudo::PackedCounterArray Counters(SCUDO_WORDSIZE, 1UL << I);
+    scudo::PackedCounterArray Counters(1U, SCUDO_WORDSIZE, 1UL << I);
     EXPECT_EQ(sizeof(scudo::uptr) * scudo::roundUpToPowerOfTwo(I + 1),
               Counters.getBufferSize());
   }
@@ -38,19 +38,20 @@ TEST(ScudoReleaseTest, PackedCounterArray) {
     // Make sure counters request one memory page for the buffer.
     const scudo::uptr NumCounters =
         (scudo::getPageSizeCached() / 8) * (SCUDO_WORDSIZE >> I);
-    scudo::PackedCounterArray Counters(NumCounters, 1UL << ((1UL << I) - 1));
-    Counters.inc(0);
+    scudo::PackedCounterArray Counters(1U, NumCounters,
+                                       1UL << ((1UL << I) - 1));
+    Counters.inc(0U, 0U);
     for (scudo::uptr C = 1; C < NumCounters - 1; C++) {
-      EXPECT_EQ(0UL, Counters.get(C));
-      Counters.inc(C);
-      EXPECT_EQ(1UL, Counters.get(C - 1));
+      EXPECT_EQ(0UL, Counters.get(0U, C));
+      Counters.inc(0U, C);
+      EXPECT_EQ(1UL, Counters.get(0U, C - 1));
     }
-    EXPECT_EQ(0UL, Counters.get(NumCounters - 1));
-    Counters.inc(NumCounters - 1);
+    EXPECT_EQ(0UL, Counters.get(0U, NumCounters - 1));
+    Counters.inc(0U, NumCounters - 1);
     if (I > 0) {
-      Counters.incRange(0, NumCounters - 1);
+      Counters.incRange(0u, 0U, NumCounters - 1);
       for (scudo::uptr C = 0; C < NumCounters; C++)
-        EXPECT_EQ(2UL, Counters.get(C));
+        EXPECT_EQ(2UL, Counters.get(0U, C));
     }
   }
 }
@@ -190,7 +191,7 @@ template <class SizeClassMap> void testReleaseFreeMemoryToOS() {
 
     // Release the memory.
     ReleasedPagesRecorder Recorder;
-    releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, BlockSize,
+    releaseFreeMemoryToOS(FreeList, 0, MaxBlocks * BlockSize, 1U, BlockSize,
                           &Recorder);
 
     // Verify that there are no released pages touched by used chunks and all

