[compiler-rt] r300861 - [scudo] Remove GetActuallyAllocatedSize calls from the fast path

Kostya Kortchinsky via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 20 11:07:18 PDT 2017


Author: cryptoad
Date: Thu Apr 20 13:07:17 2017
New Revision: 300861

URL: http://llvm.org/viewvc/llvm-project?rev=300861&view=rev
Log:
[scudo] Remove GetActuallyAllocatedSize calls from the fast path

Summary:
GetActuallyAllocatedSize is actually expensive. In order to avoid calling this
function in the malloc/free fast path, we change the Scudo chunk header to
store the size of the chunk if it comes from the Primary, or the amount of
unused bytes if it comes from the Secondary. This way, we only have to call the
culprit function for Secondary backed allocations (and still in realloc).

The performance gain on a single-threaded pure malloc/free benchmark exercising
the Primary allocator is above 5%.

Reviewers: alekseyshl, kcc, dvyukov

Reviewed By: dvyukov

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D32299

Modified:
    compiler-rt/trunk/lib/scudo/scudo_allocator.cpp
    compiler-rt/trunk/lib/scudo/scudo_allocator.h

Modified: compiler-rt/trunk/lib/scudo/scudo_allocator.cpp
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/scudo/scudo_allocator.cpp?rev=300861&r1=300860&r2=300861&view=diff
==============================================================================
--- compiler-rt/trunk/lib/scudo/scudo_allocator.cpp (original)
+++ compiler-rt/trunk/lib/scudo/scudo_allocator.cpp Thu Apr 20 13:07:17 2017
@@ -341,14 +341,14 @@ struct ScudoAllocator {
       dieWithMessage("ERROR: the maximum possible offset doesn't fit in the "
                      "header\n");
     }
-    // Verify that we can fit the maximum amount of unused bytes in the header.
-    // Given that the Secondary fits the allocation to a page, the worst case
-    // scenario happens in the Primary. It will depend on the second to last
-    // and last class sizes, as well as the dynamic base for the Primary. The
-    // following is an over-approximation that works for our needs.
-    uptr MaxUnusedBytes = SizeClassMap::kMaxSize - 1 - AlignedChunkHeaderSize;
-    Header.UnusedBytes = MaxUnusedBytes;
-    if (Header.UnusedBytes != MaxUnusedBytes) {
+    // Verify that we can fit the maximum size or amount of unused bytes in the
+    // header. Given that the Secondary fits the allocation to a page, the worst
+    // case scenario happens in the Primary. It will depend on the second to
+    // last and last class sizes, as well as the dynamic base for the Primary.
+    // The following is an over-approximation that works for our needs.
+    uptr MaxSizeOrUnusedBytes = SizeClassMap::kMaxSize - 1;
+    Header.SizeOrUnusedBytes = MaxSizeOrUnusedBytes;
+    if (Header.SizeOrUnusedBytes != MaxSizeOrUnusedBytes) {
       dieWithMessage("ERROR: the maximum possible unused bytes doesn't fit in "
                      "the header\n");
     }
@@ -428,11 +428,9 @@ struct ScudoAllocator {
         NeededSize -= Alignment;
     }
 
-    uptr ActuallyAllocatedSize = BackendAllocator.GetActuallyAllocatedSize(
-        reinterpret_cast<void *>(AllocBeg));
     // If requested, we will zero out the entire contents of the returned chunk.
     if ((ForceZeroContents || ZeroContents) && FromPrimary)
-       memset(Ptr, 0, ActuallyAllocatedSize);
+       memset(Ptr, 0, BackendAllocator.GetActuallyAllocatedSize(Ptr));
 
     uptr UserBeg = AllocBeg + AlignedChunkHeaderSize;
     if (!IsAligned(UserBeg, Alignment))
@@ -443,8 +441,18 @@ struct ScudoAllocator {
     uptr Offset = UserBeg - AlignedChunkHeaderSize - AllocBeg;
     Header.Offset = Offset >> MinAlignmentLog;
     Header.AllocType = Type;
-    Header.UnusedBytes = ActuallyAllocatedSize - Offset -
-        AlignedChunkHeaderSize - Size;
+    if (FromPrimary) {
+      Header.FromPrimary = FromPrimary;
+      Header.SizeOrUnusedBytes = Size;
+    } else {
+      // The secondary fits the allocations to a page, so the amount of unused
+      // bytes is the difference between the end of the user allocation and the
+      // next page boundary.
+      uptr PageSize = GetPageSizeCached();
+      uptr TrailingBytes = (UserBeg + Size) & (PageSize - 1);
+      if (TrailingBytes)
+        Header.SizeOrUnusedBytes = PageSize - TrailingBytes;
+    }
     Header.Salt = static_cast<u8>(Prng.getNext());
     getScudoChunk(UserBeg)->storeHeader(&Header);
     void *UserPtr = reinterpret_cast<void *>(UserBeg);
@@ -482,8 +490,8 @@ struct ScudoAllocator {
         }
       }
     }
-    uptr UsableSize = Chunk->getUsableSize(&OldHeader);
-    uptr Size = UsableSize - OldHeader.UnusedBytes;
+    uptr Size = OldHeader.FromPrimary ? OldHeader.SizeOrUnusedBytes :
+        Chunk->getUsableSize(&OldHeader) - OldHeader.SizeOrUnusedBytes;
     if (DeleteSizeMismatch) {
       if (DeleteSize && DeleteSize != Size) {
         dieWithMessage("ERROR: invalid sized delete on chunk at address %p\n",
@@ -495,14 +503,19 @@ struct ScudoAllocator {
     NewHeader.State = ChunkQuarantine;
     Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
 
+    // If a small memory amount was allocated with a larger alignment, we want
+    // to take that into account. Otherwise the Quarantine would be filled with
+    // tiny chunks, taking a lot of VA memory. This an approximation of the
+    // usable size, that allows us to not call GetActuallyAllocatedSize.
+    uptr LiableSize = Size + (OldHeader.Offset << MinAlignment);
     if (LIKELY(!ThreadTornDown)) {
       AllocatorQuarantine.Put(&ThreadQuarantineCache,
-                              QuarantineCallback(&Cache), Chunk, UsableSize);
+                              QuarantineCallback(&Cache), Chunk, LiableSize);
     } else {
       SpinMutexLock l(&FallbackMutex);
       AllocatorQuarantine.Put(&FallbackQuarantineCache,
                               QuarantineCallback(&FallbackAllocatorCache),
-                              Chunk, UsableSize);
+                              Chunk, LiableSize);
     }
   }
 
@@ -529,9 +542,12 @@ struct ScudoAllocator {
     }
     uptr UsableSize = Chunk->getUsableSize(&OldHeader);
     UnpackedHeader NewHeader = OldHeader;
-    // The new size still fits in the current chunk.
-    if (NewSize <= UsableSize) {
-      NewHeader.UnusedBytes = UsableSize - NewSize;
+    // The new size still fits in the current chunk, and the size difference
+    // is reasonable.
+    if (NewSize <= UsableSize &&
+        (UsableSize - NewSize) < (SizeClassMap::kMaxSize / 2)) {
+      NewHeader.SizeOrUnusedBytes =
+                OldHeader.FromPrimary ? NewSize : UsableSize - NewSize;
       Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
       return OldPtr;
     }
@@ -539,7 +555,8 @@ struct ScudoAllocator {
     // old one.
     void *NewPtr = allocate(NewSize, MinAlignment, FromMalloc);
     if (NewPtr) {
-      uptr OldSize = UsableSize - OldHeader.UnusedBytes;
+      uptr OldSize = OldHeader.FromPrimary ? OldHeader.SizeOrUnusedBytes :
+          UsableSize - OldHeader.SizeOrUnusedBytes;
       memcpy(NewPtr, OldPtr, Min(NewSize, OldSize));
       NewHeader.State = ChunkQuarantine;
       Chunk->compareExchangeHeader(&NewHeader, &OldHeader);

Modified: compiler-rt/trunk/lib/scudo/scudo_allocator.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/scudo/scudo_allocator.h?rev=300861&r1=300860&r2=300861&view=diff
==============================================================================
--- compiler-rt/trunk/lib/scudo/scudo_allocator.h (original)
+++ compiler-rt/trunk/lib/scudo/scudo_allocator.h Thu Apr 20 13:07:17 2017
@@ -44,15 +44,17 @@ enum ChunkState : u8 {
 // well. The header will be atomically loaded and stored.
 typedef u64 PackedHeader;
 struct UnpackedHeader {
-  u64 Checksum    : 16;
-  u64 UnusedBytes : 20; // Needed for reallocation purposes.
-  u64 State       : 2;  // available, allocated, or quarantined
-  u64 AllocType   : 2;  // malloc, new, new[], or memalign
-  u64 Offset      : 16; // Offset from the beginning of the backend
-                        // allocation to the beginning of the chunk itself,
-                        // in multiples of MinAlignment. See comment about
-                        // its maximum value and test in init().
-  u64 Salt        : 8;
+  u64 Checksum          : 16;
+  u64 SizeOrUnusedBytes : 19; // Size for Primary backed allocations, amount of
+                              // unused bytes in the chunk for Secondary ones.
+  u64 FromPrimary       : 1;
+  u64 State             : 2;  // available, allocated, or quarantined
+  u64 AllocType         : 2;  // malloc, new, new[], or memalign
+  u64 Offset            : 16; // Offset from the beginning of the backend
+                              // allocation to the beginning of the chunk
+                              // itself, in multiples of MinAlignment. See
+                              /// comment about its maximum value and in init().
+  u64 Salt              : 8;
 };
 
 typedef atomic_uint64_t AtomicPackedHeader;




More information about the llvm-commits mailing list