[llvm] 59da1af - [memprof] Speed up caller-callee pair extraction (#116184)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 14 15:55:00 PST 2024
Author: Kazu Hirata
Date: 2024-11-14T15:54:55-08:00
New Revision: 59da1afd2ad74af2a8b8475412353c5d54a7d7f5
URL: https://github.com/llvm/llvm-project/commit/59da1afd2ad74af2a8b8475412353c5d54a7d7f5
DIFF: https://github.com/llvm/llvm-project/commit/59da1afd2ad74af2a8b8475412353c5d54a7d7f5.diff
LOG: [memprof] Speed up caller-callee pair extraction (#116184)
We know that the MemProf profile has a lot of duplicate call stacks.
Extracting caller-callee pairs from a call stack we've seen before is
a wasteful effort.
This patch makes the extraction more efficient by first coming up with
a work list of linear call stack IDs -- the set of starting positions
in the radix tree array -- and then extracting caller-callee pairs from
each call stack in the work list.
We implement the work list as a bit vector because we expect the work
list to be dense in the range [0, RadixTreeSize). Also, we want the
set insertion to be cheap.
Without this patch, it takes 25 seconds to extract caller-callee pairs
from a large MemProf profile. This patch shortens that down to 4
seconds.
Added:
Modified:
llvm/include/llvm/ProfileData/InstrProfReader.h
llvm/lib/ProfileData/InstrProfReader.cpp
llvm/lib/ProfileData/InstrProfWriter.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h
index 42414bc193bc84..1930cc3f5c2c30 100644
--- a/llvm/include/llvm/ProfileData/InstrProfReader.h
+++ b/llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -683,6 +683,8 @@ class IndexedMemProfReader {
const unsigned char *FrameBase = nullptr;
/// The starting address of the call stack array.
const unsigned char *CallStackBase = nullptr;
+ // The number of elements in the radix tree array.
+ unsigned RadixTreeSize = 0;
Error deserializeV012(const unsigned char *Start, const unsigned char *Ptr,
uint64_t FirstWord);
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index cae6ce5b824e62..54a7dea59b1aea 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1303,6 +1303,12 @@ Error IndexedMemProfReader::deserializeV3(const unsigned char *Start,
FrameBase = Ptr;
CallStackBase = Start + CallStackPayloadOffset;
+ // Compute the number of elements in the radix tree array. Since we use this
+ // to reserve enough bits in a BitVector, it's totally OK if we overestimate
+ // this number a little bit because of padding just before the next section.
+ RadixTreeSize = (RecordPayloadOffset - CallStackPayloadOffset) /
+ sizeof(memprof::LinearFrameId);
+
// Now initialize the table reader with a pointer into data buffer.
MemProfRecordTable.reset(MemProfRecordHashTable::Create(
/*Buckets=*/Start + RecordTableOffset,
@@ -1674,11 +1680,22 @@ IndexedMemProfReader::getMemProfCallerCalleePairs() const {
memprof::LinearFrameIdConverter FrameIdConv(FrameBase);
memprof::CallerCalleePairExtractor Extractor(CallStackBase, FrameIdConv);
+ // The set of linear call stack IDs that we need to traverse from. We expect
+ // the set to be dense, so we use a BitVector.
+ BitVector Worklist(RadixTreeSize);
+
+ // Collect the set of linear call stack IDs. Since we expect a lot of
+ // duplicates, we first collect them in the form of a bit vector before
+ // processing them.
for (const memprof::IndexedMemProfRecord &IndexedRecord :
MemProfRecordTable->data())
for (const memprof::IndexedAllocationInfo &IndexedAI :
IndexedRecord.AllocSites)
- Extractor(IndexedAI.CSId);
+ Worklist.set(IndexedAI.CSId);
+
+ // Collect caller-callee pairs for each linear call stack ID in Worklist.
+ for (unsigned CS : Worklist.set_bits())
+ Extractor(CS);
DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> Pairs =
std::move(Extractor.CallerCalleePairs);
diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp
index 0ab9f942a08589..456014741d93f7 100644
--- a/llvm/lib/ProfileData/InstrProfWriter.cpp
+++ b/llvm/lib/ProfileData/InstrProfWriter.cpp
@@ -601,7 +601,8 @@ writeMemProfCallStackArray(
&MemProfCallStackData,
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
&MemProfFrameIndexes,
- llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
+ llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram,
+ unsigned &NumElements) {
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
MemProfCallStackIndexes;
@@ -610,6 +611,7 @@ writeMemProfCallStackArray(
FrameHistogram);
for (auto I : Builder.getRadixArray())
OS.write32(I);
+ NumElements = Builder.getRadixArray().size();
MemProfCallStackIndexes = Builder.takeCallStackPos();
// Release the memory of this vector as it is no longer needed.
@@ -771,15 +773,26 @@ static Error writeMemProfV3(ProfOStream &OS,
writeMemProfFrameArray(OS, MemProfData.Frames, FrameHistogram);
uint64_t CallStackPayloadOffset = OS.tell();
+ // The number of elements in the call stack array.
+ unsigned NumElements = 0;
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
- MemProfCallStackIndexes = writeMemProfCallStackArray(
- OS, MemProfData.CallStacks, MemProfFrameIndexes, FrameHistogram);
+ MemProfCallStackIndexes =
+ writeMemProfCallStackArray(OS, MemProfData.CallStacks,
+ MemProfFrameIndexes, FrameHistogram,
+ NumElements);
uint64_t RecordPayloadOffset = OS.tell();
uint64_t RecordTableOffset =
writeMemProfRecords(OS, MemProfData.Records, &Schema, memprof::Version3,
&MemProfCallStackIndexes);
+ // IndexedMemProfReader::deserializeV3 computes the number of elements in the
+ // call stack array from the difference between CallStackPayloadOffset and
+ // RecordPayloadOffset. Verify that the computation works.
+ assert(CallStackPayloadOffset +
+ NumElements * sizeof(memprof::LinearFrameId) ==
+ RecordPayloadOffset);
+
uint64_t Header[] = {
CallStackPayloadOffset,
RecordPayloadOffset,
More information about the llvm-commits
mailing list