[llvm] [memprof] Dump call site matching information (PR #125130)
Kazu Hirata via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 5 13:11:51 PST 2025
https://github.com/kazutakahirata updated https://github.com/llvm/llvm-project/pull/125130
>From a919358b635dd21dcf299f792a74f834a1e49216 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Thu, 30 Jan 2025 12:22:31 -0800
Subject: [PATCH 1/3] [memprof] Dump call site matching information
MemProfiler.cpp annotates the IR with the memory profile so that we
can later duplicate context. Dumping the call site matching
information here allows us to analyze how well we manage to annotate
the IR. Specifically, this patch dumps:
- the full stack ID (to identify the profile call stack)
- the index within the profile call stack where we start matching
- the size of InlinedCallStack
This way, we get to see what part of the profile call stack we are
matching, not just one frame somewhere in the profile call stack.
Now, obtaining the full stack ID requires a little bit of refactoring.
This patch modifies the value type of LocHashToCallSites so that it
contains the full stack as well as the starting index of a match.
Essentially, this patch partially reverts:
commit 7c294eb78009ef252aafa269963f5496d1dedf6f
Author: Kazu Hirata <kazu at google.com>
Date: Sat Dec 14 00:03:27 2024 -0800
---
.../Instrumentation/MemProfiler.cpp | 21 +++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 91c48338d03208..66d7466a92c3b3 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -1034,13 +1034,15 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
// A hash function for std::unordered_set<ArrayRef<Frame>> to work.
struct CallStackHash {
- size_t operator()(ArrayRef<Frame> CS) const {
- return computeFullStackId(CS);
+ size_t operator()(const std::pair<ArrayRef<Frame>, unsigned> &CS) const {
+ auto &[CallStack, Idx] = CS;
+ return computeFullStackId(ArrayRef<Frame>(CallStack).drop_front(Idx));
}
};
// For the callsites we need to record slices of the frame array (see comments
// below where the map entries are added).
- std::map<uint64_t, std::unordered_set<ArrayRef<Frame>, CallStackHash>>
+ std::map<uint64_t, std::unordered_set<std::pair<ArrayRef<Frame>, unsigned>,
+ CallStackHash>>
LocHashToCallSites;
for (auto &AI : MemProfRec->AllocSites) {
NumOfMemProfAllocContextProfiles++;
@@ -1058,7 +1060,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
unsigned Idx = 0;
for (auto &StackFrame : CS) {
uint64_t StackId = computeStackId(StackFrame);
- LocHashToCallSites[StackId].insert(ArrayRef<Frame>(CS).drop_front(Idx++));
+ LocHashToCallSites[StackId].emplace(CS, Idx++);
ProfileHasColumns |= StackFrame.Column;
// Once we find this function, we can stop recording.
if (StackFrame.Function == FuncGUID)
@@ -1201,15 +1203,22 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
// instruction's leaf location in the callsites map and not the allocation
// map.
assert(CallSitesIter != LocHashToCallSites.end());
- for (auto CallStackIdx : CallSitesIter->second) {
+ for (auto &[ProfileCallStack, Idx] : CallSitesIter->second) {
// If we found and thus matched all frames on the call, create and
// attach call stack metadata.
- if (stackFrameIncludesInlinedCallStack(CallStackIdx,
+ if (stackFrameIncludesInlinedCallStack(ProfileCallStack.drop_front(Idx),
InlinedCallStack)) {
NumOfMemProfMatchedCallSites++;
addCallsiteMetadata(I, InlinedCallStack, Ctx);
// Only need to find one with a matching call stack and add a single
// callsite metadata.
+
+ // Dump call site matching information upon request.
+ if (ClPrintMemProfMatchInfo) {
+ uint64_t FullStackId = computeFullStackId(ProfileCallStack);
+ errs() << "MemProf callsite " << FullStackId << " " << Idx << " "
+ << InlinedCallStack.size() << "\n";
+ }
break;
}
}
>From ec5c64ff89d1b60d89cf552aa62cc7e22f21ba35 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Fri, 31 Jan 2025 14:34:24 -0800
Subject: [PATCH 2/3] Address comments.
---
.../Instrumentation/MemProfiler.cpp | 46 ++++---
.../memprof-dump-matched-call-sites.ll | 114 ++++++++++++++++++
llvm/test/Transforms/PGOProfile/memprof.ll | 10 ++
3 files changed, 152 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/Transforms/PGOProfile/memprof-dump-matched-call-sites.ll
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 66d7466a92c3b3..72806382449a8c 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -966,11 +966,12 @@ undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
UndriftCallStack(CS);
}
-static void
-readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
- const TargetLibraryInfo &TLI,
- std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
- DenseMap<uint64_t, LocToLocMap> &UndriftMaps) {
+static void readMemprof(
+ Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
+ const TargetLibraryInfo &TLI,
+ std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
+ std::set<std::vector<uint64_t>> &MatchedCallSites,
+ DenseMap<uint64_t, LocToLocMap> &UndriftMaps) {
auto &Ctx = M.getContext();
// Previously we used getIRPGOFuncName() here. If F is local linkage,
// getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But
@@ -1034,15 +1035,13 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
// A hash function for std::unordered_set<ArrayRef<Frame>> to work.
struct CallStackHash {
- size_t operator()(const std::pair<ArrayRef<Frame>, unsigned> &CS) const {
- auto &[CallStack, Idx] = CS;
- return computeFullStackId(ArrayRef<Frame>(CallStack).drop_front(Idx));
+ size_t operator()(ArrayRef<Frame> CS) const {
+ return computeFullStackId(CS);
}
};
// For the callsites we need to record slices of the frame array (see comments
// below where the map entries are added).
- std::map<uint64_t, std::unordered_set<std::pair<ArrayRef<Frame>, unsigned>,
- CallStackHash>>
+ std::map<uint64_t, std::unordered_set<ArrayRef<Frame>, CallStackHash>>
LocHashToCallSites;
for (auto &AI : MemProfRec->AllocSites) {
NumOfMemProfAllocContextProfiles++;
@@ -1060,7 +1059,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
unsigned Idx = 0;
for (auto &StackFrame : CS) {
uint64_t StackId = computeStackId(StackFrame);
- LocHashToCallSites[StackId].emplace(CS, Idx++);
+ LocHashToCallSites[StackId].insert(ArrayRef<Frame>(CS).drop_front(Idx++));
ProfileHasColumns |= StackFrame.Column;
// Once we find this function, we can stop recording.
if (StackFrame.Function == FuncGUID)
@@ -1203,21 +1202,21 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
// instruction's leaf location in the callsites map and not the allocation
// map.
assert(CallSitesIter != LocHashToCallSites.end());
- for (auto &[ProfileCallStack, Idx] : CallSitesIter->second) {
+ for (auto CallStackIdx : CallSitesIter->second) {
// If we found and thus matched all frames on the call, create and
// attach call stack metadata.
- if (stackFrameIncludesInlinedCallStack(ProfileCallStack.drop_front(Idx),
+ if (stackFrameIncludesInlinedCallStack(CallStackIdx,
InlinedCallStack)) {
NumOfMemProfMatchedCallSites++;
addCallsiteMetadata(I, InlinedCallStack, Ctx);
// Only need to find one with a matching call stack and add a single
// callsite metadata.
- // Dump call site matching information upon request.
+ // Accumulate call site matching information upon request.
if (ClPrintMemProfMatchInfo) {
- uint64_t FullStackId = computeFullStackId(ProfileCallStack);
- errs() << "MemProf callsite " << FullStackId << " " << Idx << " "
- << InlinedCallStack.size() << "\n";
+ std::vector<uint64_t> CallStack;
+ append_range(CallStack, InlinedCallStack);
+ MatchedCallSites.insert(std::move(CallStack));
}
break;
}
@@ -1275,13 +1274,17 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
// it to an allocation in the IR.
std::map<uint64_t, AllocMatchInfo> FullStackIdToAllocMatchInfo;
+ // Set of the matched call sites, each expressed as a sequence of an inline
+ // call stack.
+ std::set<std::vector<uint64_t>> MatchedCallSites;
+
for (auto &F : M) {
if (F.isDeclaration())
continue;
const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo,
- UndriftMaps);
+ MatchedCallSites, UndriftMaps);
}
if (ClPrintMemProfMatchInfo) {
@@ -1290,6 +1293,13 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
<< " context with id " << Id << " has total profiled size "
<< Info.TotalSize << (Info.Matched ? " is" : " not")
<< " matched\n";
+
+ for (const auto &CallStack : MatchedCallSites) {
+ errs() << "MemProf callsite match for inline call stack";
+ for (uint64_t StackId : CallStack)
+ errs() << " " << StackId;
+ errs() << "\n";
+ }
}
return PreservedAnalyses::none();
diff --git a/llvm/test/Transforms/PGOProfile/memprof-dump-matched-call-sites.ll b/llvm/test/Transforms/PGOProfile/memprof-dump-matched-call-sites.ll
new file mode 100644
index 00000000000000..a5302895d0593d
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/memprof-dump-matched-call-sites.ll
@@ -0,0 +1,114 @@
+; Tests that the compiler dumps call site matches upon request.
+;
+; The test case is generated from:
+;
+; // main
+; // |
+; // f1 (noinline)
+; // |
+; // f2
+; // |
+; // f3 (noinline)
+; // |
+; // new
+;
+; __attribute__((noinline)) char *f3() { return ::new char[4]; }
+;
+; static char *f2() { return f3(); }
+;
+; __attribute__((noinline)) static char *f1() { return f2(); }
+;
+; int main() {
+; f1();
+; return 0;
+; }
+;
+; Here we expect to match two inline call stacks:
+;
+; - [main]
+; - [f1, f2]
+;
+; Note that f3 is considered to be an allocation site, not a call site, because
+; it directly calls new after inlining.
+
+; REQUIRES: x86_64-linux
+; RUN: split-file %s %t
+; RUN: llvm-profdata merge %t/memprof-dump-matched-call-site.yaml -o %t/memprof-dump-matched-call-site.memprofdata
+; RUN: opt < %t/memprof-dump-matched-call-site.ll -passes='memprof-use<profile-filename=%t/memprof-dump-matched-call-site.memprofdata>' -memprof-print-match-info -S 2>&1 | FileCheck %s
+
+;--- memprof-dump-matched-call-site.yaml
+---
+HeapProfileRecords:
+ - GUID: main
+ AllocSites: []
+ CallSites:
+ - - { Function: main, LineOffset: 1, Column: 3, IsInlineFrame: false }
+ - GUID: _ZL2f1v
+ AllocSites: []
+ CallSites:
+ - - { Function: _ZL2f2v, LineOffset: 0, Column: 28, IsInlineFrame: true }
+ - { Function: _ZL2f1v, LineOffset: 0, Column: 54, IsInlineFrame: false }
+ - GUID: _ZL2f2v
+ AllocSites: []
+ CallSites:
+ - - { Function: _ZL2f2v, LineOffset: 0, Column: 28, IsInlineFrame: true }
+ - { Function: _ZL2f1v, LineOffset: 0, Column: 54, IsInlineFrame: false }
+ - GUID: _Z2f3v
+ AllocSites:
+ - Callstack:
+ - { Function: _Z2f3v, LineOffset: 0, Column: 47, IsInlineFrame: false }
+ - { Function: _ZL2f2v, LineOffset: 0, Column: 28, IsInlineFrame: true }
+ - { Function: _ZL2f1v, LineOffset: 0, Column: 54, IsInlineFrame: false }
+ - { Function: main, LineOffset: 1, Column: 3, IsInlineFrame: false }
+ MemInfoBlock:
+ AllocCount: 1
+ TotalSize: 4
+ TotalLifetime: 0
+ TotalLifetimeAccessDensity: 0
+ CallSites: []
+...
+;--- memprof-dump-matched-call-site.ll
+; CHECK: MemProf notcold context with id 3894143216621363392 has total profiled size 4 is matched
+; CHECK: MemProf callsite match for inline call stack 4745611964195289084 10616861955219347331
+; CHECK: MemProf callsite match for inline call stack 5401059281181789382
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z2f3v() {
+entry:
+ %call = call ptr @_Znam(i64 0), !dbg !3
+ ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define i32 @main() {
+entry:
+ call void @_ZL2f1v(), !dbg !7
+ ret i32 0
+}
+
+define void @_ZL2f1v() {
+entry:
+ %call.i = call ptr @_Z2f3v(), !dbg !9
+ ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1)
+!1 = !DIFile(filename: "match.cc", directory: "/")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !DILocation(line: 11, column: 47, scope: !4)
+!4 = distinct !DISubprogram(name: "f3", linkageName: "_Z2f3v", scope: !1, file: !1, line: 11, type: !5, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !6)
+!6 = !{}
+!7 = !DILocation(line: 18, column: 3, scope: !8)
+!8 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 17, type: !5, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!9 = !DILocation(line: 13, column: 28, scope: !10, inlinedAt: !11)
+!10 = distinct !DISubprogram(name: "f2", linkageName: "_ZL2f2v", scope: !1, file: !1, line: 13, type: !5, scopeLine: 13, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!11 = distinct !DILocation(line: 15, column: 54, scope: !12)
+!12 = distinct !DISubprogram(name: "f1", linkageName: "_ZL2f1v", scope: !1, file: !1, line: 15, type: !13, scopeLine: 15, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!13 = !DISubroutineType(cc: DW_CC_nocall, types: !6)
diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll
index f0421ba60cffca..5a958de5f7f8d5 100644
--- a/llvm/test/Transforms/PGOProfile/memprof.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof.ll
@@ -101,6 +101,16 @@
; MEMPROFMATCHINFO: MemProf cold context with id 15737101490731057601 has total profiled size 10 is matched
; MEMPROFMATCHINFO: MemProf cold context with id 16342802530253093571 has total profiled size 10 is matched
; MEMPROFMATCHINFO: MemProf cold context with id 18254812774972004394 has total profiled size 10 is matched
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 748269490701775343
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 1544787832369987002
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 2061451396820446691
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 2104812325165620841
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 6281715513834610934
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 8467819354083268568
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 8690657650969109624
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 9086428284934609951
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 12481870273128938184
+; MEMPROFMATCHINFO: MemProf callsite match for inline call stack 12699492813229484831
; ModuleID = 'memprof.cc'
source_filename = "memprof.cc"
>From 05b2f6032fe1fb8250e8bcd2ef114aebc87952ee Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Wed, 5 Feb 2025 13:11:36 -0800
Subject: [PATCH 3/3] clang-format MemProfiler.cpp.
---
llvm/lib/Transforms/Instrumentation/MemProfiler.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 72806382449a8c..7d8bc3aa4c5895 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -966,12 +966,12 @@ undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
UndriftCallStack(CS);
}
-static void readMemprof(
- Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
- const TargetLibraryInfo &TLI,
- std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
- std::set<std::vector<uint64_t>> &MatchedCallSites,
- DenseMap<uint64_t, LocToLocMap> &UndriftMaps) {
+static void
+readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
+ const TargetLibraryInfo &TLI,
+ std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
+ std::set<std::vector<uint64_t>> &MatchedCallSites,
+ DenseMap<uint64_t, LocToLocMap> &UndriftMaps) {
auto &Ctx = M.getContext();
// Previously we used getIRPGOFuncName() here. If F is local linkage,
// getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But
More information about the llvm-commits mailing list