[llvm] [MemProf] Dump inline call stacks as optimization remarks (PR #188678)
Kazu Hirata via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 25 22:16:06 PDT 2026
https://github.com/kazutakahirata created https://github.com/llvm/llvm-project/pull/188678
This patch teaches the MemProf matching pass to dump inline call
stacks as optimization remarks when -memprof-print-inline-call-stacks
is enabled like so:
frame: 704e4117e6a62739 main:10:5:0
frame: 273929e54b9f1234 foo:2:12:1
inline call stack: 273929e54b9f1234,704e4117e6a62739
The output consists of two types of remarks:
- "frame": Acts as a dictionary mapping a unique MD5-based FrameID
to source information (function name, line offset from the start of
the function, column, and an inlined flag: 1 if the frame has an
inlined-at location, 0 otherwise).
- "inline call stack": Provides the full call stack for a call site
as a sequence of FrameIDs.
Both types of remarks are deduplicated to reduce the output size.
This patch is intended to be a debugging aid.
From 4f3ddefb5904f43de6ccfdaad83c5bd35984ed52 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Wed, 25 Mar 2026 17:57:11 -0700
Subject: [PATCH] [MemProf] Dump inline call stacks as optimization remarks
This patch teaches the MemProf matching pass to dump inline call
stacks as optimization remarks when -memprof-print-inline-call-stacks
is enabled like so:
frame: 704e4117e6a62739 main:10:5:0
frame: 273929e54b9f1234 foo:2:12:1
inline call stack: 273929e54b9f1234,704e4117e6a62739
The output consists of two types of remarks:
- "frame": Acts as a dictionary mapping a unique MD5-based FrameID
to source information (function name, line offset from the start of
the function, column, and an inlined flag: 1 if the frame has an
inlined-at location, 0 otherwise).
- "inline call stack": Provides the full call stack for a call site
as a sequence of FrameIDs.
Both types of remarks are deduplicated to reduce the output size.
This patch is intended to be a debugging aid.
---
.../Transforms/Instrumentation/MemProfUse.cpp | 75 ++++++++++++++++++-
.../PGOProfile/memprof-inline-call-stacks.ll | 38 ++++++++++
2 files changed, 111 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/Transforms/PGOProfile/memprof-inline-call-stacks.ll
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index 704e4117e6a63..84142dbde06ad 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Instrumentation/MemProfUse.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
@@ -30,7 +31,9 @@
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/HashBuilder.h"
+#include "llvm/Support/MD5.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Transforms/Utils/LongestCommonSequence.h"
#include <map>
@@ -63,6 +66,11 @@ static cl::opt<bool>
"context in this module's profiles"),
cl::Hidden, cl::init(false));
+static cl::opt<bool> ClPrintMemProfInlineCallStacks(
+ "memprof-print-inline-call-stacks",
+ cl::desc("Print inline call stack for each callsite for debugging"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool> PrintMatchedAllocStack(
"memprof-print-matched-alloc-stack",
cl::desc("Print full stack context for matched "
@@ -630,13 +638,67 @@ static void handleCallSite(Instruction &I, const Function *CalledFunction,
addVPMetadata(M, I, CalleeGuids.getArrayRef());
}
+// Debugging aid: emit remarks for the inline call stack of I, leaf first.
+static void dumpInlineCallStack(Instruction &I, CallBase *CI,
+ OptimizationRemarkEmitter &ORE,
+ DenseSet<uint64_t> &SeenFrames,
+ DenseSet<uint64_t> &SeenStacks,
+ bool ProfileHasColumns) {
+ auto GetOffset = [](const DILocation *DIL) {
+ return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
+ 0xffff; // keep only the low 16 bits of the line offset
+ };
+
+ // Walk the inlinedAt chain; emit a "frame" remark per FrameID not yet seen.
+ std::string CallStack;
+ raw_string_ostream CallStackOS(CallStack);
+ bool First = true;
+ for (const DILocation *DIL = I.getDebugLoc(); DIL;
+ DIL = DIL->getInlinedAt()) {
+ std::string FrameStr;
+ raw_string_ostream FrameOS(FrameStr);
+ StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName();
+ if (Name.empty())
+ Name = DIL->getScope()->getSubprogram()->getName();
+ FrameOS << Name << ":" << GetOffset(DIL) << ":"
+ << (ProfileHasColumns ? DIL->getColumn() : 0) << ":"
+ << (DIL->getInlinedAt() ? "1" : "0");
+ uint64_t FrameID = llvm::MD5Hash(FrameStr);
+ if (SeenFrames.insert(FrameID).second) {
+ std::string DictMsg;
+ raw_string_ostream DictOS(DictMsg);
+ DictOS << "frame: " << format_hex_no_prefix(FrameID, 16) << " "
+ << FrameStr;
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemProfUse", CI)
+ << DictOS.str());
+ }
+
+ if (First)
+ First = false;
+ else
+ CallStackOS << ",";
+ CallStackOS << format_hex_no_prefix(FrameID, 16);
+ }
+
+ // Emit the comma-separated FrameID list unless StackHash was already seen.
+ uint64_t StackHash = llvm::MD5Hash(CallStack);
+ if (SeenStacks.insert(StackHash).second) {
+ std::string Msg;
+ raw_string_ostream OS(Msg);
+ OS << "inline call stack: " << CallStack;
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemProfUse", CI)
+ << OS.str());
+ }
+}
+
static void
readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
const TargetLibraryInfo &TLI,
std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
std::set<std::vector<uint64_t>> &MatchedCallSites,
DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
- OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize) {
+ OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize,
+ DenseSet<uint64_t> &SeenStacks, DenseSet<uint64_t> &SeenFrames) {
auto &Ctx = M.getContext();
// Previously we used getIRPGOFuncName() here. If F is local linkage,
// getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But
@@ -758,6 +820,11 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
auto *CalledFunction = CI->getCalledFunction();
if (CalledFunction && CalledFunction->isIntrinsic())
continue;
+
+ if (ClPrintMemProfInlineCallStacks)
+ dumpInlineCallStack(I, CI, ORE, SeenFrames, SeenStacks,
+ ProfileHasColumns);
+
// List of call stack ids computed from the location hashes on debug
// locations (leaf to inlined at root).
SmallVector<uint64_t, 8> InlinedCallStack;
@@ -880,6 +947,9 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
// call stack.
std::set<std::vector<uint64_t>> MatchedCallSites;
+ DenseSet<uint64_t> SeenStacks;
+ DenseSet<uint64_t> SeenFrames;
+
uint64_t MaxColdSize = 0;
if (auto *MemProfSum = MemProfReader->getMemProfSummary())
MaxColdSize = MemProfSum->getMaxColdTotalSize();
@@ -891,7 +961,8 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo,
- MatchedCallSites, UndriftMaps, ORE, MaxColdSize);
+ MatchedCallSites, UndriftMaps, ORE, MaxColdSize, SeenStacks,
+ SeenFrames);
}
if (ClPrintMemProfMatchInfo) {
diff --git a/llvm/test/Transforms/PGOProfile/memprof-inline-call-stacks.ll b/llvm/test/Transforms/PGOProfile/memprof-inline-call-stacks.ll
new file mode 100644
index 0000000000000..0c3370d287ecb
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/memprof-inline-call-stacks.ll
@@ -0,0 +1,38 @@
+; RUN: llvm-profdata merge %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdata
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -memprof-print-inline-call-stacks -pass-remarks=memprof -S 2>&1 | FileCheck %s
+
+; CHECK: remark: memprof.cc:5:10: frame: [[FOO:[0-9a-f]+]] _Z3foov:1:10:0
+; CHECK: remark: memprof.cc:5:10: inline call stack: [[FOO]]
+; CHECK: remark: memprof.cc:9:12: frame: [[BAR:[0-9a-f]+]] _Z3barv:2:12:1
+; CHECK: remark: memprof.cc:9:12: frame: [[BAZ:[0-9a-f]+]] _Z3bazv:3:13:0
+; CHECK: remark: memprof.cc:9:12: inline call stack: [[BAR]],[[BAZ]]
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z3foov() {
+entry:
+ %call = call ptr null(i64 0), !dbg !3
+ ret ptr %call
+}
+
+define ptr @_Z3barv() {
+entry:
+ %call = call ptr @_Z3foov(), !dbg !7
+ ret ptr %call
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof.cc", directory: "/", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !DILocation(line: 5, column: 10, scope: !4)
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !5, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !6)
+!5 = !DISubroutineType(types: !6)
+!6 = !{}
+!7 = !DILocation(line: 9, column: 12, scope: !8, inlinedAt: !9)
+!8 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 7, type: !5, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !6)
+!9 = !DILocation(line: 12, column: 13, scope: !10)
+!10 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 9, type: !5, scopeLine: 9, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !6)
More information about the llvm-commits
mailing list