[llvm] c8e2f43 - [MemProf] Select largest of matching contexts from profile (#165338)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 31 06:48:07 PDT 2025
Author: Teresa Johnson
Date: 2025-10-31T08:48:02-05:00
New Revision: c8e2f433798d81304a3a248e0b15098fc4779896
URL: https://github.com/llvm/llvm-project/commit/c8e2f433798d81304a3a248e0b15098fc4779896
DIFF: https://github.com/llvm/llvm-project/commit/c8e2f433798d81304a3a248e0b15098fc4779896.diff
LOG: [MemProf] Select largest of matching contexts from profile (#165338)
We aren't currently deduplicating contexts that are identical or nearly
identical (differing inline frame information) when generating the
profile. When we have multiple identical contexts we end up
conservatively marking it as non-cold, even if some are much smaller in
terms of bytes allocated.
This was causing us to lose sight of a very large cold context, because
we had a small non-cold one that only differed in the inlining (which we
don't consider when matching as the inlining could change or be
incomplete at that point in compilation). Likely the smaller one was
from binary with much smaller usage and therefore not yet detected as
cold.
Deduplicate the alloc contexts for a function before applying the
profile, selecting the largest one, or conservatively selecting the
non-cold one if they are the same size.
This caused a minor difference to an existing test
(memprof_loop_unroll.ll), which now only gets one message for the
duplicate context instead of 2. While here, convert to the text version
of the profile.
Added:
llvm/test/Transforms/PGOProfile/memprof_diff_inline.ll
Modified:
llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index 2f256dfd7b0e2..b72d41a748857 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -127,15 +127,19 @@ static uint64_t computeStackId(const memprof::Frame &Frame) {
return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
}
+static AllocationType getAllocType(const AllocationInfo *AllocInfo) {
+ return getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
+ AllocInfo->Info.getAllocCount(),
+ AllocInfo->Info.getTotalLifetime());
+}
+
static AllocationType addCallStack(CallStackTrie &AllocTrie,
const AllocationInfo *AllocInfo,
uint64_t FullStackId) {
SmallVector<uint64_t> StackIds;
for (const auto &StackFrame : AllocInfo->CallStack)
StackIds.push_back(computeStackId(StackFrame));
- auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
- AllocInfo->Info.getAllocCount(),
- AllocInfo->Info.getTotalLifetime());
+ auto AllocType = getAllocType(AllocInfo);
std::vector<ContextTotalSize> ContextSizeInfo;
if (recordContextSizeInfoForAnalysis()) {
auto TotalSize = AllocInfo->Info.getTotalSize();
@@ -405,22 +409,39 @@ handleAllocSite(Instruction &I, CallBase *CI,
const std::set<const AllocationInfo *> &AllocInfoSet,
std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
&FullStackIdToAllocMatchInfo) {
+ // TODO: Remove this once the profile creation logic deduplicates contexts
+ // that are the same other than the IsInlineFrame bool. Until then, keep the
+ // largest.
+ DenseMap<uint64_t, const AllocationInfo *> UniqueFullContextIdAllocInfo;
+ for (auto *AllocInfo : AllocInfoSet) {
+ auto FullStackId = computeFullStackId(AllocInfo->CallStack);
+ auto [It, Inserted] =
+ UniqueFullContextIdAllocInfo.insert({FullStackId, AllocInfo});
+ // If inserted entry, done.
+ if (Inserted)
+ continue;
+ // Keep the larger one, or the noncold one if they are the same size.
+ auto CurSize = It->second->Info.getTotalSize();
+ auto NewSize = AllocInfo->Info.getTotalSize();
+ if ((CurSize > NewSize) ||
+ (CurSize == NewSize &&
+ getAllocType(AllocInfo) != AllocationType::NotCold))
+ continue;
+ It->second = AllocInfo;
+ }
// We may match this instruction's location list to multiple MIB
// contexts. Add them to a Trie specialized for trimming the contexts to
// the minimal needed to disambiguate contexts with unique behavior.
CallStackTrie AllocTrie(&ORE, MaxColdSize);
uint64_t TotalSize = 0;
uint64_t TotalColdSize = 0;
- for (auto *AllocInfo : AllocInfoSet) {
+ for (auto &[FullStackId, AllocInfo] : UniqueFullContextIdAllocInfo) {
// Check the full inlined call stack against this one.
// If we found and thus matched all frames on the call, include
// this MIB.
if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
InlinedCallStack)) {
NumOfMemProfMatchedAllocContexts++;
- uint64_t FullStackId = 0;
- if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
- FullStackId = computeFullStackId(AllocInfo->CallStack);
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
TotalSize += AllocInfo->Info.getTotalSize();
if (AllocType == AllocationType::Cold)
diff --git a/llvm/test/Transforms/PGOProfile/memprof_
diff _inline.ll b/llvm/test/Transforms/PGOProfile/memprof_
diff _inline.ll
new file mode 100644
index 0000000000000..5213a07d13d39
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/memprof_
diff _inline.ll
@@ -0,0 +1,118 @@
+;; Tests that the compiler ignores smaller contexts that
diff er only in the
+;; IsInlineFrame bool. These map to the same full context id internally, as we
+;; ignore the inline frame status which may
diff er in feedback compiles.
+;; Presumably this happens when profiles collected from
diff erent binaries are
+;; merged. If we didn't pick the largest we would default them all to noncold.
+
+;; Avoid failures on big-endian systems that can't read the profile properly
+; REQUIRES: x86_64-linux
+
+;; Generate the profile and the IR.
+; RUN: split-file %s %t
+
+;; Generate indexed profile
+; RUN: llvm-profdata merge %t/memprof_
diff _inline.yaml -o %t.memprofdata
+
+; RUN: opt < %t/memprof_
diff _inline.ll -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-print-match-info 2>&1 | FileCheck %s --check-prefixes=MEMPROF
+
+; MEMPROF: MemProf notcold context with id 10194276560488437434 has total profiled size 200 is matched with 1 frames
+; MEMPROF: MemProf cold context with id 16342802530253093571 has total profiled size 10000 is matched with 1 frames
+
+;--- memprof_
diff _inline.yaml
+---
+HeapProfileRecords:
+ - GUID: _Z3foov
+ AllocSites:
+ # Small non-cold, full context id 16342802530253093571, should ignore
+ - Callstack:
+ - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false }
+ MemInfoBlock:
+ AllocCount: 1
+ TotalSize: 10
+ TotalLifetime: 0
+ TotalLifetimeAccessDensity: 20000
+ # Large cold, full context id 16342802530253093571, should keep
+ - Callstack:
+ - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: true }
+ - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false }
+ MemInfoBlock:
+ AllocCount: 1
+ TotalSize: 10000
+ TotalLifetime: 200000
+ TotalLifetimeAccessDensity: 0
+ # Small non-cold, full context id 16342802530253093571, should ignore
+ - Callstack:
+ - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: true }
+ - { Function: main, LineOffset: 8, Column: 13, IsInlineFrame: false }
+ MemInfoBlock:
+ AllocCount: 1
+ TotalSize: 100
+ TotalLifetime: 0
+ TotalLifetimeAccessDensity: 20000
+ # Small non-cold, full context id 10194276560488437434
+ - Callstack:
+ - { Function: _Z3foov, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: _Z4foo2v, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: _Z3barv, LineOffset: 1, Column: 10, IsInlineFrame: false }
+ - { Function: main, LineOffset: 9, Column: 13, IsInlineFrame: false }
+ MemInfoBlock:
+ AllocCount: 1
+ TotalSize: 200
+ TotalLifetime: 0
+ TotalLifetimeAccessDensity: 20000
+ CallSites: []
+...
+;--- memprof_
diff _inline.ll
+; ModuleID = 'memprof_
diff _inline.cc'
+source_filename = "memprof_
diff _inline.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%"struct.std::nothrow_t" = type { i8 }
+
+ at _ZSt7nothrow = external global %"struct.std::nothrow_t", align 1
+
+define dso_local noundef ptr @_Z3foov() !dbg !10 {
+entry:
+ ; MEMPROF: call {{.*}} @_Znwm{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]]
+ %call = call noalias noundef align 32 ptr @_Znwm(i64 noundef 32) #6, !dbg !13
+ ret ptr %call
+}
+
+declare noundef ptr @_Znwm(i64 noundef)
+
+attributes #6 = { builtin allocsize(0) }
+
+; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]]}
+
+; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"notcold", ![[CONTEXTSIZE1:[0-9]+]]}
+; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 2061451396820446691}
+;; Full context id 10194276560488437434 == -8252467513221114182
+; MEMPROF: ![[CONTEXTSIZE1]] = !{i64 -8252467513221114182, i64 200}
+
+; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold", ![[CONTEXTSIZE2:[0-9]+]]}
+; MEMPROF: ![[STACK2]] = !{i64 2732490490862098848, i64 8467819354083268568, i64 9086428284934609951, i64 -5747251260480066785}
+;; Full context id 16342802530253093571 == -2103941543456458045
+;; We should have kept the large (cold) one.
+; MEMPROF: ![[CONTEXTSIZE2]] = !{i64 -2103941543456458045, i64 10000}
+
+; MEMPROF: ![[C1]] = !{i64 2732490490862098848}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 5, column: 10, scope: !10)
diff --git a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll
index 2461ca32e9821..ba53c5797208c 100644
--- a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll
@@ -4,24 +4,50 @@
;; Avoid failures on big-endian systems that can't read the profile properly
; REQUIRES: x86_64-linux
-;; TODO: Use text profile inputs once that is available for memprof.
-;; # To update the Inputs below, run Inputs/update_memprof_inputs.sh.
-;; # To generate below LLVM IR for use in matching.
-;; $ clang++ -gmlt -fdebug-info-for-profiling -S %S/Inputs/memprof_loop_unroll_b.cc -emit-llvm
+; Generate the profile and the IR.
+; RUN: split-file %s %t
+
+;; Generate indexed profile
+; RUN: llvm-profdata merge %t/memprof_loop_unroll.yaml -o %t.memprofdata
-; RUN: llvm-profdata merge %S/Inputs/memprof_loop_unroll.memprofraw --profiled-binary %S/Inputs/memprof_loop_unroll.exe -o %t.memprofdata
;; Set the minimum lifetime threshold to 0 to ensure that one context is
;; considered cold (the other will be notcold).
-; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s
+; RUN: opt < %t/memprof_loop_unroll.ll -passes='memprof-use<profile-filename=%t.memprofdata>' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s
-;; Conservatively annotate as not cold. We get two messages as there are two
-;; unrolled copies of the allocation.
-; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4
-; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and indistinguishable alloc type notcold: 4
+;; Conservatively annotate as not cold.
+; CHECK: MemProf hinting: Total size for full allocation context hash {{.*}} and single alloc type notcold: 4
; CHECK: call {{.*}} @_Znam{{.*}} #[[ATTR:[0-9]+]]
; CHECK: attributes #[[ATTR]] = { builtin allocsize(0) "memprof"="notcold" }
; CHECK-NOT: stackIds: ()
+;--- memprof_loop_unroll.yaml
+---
+HeapProfileRecords:
+ - GUID: 0x7f8d88fcc70a347b
+ AllocSites:
+ - Callstack:
+ - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false }
+ - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false }
+ MemInfoBlock:
+ AllocCount: 1
+ TotalSize: 4
+ TotalLifetime: 2
+ TotalLifetimeAccessDensity: 12500000000
+ - Callstack:
+ - { Function: 0x7f8d88fcc70a347b, LineOffset: 2, Column: 16, IsInlineFrame: false }
+ - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false }
+ MemInfoBlock:
+ AllocCount: 1
+ TotalSize: 4
+ TotalLifetime: 2
+ TotalLifetimeAccessDensity: 0
+ - GUID: 0xdb956436e78dd5fa
+ CallSites:
+ - Frames:
+ - { Function: 0xdb956436e78dd5fa, LineOffset: 1, Column: 5, IsInlineFrame: false }
+...
+
+;--- memprof_loop_unroll.ll
; ModuleID = 'memprof_loop_unroll_b.cc'
source_filename = "memprof_loop_unroll_b.cc"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
More information about the llvm-commits
mailing list