[llvm] [MemProf] Supporting hinting mostly-cold allocations after cloning (PR #120633)
Teresa Johnson via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 19 12:30:21 PST 2024
https://github.com/teresajohnson updated https://github.com/llvm/llvm-project/pull/120633
>From 6196acfdb264209a19228dd05bb821280d36945a Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 19 Dec 2024 12:14:31 -0800
Subject: [PATCH 1/2] [MemProf] Supporting hinting mostly-cold allocations
after cloning
Optionally unconditionally hint allocations as cold or not cold during
the cloning step if the percentage of cold bytes allocated is at least
the given threshold. This is similar to PR120301, which supports this
during matching, but enables the same behavior during cloning, to reduce
the false positives that can be addressed by cloning, at the cost of
carrying the additional size metadata/summary.
---
.../IPO/MemProfContextDisambiguation.cpp | 77 ++++++++++++++++---
.../Instrumentation/MemProfiler.cpp | 9 ++-
.../ThinLTO/X86/memprof-missing-callsite.ll | 32 +++++++-
llvm/test/Transforms/PGOProfile/memprof.ll | 4 +
4 files changed, 110 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index cd4b9f557fb494..59c183b105c24a 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -140,6 +140,7 @@ cl::opt<bool> MemProfRequireDefinitionForPromotion(
} // namespace llvm
extern cl::opt<bool> MemProfReportHintedSizes;
+extern cl::opt<unsigned> MinClonedColdBytePercent;
namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
@@ -617,6 +618,11 @@ class CallsiteContextGraph {
static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
}
+ /// Get the AllocationType assigned to the given allocation instruction clone.
+ AllocationType getAllocationCallType(const CallInfo &Call) const {
+ return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
+ }
+
/// Update non-allocation call to invoke (possibly cloned) function
/// CalleeFunc.
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
@@ -711,7 +717,8 @@ class CallsiteContextGraph {
/// Map from each contextID to the profiled full contexts and their total
/// sizes (there may be more than one due to context trimming),
- /// optionally populated when requested (via MemProfReportHintedSizes).
+ /// optionally populated when requested (via MemProfReportHintedSizes or
+ /// MinClonedColdBytePercent).
DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
/// Identifies the context node created for a stack id when adding the MIB
@@ -773,6 +780,7 @@ class ModuleCallsiteContextGraph
uint64_t getLastStackId(Instruction *Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+ AllocationType getAllocationCallType(const CallInfo &Call) const;
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
Instruction *>::FuncInfo
@@ -852,6 +860,7 @@ class IndexCallsiteContextGraph
uint64_t getLastStackId(IndexCall &Call);
std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+ AllocationType getAllocationCallType(const CallInfo &Call) const;
void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
IndexCall>::FuncInfo
@@ -1201,8 +1210,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
ContextIdToAllocationType[++LastContextId] = AllocType;
- if (MemProfReportHintedSizes) {
- assert(!ContextSizeInfo.empty());
+ if (!ContextSizeInfo.empty()) {
auto &Entry = ContextIdToContextSizeInfos[LastContextId];
Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
}
@@ -2043,14 +2051,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
EmptyContext;
unsigned I = 0;
- assert(!MemProfReportHintedSizes ||
- AN.ContextSizeInfos.size() == AN.MIBs.size());
+ assert(
+ (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
+ AN.ContextSizeInfos.size() == AN.MIBs.size());
// Now add all of the MIBs and their stack nodes.
for (auto &MIB : AN.MIBs) {
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
StackContext(&MIB);
std::vector<ContextTotalSize> ContextSizeInfo;
- if (MemProfReportHintedSizes) {
+ if (!AN.ContextSizeInfos.empty()) {
for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
ContextSizeInfo.push_back({FullStackId, TotalSize});
}
@@ -2825,6 +2834,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
if (!Node->IsAllocation)
continue;
DenseSet<uint32_t> ContextIds = Node->getContextIds();
+ auto AllocTypeFromCall = getAllocationCallType(Node->Call);
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
std::sort(SortedIds.begin(), SortedIds.end());
for (auto Id : SortedIds) {
@@ -2837,7 +2847,11 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
<< getAllocTypeString((uint8_t)TypeI->second)
<< " full allocation context " << Info.FullStackId
<< " with total size " << Info.TotalSize << " is "
- << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+ << getAllocTypeString(Node->AllocTypes) << " after cloning";
+ if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
+ OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
+ << " due to cold byte percent";
+ OS << "\n";
}
}
}
@@ -3487,6 +3501,23 @@ void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
}
+AllocationType
+ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+ const auto *CB = cast<CallBase>(Call.call());
+ if (!CB->getAttributes().hasFnAttr("memprof"))
+ return AllocationType::None;
+ return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
+ ? AllocationType::Cold
+ : AllocationType::NotCold;
+}
+
+AllocationType
+IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
+ const auto *AI = Call.call().dyn_cast<AllocInfo *>();
+ assert(AI->Versions.size() > Call.cloneNo());
+ return (AllocationType)AI->Versions[Call.cloneNo()];
+}
+
void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
FuncInfo CalleeFunc) {
if (CalleeFunc.cloneNo() > 0)
@@ -4017,6 +4048,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
}
}
+ uint8_t BothTypes =
+ (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
+
auto UpdateCalls = [&](ContextNode *Node,
DenseSet<const ContextNode *> &Visited,
auto &&UpdateCalls) {
@@ -4036,7 +4070,31 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
return;
if (Node->IsAllocation) {
- updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+ auto AT = allocTypeToUse(Node->AllocTypes);
+ // If the allocation type is ambiguous, and more aggressive hinting
+ // has been enabled via the MinClonedColdBytePercent flag, see if this
+ // allocation should be hinted cold anyway because its fraction cold bytes
+ // allocated is at least the given threshold.
+ if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
+ !ContextIdToContextSizeInfos.empty()) {
+ uint64_t TotalCold = 0;
+ uint64_t Total = 0;
+ for (auto Id : Node->getContextIds()) {
+ auto TypeI = ContextIdToAllocationType.find(Id);
+ assert(TypeI != ContextIdToAllocationType.end());
+ auto CSI = ContextIdToContextSizeInfos.find(Id);
+ if (CSI != ContextIdToContextSizeInfos.end()) {
+ for (auto &Info : CSI->second) {
+ Total += Info.TotalSize;
+ if (TypeI->second == AllocationType::Cold)
+ TotalCold += Info.TotalSize;
+ }
+ }
+ }
+ if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
+ AT = AllocationType::Cold;
+ }
+ updateAllocationCall(Node->Call, AT);
assert(Node->MatchingCalls.empty());
return;
}
@@ -4419,7 +4477,8 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// will still be none type or should have gotten the default NotCold.
// Skip that after calling clone helper since that does some sanity
// checks that confirm we haven't decided yet that we need cloning.
- if (AllocNode.Versions.size() == 1) {
+ if (AllocNode.Versions.size() == 1 &&
+ (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
assert((AllocationType)AllocNode.Versions[0] ==
AllocationType::NotCold ||
(AllocationType)AllocNode.Versions[0] ==
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index db8999911b7f92..9831b37fb47709 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -176,6 +176,10 @@ static cl::opt<bool>
cl::desc("Salvage stale MemProf profile"),
cl::init(false), cl::Hidden);
+cl::opt<unsigned> MinClonedColdBytePercent(
+ "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
+ cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));
+
extern cl::opt<bool> MemProfReportHintedSizes;
static cl::opt<unsigned> MinMatchedColdBytePercent(
@@ -755,7 +759,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie,
AllocInfo->Info.getAllocCount(),
AllocInfo->Info.getTotalLifetime());
std::vector<ContextTotalSize> ContextSizeInfo;
- if (MemProfReportHintedSizes) {
+ if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
auto TotalSize = AllocInfo->Info.getTotalSize();
assert(TotalSize);
assert(FullStackId != 0);
@@ -1126,7 +1130,8 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
InlinedCallStack)) {
NumOfMemProfMatchedAllocContexts++;
uint64_t FullStackId = 0;
- if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes)
+ if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes ||
+ MinClonedColdBytePercent < 100)
FullStackId = computeFullStackId(AllocInfo->CallStack);
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
TotalSize += AllocInfo->Info.getTotalSize();
diff --git a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
index e7fbb3c0bad2c0..a8002a74de965b 100644
--- a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
+++ b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
@@ -1,6 +1,7 @@
;; Test callsite context graph generation for simple call graph with
;; two memprof contexts and no inlining, where one callsite required for
-;; cloning is missing (e.g. unmatched).
+;; cloning is missing (e.g. unmatched). Use this to test aggressive hinting
+;; threshold.
;;
;; Original code looks like:
;;
@@ -30,6 +31,27 @@
; RUN: --check-prefix=SIZESUNHINTED
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --implicit-check-not "\"memprof\"=\"cold\""
+; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -supports-hot-cold-new \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-report-hinted-sizes -memprof-cloning-cold-threshold=80 \
+; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED \
+; RUN: --check-prefix=SIZESHINTED
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED
+
+; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -supports-hot-cold-new \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-cloning-cold-threshold=80 \
+; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKSHINTED
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED
+
source_filename = "memprof-missing-callsite.ll"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -68,3 +90,11 @@ attributes #0 = { noinline optnone }
; SIZESUNHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning
; SIZESUNHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning
; SIZESUNHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning
+
+; SIZESHINTED: NotCold full allocation context 123 with total size 100 is NotColdCold after cloning marked Cold due to cold byte percent
+; SIZESHINTED: Cold full allocation context 456 with total size 200 is NotColdCold after cloning marked Cold due to cold byte percent
+; SIZESHINTED: Cold full allocation context 789 with total size 300 is NotColdCold after cloning marked Cold due to cold byte percent
+
+; REMARKSHINTED: call in clone _Z3foov marked with memprof allocation attribute cold
+
+; IRHINTED: "memprof"="cold"
diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll
index 7e47c8ded4e4a7..c0e44cccbf16ff 100644
--- a/llvm/test/Transforms/PGOProfile/memprof.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof.ll
@@ -69,6 +69,10 @@
;; Check that we hint additional allocations with a threshold < 100%
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-matching-cold-threshold=60 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZESTHRESH60
+;; Make sure that the -memprof-cloning-cold-threshold flag is enough to cause
+;; the size metadata to be generated for the LTO link.
+; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES
+
;; Make sure we emit a random hotness seed if requested.
; RUN: llvm-profdata merge -memprof-random-hotness %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand 2>&1 | FileCheck %s --check-prefix=RAND
; RAND: random hotness seed =
>From abbf611519b8deef7e50d826655ed6a39eabac95 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 19 Dec 2024 12:28:22 -0800
Subject: [PATCH 2/2] Add a few more comments
---
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 3 +++
llvm/test/ThinLTO/X86/memprof-missing-callsite.ll | 3 +++
2 files changed, 6 insertions(+)
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 59c183b105c24a..1bf7ff468d782b 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -4477,6 +4477,9 @@ bool MemProfContextDisambiguation::applyImport(Module &M) {
// will still be none type or should have gotten the default NotCold.
// Skip that after calling clone helper since that does some sanity
// checks that confirm we haven't decided yet that we need cloning.
+ // We might have a single version that is cold due to the
+ // MinClonedColdBytePercent heuristic, make sure we don't skip in that
+ // case.
if (AllocNode.Versions.size() == 1 &&
(AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
assert((AllocationType)AllocNode.Versions[0] ==
diff --git a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
index a8002a74de965b..5ce786c63ac1a7 100644
--- a/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
+++ b/llvm/test/ThinLTO/X86/memprof-missing-callsite.ll
@@ -31,6 +31,7 @@
; RUN: --check-prefix=SIZESUNHINTED
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --implicit-check-not "\"memprof\"=\"cold\""
+;; Check that we do hint with a sufficient -memprof-cloning-cold-threshold.
; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
@@ -42,6 +43,8 @@
; RUN: --check-prefix=SIZESHINTED
; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IRHINTED
+;; Check again that we hint with a sufficient -memprof-cloning-cold-threshold,
+;; even if we don't specify -memprof-report-hinted-sizes.
; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
; RUN: -supports-hot-cold-new \
More information about the llvm-commits
mailing list