[llvm] fe27495 - [MemProf] Context disambiguation cloning pass [patch 1b/3]
Teresa Johnson via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 22 14:58:03 PDT 2023
Author: Teresa Johnson
Date: 2023-03-22T14:57:53-07:00
New Revision: fe27495be2040007c7b20844a9371b06156ab405
URL: https://github.com/llvm/llvm-project/commit/fe27495be2040007c7b20844a9371b06156ab405
DIFF: https://github.com/llvm/llvm-project/commit/fe27495be2040007c7b20844a9371b06156ab405.diff
LOG: [MemProf] Context disambiguation cloning pass [patch 1b/3]
Adds support for building the graph in ThinLTO from MemProf summaries.
Follow-on patches will contain the support for cloning on the graph and
in the IR.
Depends on D140908.
Differential Revision: https://reviews.llvm.org/D145836
Added:
llvm/test/ThinLTO/X86/memprof-basic.ll
llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll
llvm/test/ThinLTO/X86/memprof-indirectcall.ll
llvm/test/ThinLTO/X86/memprof-inlined.ll
llvm/test/ThinLTO/X86/memprof-inlined2.ll
Modified:
llvm/include/llvm/IR/ModuleSummaryIndex.h
llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
llvm/lib/LTO/LTO.cpp
llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 18853102799b4..0c178ccef3bbb 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -988,12 +988,22 @@ class FunctionSummary : public GlobalValueSummary {
return {};
}
+ CallsitesTy &mutableCallsites() {
+ assert(Callsites);
+ return *Callsites;
+ }
+
ArrayRef<AllocInfo> allocs() const {
if (Allocs)
return *Allocs;
return {};
}
+ AllocsTy &mutableAllocs() {
+ assert(Allocs);
+ return *Allocs;
+ }
+
friend struct GraphTraits<ValueInfo>;
};
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index 56e56ed67f7df..475ea48cca932 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -19,9 +19,12 @@
#include "llvm/ADT/StringSet.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/PassManager.h"
+#include <functional>
namespace llvm {
+class GlobalValueSummary;
class Module;
+class ModuleSummaryIndex;
class MemProfContextDisambiguation
: public PassInfoMixin<MemProfContextDisambiguation> {
@@ -32,6 +35,10 @@ class MemProfContextDisambiguation
MemProfContextDisambiguation() {}
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+ void run(ModuleSummaryIndex &Index,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ isPrevailing);
};
} // end namespace llvm
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 1f273a8e5025f..ee6b8c3aa234d 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -51,6 +51,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/Utils/FunctionImportUtils.h"
#include "llvm/Transforms/Utils/SplitModule.h"
@@ -75,6 +76,9 @@ cl::opt<bool> EnableLTOInternalization(
cl::desc("Enable global value internalization in LTO"));
}
+/// Enable MemProf context disambiguation for thin link.
+extern cl::opt<bool> EnableMemProfContextDisambiguation;
+
// Computes a unique hash for the Module considering the current list of
// export/import and other global analysis results.
// The hash is produced in \p Key.
@@ -1539,6 +1543,14 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs,
LocalWPDTargetsMap);
+ auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) {
+ return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
+ };
+ if (EnableMemProfContextDisambiguation) {
+ MemProfContextDisambiguation ContextDisambiguation;
+ ContextDisambiguation.run(ThinLTO.CombinedIndex, isPrevailing);
+ }
+
if (Conf.OptLevel > 0)
ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
ImportLists, ExportLists);
@@ -1580,10 +1592,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported,
LocalWPDTargetsMap);
- auto isPrevailing = [&](GlobalValue::GUID GUID,
- const GlobalValueSummary *S) {
- return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
- };
thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported,
isPrevailing);
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 5a6625743eecf..b2fcea1ec8694 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -14,9 +14,9 @@
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
-// (eventually) on a ThinLTO index (later applied to the IR during the ThinLTO
-// backend). Both types of LTO operate on a the same base graph representation,
-// which uses CRTP to support either IR or Index formats.
+// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
+// Both types of LTO operate on the same base graph representation, which
+// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//
@@ -28,9 +28,11 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
@@ -458,6 +460,56 @@ class ModuleCallsiteContextGraph
const Module &Mod;
};
+/// Represents a call in the summary index graph, which can either be an
+/// allocation or an interior callsite node in an allocation's context.
+/// Holds a pointer to the corresponding data structure in the index.
+struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
+ IndexCall() : PointerUnion() {}
+ IndexCall(std::nullptr_t) : IndexCall() {}
+ IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
+ IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
+
+ IndexCall *operator->() { return this; }
+
+ void print(raw_ostream &OS) const {
+ if (auto *AI = dyn_cast<AllocInfo *>())
+ OS << *AI;
+ else {
+ auto *CI = dyn_cast<CallsiteInfo *>();
+ assert(CI);
+ OS << *CI;
+ }
+ }
+};
+
+/// CRTP derived class for graphs built from summary index (ThinLTO).
+class IndexCallsiteContextGraph
+ : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+ IndexCall> {
+public:
+ IndexCallsiteContextGraph(
+ ModuleSummaryIndex &Index,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ isPrevailing);
+
+private:
+ friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+ IndexCall>;
+
+ uint64_t getStackId(uint64_t IdOrIndex) const;
+ bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
+ uint64_t getLastStackId(IndexCall &Call);
+ std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
+ std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
+ unsigned CloneNo) const;
+
+ // Saves mapping from function summaries containing memprof records back to
+ // its VI, for use in checking and debugging.
+ std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
+
+ const ModuleSummaryIndex &Index;
+};
+
namespace {
struct FieldSeparator {
@@ -475,6 +527,20 @@ raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
return OS << FS.Sep;
}
+// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
+// type we should actually use on the corresponding allocation.
+// If we can't clone a node that has NotCold+Cold alloc type, we will fall
+// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
+// from NotCold.
+AllocationType allocTypeToUse(uint8_t AllocTypes) {
+ assert(AllocTypes != (uint8_t)AllocationType::None);
+ if (AllocTypes ==
+ ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
+ return AllocationType::NotCold;
+ else
+ return (AllocationType)AllocTypes;
+}
+
} // end anonymous namespace
template <typename DerivedCCG, typename FuncTy, typename CallTy>
@@ -1118,6 +1184,20 @@ uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
return CallsiteContext.back();
}
+uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
+ assert(Call.is<CallsiteInfo *>());
+ CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
+ CallsiteContext(Call.dyn_cast<CallsiteInfo *>());
+ // Need to convert index into stack id.
+ return Index.getStackIdAtIndex(CallsiteContext.back());
+}
+
+static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
+ if (!CloneNo)
+ return Base.str();
+ return (Base + ".memprof." + Twine(CloneNo)).str();
+}
+
std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
const Instruction *Call,
unsigned CloneNo) const {
@@ -1126,6 +1206,22 @@ std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
.str();
}
+std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
+ const IndexCall &Call,
+ unsigned CloneNo) const {
+ auto VI = FSToVIMap.find(Func);
+ assert(VI != FSToVIMap.end());
+ if (Call.is<AllocInfo *>())
+ return (VI->second.name() + " -> alloc").str();
+ else {
+ auto *Callsite = Call.dyn_cast<CallsiteInfo *>();
+ return (VI->second.name() + " -> " +
+ getMemProfFuncName(Callsite->Callee.name(),
+ Callsite->Clones[CloneNo]))
+ .str();
+ }
+}
+
std::vector<uint64_t>
ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
Instruction *Call) {
@@ -1135,6 +1231,16 @@ ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
CallsiteContext);
}
+std::vector<uint64_t>
+IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
+ assert(Call.is<CallsiteInfo *>());
+ CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
+ CallsiteContext(Call.dyn_cast<CallsiteInfo *>());
+ return getStackIdsWithContextNodes<CallsiteInfo,
+ SmallVector<unsigned>::const_iterator>(
+ CallsiteContext);
+}
+
template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
std::vector<uint64_t>
@@ -1207,6 +1313,84 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) {
Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
}
+IndexCallsiteContextGraph::IndexCallsiteContextGraph(
+ ModuleSummaryIndex &Index,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ isPrevailing)
+ : Index(Index) {
+ for (auto &I : Index) {
+ auto VI = Index.getValueInfo(I);
+ for (auto &S : VI.getSummaryList()) {
+ // We should only add the prevailing nodes. Otherwise we may try to clone
+ // in a weak copy that won't be linked (and may be different than the
+ // prevailing version).
+ // We only keep the memprof summary on the prevailing copy now when
+ // building the combined index, as a space optimization, however don't
+ // rely on this optimization. The linker doesn't resolve local linkage
+ // values so don't check whether those are prevailing.
+ if (!GlobalValue::isLocalLinkage(S->linkage()) &&
+ !isPrevailing(VI.getGUID(), S.get()))
+ continue;
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ std::vector<CallInfo> CallsWithMetadata;
+ if (!FS->allocs().empty()) {
+ for (auto &AN : FS->mutableAllocs()) {
+ // This can happen because of recursion elimination handling that
+ // currently exists in ModuleSummaryAnalysis. Skip these for now.
+ // We still added them to the summary because we need to be able to
+ // correlate properly in applyImport in the backends.
+ if (AN.MIBs.empty())
+ continue;
+ CallsWithMetadata.push_back({&AN});
+ auto *AllocNode = addAllocNode({&AN}, FS);
+ // Pass an empty CallStack to the CallsiteContext (second)
+ // parameter, since for ThinLTO we already collapsed out the inlined
+ // stack ids on the allocation call during ModuleSummaryAnalysis.
+ CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
+ EmptyContext;
+ // Now add all of the MIBs and their stack nodes.
+ for (auto &MIB : AN.MIBs) {
+ CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
+ StackContext(&MIB);
+ addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
+ AllocNode, StackContext, EmptyContext, MIB.AllocType);
+ }
+ assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
+ // Initialize version 0 on the summary alloc node to the current alloc
+ // type, unless it has both types in which case make it default, so
+ // that in the case where we aren't able to clone the original version
+ // always ends up with the default allocation behavior.
+ AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
+ }
+ }
+ // For callsite metadata, add to list for this function for later use.
+ if (!FS->callsites().empty())
+ for (auto &SN : FS->mutableCallsites())
+ CallsWithMetadata.push_back({&SN});
+
+ if (!CallsWithMetadata.empty())
+ FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata});
+
+ if (!FS->allocs().empty() || !FS->callsites().empty())
+ FSToVIMap[FS] = VI;
+ }
+ }
+
+ if (DumpCCG) {
+ dbgs() << "CCG before updating call stack chains:\n";
+ dbgs() << *this;
+ }
+
+ if (ExportToDot)
+ exportToDot("prestackupdate");
+
+ updateStackNodes();
+
+ handleCallsitesWithMultipleTargets();
+}
+
template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy,
CallTy>::handleCallsitesWithMultipleTargets() {
@@ -1251,6 +1435,12 @@ uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
return IdOrIndex;
}
+uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
+ // In the Index case this is an index into the stack id list in the summary
+ // index, convert it to an Id.
+ return Index.getStackIdAtIndex(IdOrIndex);
+}
+
bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call,
const Function *Func) {
auto *CB = dyn_cast<CallBase>(Call);
@@ -1264,6 +1454,23 @@ bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call,
return Alias && Alias->getAliasee() == Func;
}
+bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call,
+ const FunctionSummary *Func) {
+ ValueInfo Callee = Call.dyn_cast<CallsiteInfo *>()->Callee;
+ // If there is no summary list then this is a call to an externally defined
+ // symbol.
+ AliasSummary *Alias =
+ Callee.getSummaryList().empty()
+ ? nullptr
+ : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
+ assert(FSToVIMap.count(Func));
+ return Callee == FSToVIMap[Func] ||
+ // If callee is an alias, check the aliasee, since only function
+ // summary base objects will contain the stack node summaries and thus
+ // get a context node.
+ (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]);
+}
+
static std::string getAllocTypeString(uint8_t AllocTypes) {
if (!AllocTypes)
return "None";
@@ -1581,3 +1788,11 @@ PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
+
+void MemProfContextDisambiguation::run(
+ ModuleSummaryIndex &Index,
+ function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+ isPrevailing) {
+ IndexCallsiteContextGraph CCG(Index, isPrevailing);
+ CCG.process();
+}
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
new file mode 100644
index 0000000000000..d8c78d270f277
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -0,0 +1,157 @@
+;; Test callsite context graph generation for simple call graph with
+;; two memprof contexts and no inlining.
+;;
+;; Original code looks like:
+;;
+;; char *bar() {
+;; return new char[10];
+;; }
+;;
+;; char *baz() {
+;; return bar();
+;; }
+;;
+;; char *foo() {
+;; return baz();
+;; }
+;;
+;; int main(int argc, char **argv) {
+;; char *x = foo();
+;; char *y = foo();
+;; memset(x, 0, 10);
+;; memset(y, 0, 10);
+;; delete[] x;
+;; sleep(10);
+;; delete[] y;
+;; return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+
+
+source_filename = "memprof-basic.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() {
+entry:
+ %call = call ptr @_Z3foov(), !callsite !0
+ %call1 = call ptr @_Z3foov(), !callsite !1
+ ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+define internal ptr @_Z3barv() {
+entry:
+ %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7
+ ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define internal ptr @_Z3bazv() {
+entry:
+ %call = call ptr @_Z3barv(), !callsite !8
+ ret ptr null
+}
+
+define internal ptr @_Z3foov() {
+entry:
+ %call = call ptr @_Z3bazv(), !callsite !9
+ ret ptr null
+}
+
+; uselistorder directives
+uselistorder ptr @_Z3foov, { 1, 0 }
+
+!0 = !{i64 8632435727821051414}
+!1 = !{i64 -3421689549917153178}
+!2 = !{!3, !5}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!5 = !{!6, !"cold"}
+!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+!7 = !{i64 9086428284934609951}
+!8 = !{i64 -5964873800580613432}
+!9 = !{i64 2732490490862098848}
+
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[BAR:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 2, 3, 0
+; DUMP: AllocType 2 StackIds: 2, 3, 1
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+
+; DUMP: Node [[BAZ]]
+; DUMP: Callee: 9832687305761716512 (_Z3barv) Clones: 0 StackIds: 2 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+
+; DUMP: Node [[FOO]]
+; DUMP: Callee: 5878270615442837395 (_Z3bazv) Clones: 0 StackIds: 3 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+
+; DUMP: Node [[MAIN1]]
+; DUMP: Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 0 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1
+; DUMP: CallerEdges:
+
+; DUMP: Node [[MAIN2]]
+; DUMP: Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 1 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2
+; DUMP: CallerEdges:
+
+
+; DOT: digraph "postbuild" {
+; DOT: label="postbuild";
+; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
+; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"];
+; DOT: Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"];
+; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"];
+; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"];
+; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"];
+; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"];
+; DOT: }
diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
new file mode 100644
index 0000000000000..772b319e0715e
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
@@ -0,0 +1,229 @@
+;; Test callsite context graph generation for call graph with MIBs
+;; that have pruned contexts that partially match multiple inlined
+;; callsite contexts, requiring duplication of context ids and nodes
+;; while matching callsite nodes onto the graph.
+;;
+;; Original code looks like:
+;;
+;; char *D() {
+;; return new char[10];
+;; }
+;;
+;; char *F() {
+;; return D();
+;; }
+;;
+;; char *C() {
+;; return D();
+;; }
+;;
+;; char *B() {
+;; return C();
+;; }
+;;
+;; char *E() {
+;; return C();
+;; }
+;; int main(int argc, char **argv) {
+;; char *x = B(); // cold
+;; char *y = E(); // cold
+;; char *z = F(); // default
+;; memset(x, 0, 10);
+;; memset(y, 0, 10);
+;; memset(z, 0, 10);
+;; delete[] z;
+;; sleep(10);
+;; delete[] x;
+;; delete[] y;
+;; return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The code below was created by forcing inlining of C into both B and E.
+;; Since both allocation contexts via C are cold, the matched memprof
+;; metadata has the context pruned above C's callsite. This requires
+;; matching the stack node for C to callsites where it was inlined (i.e.
+;; the callsites in B and E that have callsite metadata that includes C's).
+;; It also requires duplication of that node in the graph as well as the
+;; duplication of the context ids along that path through the graph,
+;; so that we can represent the duplicated (via inlining) C callsite.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; RUN: cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
+
+
+source_filename = "duplicate-context-ids.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal ptr @_Z1Dv() {
+entry:
+ %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
+ ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define internal ptr @_Z1Fv() {
+entry:
+ %call = call ptr @_Z1Dv(), !callsite !6
+ ret ptr null
+}
+
+define internal ptr @_Z1Cv() {
+entry:
+ %call = call ptr @_Z1Dv(), !callsite !7
+ ret ptr null
+}
+
+define internal ptr @_Z1Bv() {
+entry:
+ %call.i = call ptr @_Z1Dv(), !callsite !8
+ ret ptr null
+}
+
+define internal ptr @_Z1Ev() {
+entry:
+ %call.i = call ptr @_Z1Dv(), !callsite !9
+ ret ptr null
+}
+
+declare i32 @main()
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+!0 = !{!1, !3}
+!1 = !{!2, !"cold"}
+!2 = !{i64 6541423618768552252, i64 -6270142974039008131}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 6541423618768552252, i64 -4903163940066524832}
+!5 = !{i64 6541423618768552252}
+!6 = !{i64 -4903163940066524832}
+!7 = !{i64 -6270142974039008131}
+!8 = !{i64 -6270142974039008131, i64 -184525619819294889}
+!9 = !{i64 -6270142974039008131, i64 1905834578520680781}
+
+
+;; After adding only the alloc node memprof metadata, we only have 2 contexts.
+
+; DUMP: CCG before updating call stack chains:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[D:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 2 StackIds: 0
+; DUMP: AllocType 1 StackIds: 1
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1
+; DUMP: Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
+
+; DUMP: Node [[C]]
+; DUMP: null Call
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1
+; DUMP: CallerEdges:
+
+; DUMP: Node [[F]]
+; DUMP: null Call
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2
+; DUMP: CallerEdges:
+
+;; After updating for callsite metadata, we should have generated context ids 3 and 4,
+;; along with 2 new nodes for those callsites. All have the same allocation type
+;; behavior as the original C node.
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[D]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 2 StackIds: 0
+; DUMP: AllocType 1 StackIds: 1
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2 3 4
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2
+; DUMP: Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3
+; DUMP: Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
+; DUMP: Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1
+
+; DUMP: Node [[F]]
+; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 1 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2
+; DUMP: CallerEdges:
+
+; DUMP: Node [[C2]]
+; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 3
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3
+; DUMP: CallerEdges:
+
+; DUMP: Node [[B]]
+; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 2 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4
+; DUMP: CallerEdges:
+
+; DUMP: Node [[E]]
+; DUMP: Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 3 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1
+; DUMP: CallerEdges:
+
+
+; DOTPRE: digraph "prestackupdate" {
+; DOTPRE: label="prestackupdate";
+; DOTPRE: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
+; DOTPRE: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12176601099670543485\nnull call (external)}"];
+; DOTPRE: Node[[C]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"];
+; DOTPRE: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\nnull call (external)}"];
+; DOTPRE: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"];
+; DOTPRE: }
+
+
+; DOTPOST:digraph "postbuild" {
+; DOTPOST: label="postbuild";
+; DOTPOST: Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
+; DOTPOST: Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"];
+; DOTPOST: Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"];
+; DOTPOST: Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"];
+; DOTPOST: Node[[C]] -> Node[[D]][tooltip="ContextIds: 3",fillcolor="cyan"];
+; DOTPOST: Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"];
+; DOTPOST: Node[[B]] -> Node[[D]][tooltip="ContextIds: 4",fillcolor="cyan"];
+; DOTPOST: Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"];
+; DOTPOST: Node[[E]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"];
+; DOTPOST:}
diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll
new file mode 100644
index 0000000000000..af7dece9421a9
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll
@@ -0,0 +1,390 @@
+;; Test callsite context graph generation for call graph with MIBs
+;; that have pruned contexts that partially match multiple inlined
+;; callsite contexts, requiring duplication of context ids and nodes
+;; while matching callsite nodes onto the graph. This test requires more
+;; complex duplication due to multiple contexts for different allocations
+;; that share some of the same callsite nodes.
+;;
+;; Original code looks like:
+;;
+;; char *D(bool Call1) {
+;; if (Call1)
+;; return new char[10];
+;; else
+;; return new char[10];
+;; }
+;;
+;; char *C(bool Call1) {
+;; return D(Call1);
+;; }
+;;
+;; char *B(bool Call1) {
+;; if (Call1)
+;; return C(true);
+;; else
+;; return C(false);
+;; }
+;;
+;; char *A(bool Call1) {
+;; return B(Call1);
+;; }
+;;
+;; char *A1() {
+;; return A(true);
+;; }
+;;
+;; char *A2() {
+;; return A(true);
+;; }
+;;
+;; char *A3() {
+;; return A(false);
+;; }
+;;
+;; char *A4() {
+;; return A(false);
+;; }
+;;
+;; char *E() {
+;; return B(true);
+;; }
+;;
+;; char *F() {
+;; return B(false);
+;; }
+;;
+;; int main(int argc, char **argv) {
+;; char *a1 = A1(); // cold
+;; char *a2 = A2(); // cold
+;; char *e = E(); // default
+;; char *a3 = A3(); // default
+;; char *a4 = A4(); // default
+;; char *f = F(); // cold
+;; memset(a1, 0, 10);
+;; memset(a2, 0, 10);
+;; memset(e, 0, 10);
+;; memset(a3, 0, 10);
+;; memset(a4, 0, 10);
+;; memset(f, 0, 10);
+;; delete[] a3;
+;; delete[] a4;
+;; delete[] e;
+;; sleep(10);
+;; delete[] a1;
+;; delete[] a2;
+;; delete[] f;
+;; return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The code below was created by forcing inlining of A into its callers,
+;; without any other inlining or optimizations. Since both allocation contexts
+;; via A for each allocation in D have the same allocation type (cold via
+;; A1 and A2 for the first new in D, and non-cold via A3 and A4 for the second
+;; new in D), the contexts for those respective allocations are pruned above A.
+;; The allocations via E and F are to ensure we don't prune above B.
+;;
+;; The matching onto the inlined A[1234]->A sequences will require duplication
+;; of the context id assigned to the context from A for each allocation in D.
+;; This test ensures that we do this correctly in the presence of callsites
+;; shared by the different duplicated context ids (i.e. callsite in C).
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_Z1Db,plx \
+; RUN: -r=%t.o,_Z1Cb,plx \
+; RUN: -r=%t.o,_Z1Bb,plx \
+; RUN: -r=%t.o,_Z1Ab,plx \
+; RUN: -r=%t.o,_Z2A1v,plx \
+; RUN: -r=%t.o,_Z2A2v,plx \
+; RUN: -r=%t.o,_Z2A3v,plx \
+; RUN: -r=%t.o,_Z2A4v,plx \
+; RUN: -r=%t.o,_Z1Ev,plx \
+; RUN: -r=%t.o,_Z1Fv,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Db(i1 %Call1) {
+entry:
+ %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
+ br label %return
+
+if.else: ; No predecessors!
+ %call1 = call ptr @_Znam(i64 0), !memprof !6, !callsite !11
+ br label %return
+
+return: ; preds = %if.else, %entry
+ ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define ptr @_Z1Cb(i1 %Call1) {
+entry:
+ %call = call ptr @_Z1Db(i1 false), !callsite !12
+ ret ptr null
+}
+
+define ptr @_Z1Bb(i1 %Call1) {
+entry:
+ %call = call ptr @_Z1Cb(i1 false), !callsite !13
+ br label %return
+
+if.else: ; No predecessors!
+ %call1 = call ptr @_Z1Cb(i1 false), !callsite !14
+ br label %return
+
+return: ; preds = %if.else, %entry
+ ret ptr null
+}
+
+define ptr @_Z1Ab() {
+entry:
+ %call = call ptr @_Z1Bb(i1 false), !callsite !15
+ ret ptr null
+}
+
+define ptr @_Z2A1v() {
+entry:
+ %call.i = call ptr @_Z1Bb(i1 false), !callsite !16
+ ret ptr null
+}
+
+define ptr @_Z2A2v() {
+entry:
+ %call.i = call ptr @_Z1Bb(i1 false), !callsite !17
+ ret ptr null
+}
+
+define ptr @_Z2A3v() {
+entry:
+ %call.i = call ptr @_Z1Bb(i1 false), !callsite !18
+ ret ptr null
+}
+
+define ptr @_Z2A4v() {
+entry:
+ %call.i = call ptr @_Z1Bb(i1 false), !callsite !19
+ ret ptr null
+}
+
+define ptr @_Z1Ev() {
+entry:
+ %call = call ptr @_Z1Bb(i1 false), !callsite !20
+ ret ptr null
+}
+
+define ptr @_Z1Fv() {
+entry:
+ %call = call ptr @_Z1Bb(i1 false), !callsite !21
+ ret ptr null
+}
+
+declare i32 @main()
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0 }
+
+!0 = !{!1, !3}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 1905834578520680781}
+!3 = !{!4, !"cold"}
+!4 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 -6528110295079665978}
+!5 = !{i64 4854880825882961848}
+!6 = !{!7, !9}
+!7 = !{!8, !"notcold"}
+!8 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -6528110295079665978}
+!9 = !{!10, !"cold"}
+!10 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -4903163940066524832}
+!11 = !{i64 -8775068539491628272}
+!12 = !{i64 -904694911315397047}
+!13 = !{i64 6532298921261778285}
+!14 = !{i64 7859682663773658275}
+!15 = !{i64 -6528110295079665978}
+!16 = !{i64 -6528110295079665978, i64 5747919905719679568}
+!17 = !{i64 -6528110295079665978, i64 -5753238080028016843}
+!18 = !{i64 -6528110295079665978, i64 1794685869326395337}
+!19 = !{i64 -6528110295079665978, i64 5462047985461644151}
+!20 = !{i64 1905834578520680781}
+!21 = !{i64 -4903163940066524832}
+
+
+;; After adding only the alloc node memprof metadata, we only have 4 contexts (we only
+;; match the interesting parts of the pre-update graph here).
+
+; DUMP: CCG before updating call stack chains:
+; DUMP: Callsite Context Graph:
+
+; DUMP: Node [[D1:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 0, 1, 2
+; DUMP: AllocType 2 StackIds: 0, 1, 3
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+
+; DUMP: Node [[C:0x[a-z0-9]+]]
+; DUMP: null Call
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2 3 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: Edge from Callee [[D2:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4
+
+; DUMP: Node [[D2]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 0, 4, 3
+; DUMP: AllocType 2 StackIds: 0, 4, 5
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 3 4
+
+
+;; After updating for callsite metadata, we should have duplicated the context
+;; ids coming from node A (2 and 3) 4 times, for the 4 different callers of A,
+;; and used those on new nodes for those callers. Note that while in reality
+;; we only have cold edges coming from A1 and A2 and noncold from A3 and A4,
+;; due to the pruning we have lost this information and thus end up duplicating
+;; both of A's contexts to all of the new nodes (which could result in some
+;; unnecessary cloning).
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[D1]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 0, 1, 2
+; DUMP: AllocType 2 StackIds: 0, 1, 3
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2 5 7 9 11
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11
+
+; DUMP: Node [[C]]
+; DUMP: Callee: 11485875876353461977 (_Z1Db) Clones: 0 StackIds: 0 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2 3 4 5 6 7 8 9 10 11 12
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11
+; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[C]] to Caller: [[B1:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11
+; DUMP: Edge from Callee [[C]] to Caller: [[B2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12
+
+; DUMP: Node [[B1]]
+; DUMP: Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 1 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2 5 7 9 11
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[C]] to Caller: [[B1]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[B1]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: Edge from Callee [[B1]] to Caller: [[A2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 5
+; DUMP: Edge from Callee [[B1]] to Caller: [[A3:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 7
+; DUMP: Edge from Callee [[B1]] to Caller: [[A1:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 9
+; DUMP: Edge from Callee [[B1]] to Caller: [[A4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 11
+; DUMP: Edge from Callee [[B1]] to Caller: [[A:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+
+; DUMP: Node [[E]]
+; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 2 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[B1]] to Caller: [[E]] AllocTypes: NotCold ContextIds: 1
+; DUMP: CallerEdges:
+
+; DUMP: Node [[D2]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 0, 4, 3
+; DUMP: AllocType 2 StackIds: 0, 4, 5
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 3 4 6 8 10 12
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12
+
+; DUMP: Node [[B2]]
+; DUMP: Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 4 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 3 4 6 8 10 12
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[C]] to Caller: [[B2]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[B2]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
+; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6
+; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8
+; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10
+; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12
+; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3
+
+; DUMP: Node [[F]]
+; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 5 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[B2]] to Caller: [[F]] AllocTypes: Cold ContextIds: 4
+; DUMP: CallerEdges:
+
+; DUMP: Node [[A2]]
+; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 7 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 5 6
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[B1]] to Caller: [[A2]] AllocTypes: Cold ContextIds: 5
+; DUMP: Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6
+; DUMP: CallerEdges:
+
+; DUMP: Node [[A3]]
+; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 8 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 7 8
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[B1]] to Caller: [[A3]] AllocTypes: Cold ContextIds: 7
+; DUMP: Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8
+; DUMP: CallerEdges:
+
+; DUMP: Node [[A1]]
+; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 9 10
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[B1]] to Caller: [[A1]] AllocTypes: Cold ContextIds: 9
+; DUMP: Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10
+; DUMP: CallerEdges:
+
+; DUMP: Node [[A4]]
+; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 9 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 11 12
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[B1]] to Caller: [[A4]] AllocTypes: Cold ContextIds: 11
+; DUMP: Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12
+; DUMP: CallerEdges:
+
+; DUMP: Node [[A]]
+; DUMP: Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 6 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 2 3
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[B1]] to Caller: [[A]] AllocTypes: Cold ContextIds: 2
+; DUMP: Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3
+; DUMP: CallerEdges:
diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
new file mode 100644
index 0000000000000..30c8bd27f37b7
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
@@ -0,0 +1,266 @@
+;; Tests callsite context graph generation for call graph containing indirect
+;; calls. Currently this should result in conservative behavior, such that the
+;; indirect call receives a null call in its graph node, to prevent subsequent
+;; cloning.
+;;
+;; Original code looks like:
+;;
+;; char *foo() {
+;; return new char[10];
+;; }
+;; class A {
+;; public:
+;; virtual char *x() { return foo(); }
+;; };
+;; class B : public A {
+;; public:
+;; char *x() final { return foo(); }
+;; };
+;; char *bar(A *a) {
+;; return a->x();
+;; }
+;; int main(int argc, char **argv) {
+;; char *x = foo();
+;; char *y = foo();
+;; B b;
+;; char *z = bar(&b);
+;; char *w = bar(&b);
+;; A a;
+;; char *r = bar(&a);
+;; char *s = bar(&a);
+;; memset(x, 0, 10);
+;; memset(y, 0, 10);
+;; memset(z, 0, 10);
+;; memset(w, 0, 10);
+;; memset(r, 0, 10);
+;; memset(s, 0, 10);
+;; delete[] x;
+;; delete[] w;
+;; delete[] r;
+;; sleep(10);
+;; delete[] y;
+;; delete[] z;
+;; delete[] s;
+;; return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; Compiled without optimization to prevent inlining and devirtualization.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \
+; RUN: -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+
+
+source_filename = "indirectcall.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr
+@_ZTVN10__cxxabiv117__class_type_infoE = external global ptr
+
+define internal ptr @_Z3barP1A(ptr %a) {
+entry:
+ ret ptr null
+}
+
+define i32 @main() {
+entry:
+ %call = call ptr @_Z3foov(), !callsite !0
+ %call1 = call ptr @_Z3foov(), !callsite !1
+ %call2 = call ptr @_Z3barP1A(ptr null), !callsite !2
+ %call3 = call ptr @_Z3barP1A(ptr null), !callsite !3
+ %call4 = call ptr @_Z3barP1A(ptr null), !callsite !4
+ %call5 = call ptr @_Z3barP1A(ptr null), !callsite !5
+ ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+define internal ptr @_ZN1A1xEv() {
+entry:
+ %call = call ptr @_Z3foov(), !callsite !6
+ ret ptr null
+}
+
+define internal ptr @_ZN1B1xEv() {
+entry:
+ %call = call ptr @_Z3foov(), !callsite !7
+ ret ptr null
+}
+
+define internal ptr @_Z3foov() {
+entry:
+ %call = call ptr @_Znam(i64 0), !memprof !8, !callsite !21
+ ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+; uselistorder directives
+uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
+
+!0 = !{i64 8632435727821051414}
+!1 = !{i64 -3421689549917153178}
+!2 = !{i64 6792096022461663180}
+!3 = !{i64 -2709642582978494015}
+!4 = !{i64 748269490701775343}
+!5 = !{i64 -5747251260480066785}
+!6 = !{i64 8256774051149711748}
+!7 = !{i64 -4831879094954754638}
+!8 = !{!9, !11, !13, !15, !17, !19}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343}
+!11 = !{!12, !"cold"}
+!12 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 2732490490862098848, i64 8632435727821051414}
+!15 = !{!16, !"cold"}
+!16 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180}
+!17 = !{!18, !"notcold"}
+!18 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015}
+!19 = !{!20, !"cold"}
+!20 = !{i64 2732490490862098848, i64 -3421689549917153178}
+!21 = !{i64 2732490490862098848}
+
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[FOO:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 6, 8, 4
+; DUMP: AllocType 2 StackIds: 6, 8, 5
+; DUMP: AllocType 1 StackIds: 0
+; DUMP: AllocType 2 StackIds: 7, 8, 2
+; DUMP: AllocType 1 StackIds: 7, 8, 3
+; DUMP: AllocType 2 StackIds: 1
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2 3 4 5 6
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6
+
+; DUMP: Node [[AX]]
+; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 6 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+
+;; Bar contains an indirect call, with multiple targets. Its call should be null.
+; DUMP: Node [[BAR]]
+; DUMP: null Call
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2 4 5
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5
+
+; DUMP: Node [[MAIN3]]
+; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 4 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1
+; DUMP: CallerEdges:
+
+; DUMP: Node [[MAIN4]]
+; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 5 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2
+; DUMP: CallerEdges:
+
+; DUMP: Node [[MAIN1]]
+; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 0 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 3
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3
+; DUMP: CallerEdges:
+
+; DUMP: Node [[BX]]
+; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 7 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 4 5
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5
+
+; DUMP: Node [[MAIN5]]
+; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 2 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4
+; DUMP: CallerEdges:
+
+; DUMP: Node [[MAIN6]]
+; DUMP: Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 3 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 5
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5
+; DUMP: CallerEdges:
+
+; DUMP: Node [[MAIN2]]
+; DUMP: Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 1 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 6
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6
+; DUMP: CallerEdges:
+
+
+; DOT: digraph "postbuild" {
+; DOT: label="postbuild";
+; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"];
+; DOT: Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"];
+; DOT: Node[[AX]] -> Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"];
+; DOT: Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"];
+; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"];
+; DOT: Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"];
+; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"];
+; DOT: Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan"];
+; DOT: Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"];
+; DOT: Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"];
+; DOT: Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"];
+; DOT: Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"];
+; DOT: Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"];
+; DOT: Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan"];
+; DOT: Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"];
+; DOT: Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1"];
+; DOT: Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"];
+; DOT: Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan"];
+; DOT: }
diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll
new file mode 100644
index 0000000000000..89cd878e99fb4
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll
@@ -0,0 +1,186 @@
+;; Test callsite context graph generation for call graph with two memprof
+;; contexts and partial inlining, requiring generation of a new fused node to
+;; represent the inlined sequence while matching callsite nodes onto the graph.
+;;
+;; Original code looks like:
+;;
+;; char *bar() {
+;; return new char[10];
+;; }
+;;
+;; char *baz() {
+;; return bar();
+;; }
+;;
+;; char *foo() {
+;; return baz();
+;; }
+;;
+;; int main(int argc, char **argv) {
+;; char *x = foo();
+;; char *y = foo();
+;; memset(x, 0, 10);
+;; memset(y, 0, 10);
+;; delete[] x;
+;; sleep(10);
+;; delete[] y;
+;; return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The code below was created by forcing inlining of baz into foo, and
+;; bar into baz. Due to the inlining of bar we will initially have two
+;; allocation nodes in the graph. This tests that we correctly match
+;; foo (with baz inlined) onto the graph nodes first, and generate a new
+;; fused node for it. We should then not match baz (with bar inlined) as that
+;; is not reached by the MIB contexts (since all calls from main will look
+;; like main -> foo(+baz) -> bar after the inlining reflected in this IR).
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+
+
+source_filename = "inlined.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal ptr @_Z3barv() {
+entry:
+ %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
+ ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define internal ptr @_Z3bazv() {
+entry:
+ %call.i = call ptr @_Znam(i64 0), !memprof !0, !callsite !6
+ ret ptr null
+}
+
+define internal ptr @_Z3foov() {
+entry:
+ %call.i = call ptr @_Z3barv(), !callsite !7
+ ret ptr null
+}
+
+define i32 @main() {
+entry:
+ %call = call ptr @_Z3foov(), !callsite !8
+ %call1 = call ptr @_Z3foov(), !callsite !9
+ ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+!0 = !{!1, !3}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!3 = !{!4, !"cold"}
+!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+!5 = !{i64 9086428284934609951}
+!6 = !{i64 9086428284934609951, i64 -5964873800580613432}
+!7 = !{i64 -5964873800580613432, i64 2732490490862098848}
+!8 = !{i64 8632435727821051414}
+!9 = !{i64 -3421689549917153178}
+
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+
+; DUMP: Node [[BAZ:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 1, 2
+; DUMP: AllocType 2 StackIds: 1, 3
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+
+;; This is leftover from the MIB on the alloc inlined into baz. It is not
+;; matched with any call, since there is no such node in the IR. Due to the
+;; null call it will not participate in any context transformations.
+; DUMP: Node [[FOO2]]
+; DUMP: null Call
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+
+; DUMP: Node [[MAIN1]]
+; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 2 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 1 3
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1
+; DUMP: Edge from Callee [[FOO:0x[a-z0-9]+]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3
+; DUMP: CallerEdges:
+
+; DUMP: Node [[MAIN2]]
+; DUMP: Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 3 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 2 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4
+; DUMP: CallerEdges:
+
+; DUMP: Node [[BAR:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 0, 1, 2
+; DUMP: AllocType 2 StackIds: 0, 1, 3
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 3 4
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4
+
+;; This is the node synthesized for the call to bar in foo that was created
+;; by inlining baz into foo.
+; DUMP: Node [[FOO]]
+; DUMP: Callee: 16064618363798697104 (_Z3barv) Clones: 0 StackIds: 0, 1 (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 3 4
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3
+; DUMP: Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4
+
+
+; DOT: digraph "postbuild" {
+; DOT: label="postbuild";
+; DOT: Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"];
+; DOT: Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"];
+; DOT: Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"];
+; DOT: Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"];
+; DOT: Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 3",fillcolor="brown1"];
+; DOT: Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"];
+; DOT: Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"];
+; DOT: Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 4",fillcolor="cyan"];
+; DOT: Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc2\n_Z3barv -\> alloc}"];
+; DOT: Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"];
+; DOT: Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1"];
+; DOT: }
diff --git a/llvm/test/ThinLTO/X86/memprof-inlined2.ll b/llvm/test/ThinLTO/X86/memprof-inlined2.ll
new file mode 100644
index 0000000000000..1ffae8cd59cef
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-inlined2.ll
@@ -0,0 +1,124 @@
+;; Test callsite context graph generation for call graph with two memprof
+;; contexts and multiple levels of inlining, requiring generation of new
+;; fused nodes to represent the inlined sequence while matching callsite
+;; nodes onto the graph. In particular this tests the case where a function
+;; has inlined a callee containing an inlined callee.
+;;
+;; Original code looks like:
+;;
+;; char *bar() __attribute__((noinline)) {
+;; return new char[10];
+;; }
+;;
+;; char *baz() {
+;; return bar();
+;; }
+;;
+;; char *foo() {
+;; return baz();
+;; }
+;;
+;; int main(int argc, char **argv) {
+;; char *x = foo();
+;; char *y = foo();
+;; memset(x, 0, 10);
+;; memset(y, 0, 10);
+;; delete[] x;
+;; sleep(10);
+;; delete[] y;
+;; return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; Both foo and baz are inlined into main, at both foo callsites.
+;; We should update the graph for new fused nodes for both of those inlined
+;; callsites to bar.
+;;
+;; Note that foo and baz are both dead due to the inlining, but have been left
+;; in the input IR to ensure that the MIB call chain is matched to the longer
+;; inline sequences from main.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN: -r=%t.o,main,plx \
+; RUN: -r=%t.o,_Z3barv,plx \
+; RUN: -r=%t.o,_Z3bazv,plx \
+; RUN: -r=%t.o,_Z3foov,plx \
+; RUN: -r=%t.o,_ZdaPv, \
+; RUN: -r=%t.o,sleep, \
+; RUN: -r=%t.o,_Znam, \
+; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z3barv() { ; bar(): the only function in this test that contains an allocation call
+entry:
+ %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5 ; !0 holds one notcold and one cold context; !5 is this call's own stack id
+ ret ptr null ; NOTE(review): presumably an artifact of the llvm-reduce step; the call result is unused
+}
+
+declare ptr @_Znam(i64)
+
+declare ptr @_Z3bazv()
+
+declare ptr @_Z3foov()
+
+define i32 @main() { ; main(): two direct calls to bar, each standing for one inlined main->foo->baz chain
+delete.end5:
+ %call.i.i = call ptr @_Z3barv(), !callsite !6 ; !6 matches the notcold context !2 after its leading id (the one in !5)
+ %call.i.i8 = call ptr @_Z3barv(), !callsite !7 ; !7 matches the cold context !4 after its leading id (the one in !5)
+ ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+!0 = !{!1, !3}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!3 = !{!4, !"cold"}
+!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+!5 = !{i64 9086428284934609951}
+!6 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!7 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[BAR:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP: AllocType 1 StackIds: 0, 1, 2
+; DUMP: AllocType 2 StackIds: 0, 1, 3
+; DUMP: (clone 0)
+; DUMP: AllocTypes: NotColdCold
+; DUMP: ContextIds: 1 2
+; DUMP: CalleeEdges:
+; DUMP: CallerEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+
+;; This is the node synthesized for the first inlined call chain of main->foo->baz
+; DUMP: Node [[MAIN1]]
+; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 2 (clone 0)
+; DUMP: AllocTypes: NotCold
+; DUMP: ContextIds: 1
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1
+; DUMP: CallerEdges:
+
+;; This is the node synthesized for the second inlined call chain of main->foo->baz
+; DUMP: Node [[MAIN2]]
+; DUMP: Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 3 (clone 0)
+; DUMP: AllocTypes: Cold
+; DUMP: ContextIds: 2
+; DUMP: CalleeEdges:
+; DUMP: Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2
+; DUMP: CallerEdges:
More information about the llvm-commits
mailing list