[llvm] fe27495 - [MemProf] Context disambiguation cloning pass [patch 1b/3]

Teresa Johnson via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 22 14:58:03 PDT 2023


Author: Teresa Johnson
Date: 2023-03-22T14:57:53-07:00
New Revision: fe27495be2040007c7b20844a9371b06156ab405

URL: https://github.com/llvm/llvm-project/commit/fe27495be2040007c7b20844a9371b06156ab405
DIFF: https://github.com/llvm/llvm-project/commit/fe27495be2040007c7b20844a9371b06156ab405.diff

LOG: [MemProf] Context disambiguation cloning pass [patch 1b/3]

Adds support for building the graph in ThinLTO from MemProf summaries.

Follow-on patches will contain the support for cloning on the graph and
in the IR.

Depends on D140908.

Differential Revision: https://reviews.llvm.org/D145836

Added: 
    llvm/test/ThinLTO/X86/memprof-basic.ll
    llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
    llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll
    llvm/test/ThinLTO/X86/memprof-indirectcall.ll
    llvm/test/ThinLTO/X86/memprof-inlined.ll
    llvm/test/ThinLTO/X86/memprof-inlined2.ll

Modified: 
    llvm/include/llvm/IR/ModuleSummaryIndex.h
    llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
    llvm/lib/LTO/LTO.cpp
    llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 18853102799b4..0c178ccef3bbb 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -988,12 +988,22 @@ class FunctionSummary : public GlobalValueSummary {
     return {};
   }
 
+  CallsitesTy &mutableCallsites() {
+    assert(Callsites);
+    return *Callsites;
+  }
+
   ArrayRef<AllocInfo> allocs() const {
     if (Allocs)
       return *Allocs;
     return {};
   }
 
+  AllocsTy &mutableAllocs() {
+    assert(Allocs);
+    return *Allocs;
+  }
+
   friend struct GraphTraits<ValueInfo>;
 };
 

diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index 56e56ed67f7df..475ea48cca932 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -19,9 +19,12 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/PassManager.h"
+#include <functional>
 
 namespace llvm {
+class GlobalValueSummary;
 class Module;
+class ModuleSummaryIndex;
 
 class MemProfContextDisambiguation
     : public PassInfoMixin<MemProfContextDisambiguation> {
@@ -32,6 +35,10 @@ class MemProfContextDisambiguation
   MemProfContextDisambiguation() {}
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
+  void run(ModuleSummaryIndex &Index,
+           function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+               isPrevailing);
 };
 } // end namespace llvm
 

diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 1f273a8e5025f..ee6b8c3aa234d 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -51,6 +51,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
 #include "llvm/Transforms/Utils/SplitModule.h"
@@ -75,6 +76,9 @@ cl::opt<bool> EnableLTOInternalization(
     cl::desc("Enable global value internalization in LTO"));
 }
 
+/// Enable MemProf context disambiguation for thin link.
+extern cl::opt<bool> EnableMemProfContextDisambiguation;
+
 // Computes a unique hash for the Module considering the current list of
 // export/import and other global analysis results.
 // The hash is produced in \p Key.
@@ -1539,6 +1543,14 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
   runWholeProgramDevirtOnIndex(ThinLTO.CombinedIndex, ExportedGUIDs,
                                LocalWPDTargetsMap);
 
+  auto isPrevailing = [&](GlobalValue::GUID GUID, const GlobalValueSummary *S) {
+    return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
+  };
+  if (EnableMemProfContextDisambiguation) {
+    MemProfContextDisambiguation ContextDisambiguation;
+    ContextDisambiguation.run(ThinLTO.CombinedIndex, isPrevailing);
+  }
+
   if (Conf.OptLevel > 0)
     ComputeCrossModuleImport(ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
                              ImportLists, ExportLists);
@@ -1580,10 +1592,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
   updateIndexWPDForExports(ThinLTO.CombinedIndex, isExported,
                            LocalWPDTargetsMap);
 
-  auto isPrevailing = [&](GlobalValue::GUID GUID,
-                          const GlobalValueSummary *S) {
-    return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath();
-  };
   thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported,
                                       isPrevailing);
 

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 5a6625743eecf..b2fcea1ec8694 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -14,9 +14,9 @@
 // subsequently annotated with an attribute for later transformation.
 //
 // The transformations can be performed either directly on IR (regular LTO), or
-// (eventually) on a ThinLTO index (later applied to the IR during the ThinLTO
-// backend). Both types of LTO operate on a the same base graph representation,
-// which uses CRTP to support either IR or Index formats.
+// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
+// Both types of LTO operate on the same base graph representation, which
+// uses CRTP to support either IR or Index formats.
 //
 //===----------------------------------------------------------------------===//
 
@@ -28,9 +28,11 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/MemoryProfileInfo.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
@@ -458,6 +460,56 @@ class ModuleCallsiteContextGraph
   const Module &Mod;
 };
 
+/// Represents a call in the summary index graph, which can either be an
+/// allocation or an interior callsite node in an allocation's context.
+/// Holds a pointer to the corresponding data structure in the index.
+struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
+  IndexCall() : PointerUnion() {}
+  IndexCall(std::nullptr_t) : IndexCall() {}
+  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
+  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
+
+  IndexCall *operator->() { return this; }
+
+  void print(raw_ostream &OS) const {
+    if (auto *AI = dyn_cast<AllocInfo *>())
+      OS << *AI;
+    else {
+      auto *CI = dyn_cast<CallsiteInfo *>();
+      assert(CI);
+      OS << *CI;
+    }
+  }
+};
+
+/// CRTP derived class for graphs built from summary index (ThinLTO).
+class IndexCallsiteContextGraph
+    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                                  IndexCall> {
+public:
+  IndexCallsiteContextGraph(
+      ModuleSummaryIndex &Index,
+      function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+          isPrevailing);
+
+private:
+  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                              IndexCall>;
+
+  uint64_t getStackId(uint64_t IdOrIndex) const;
+  bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
+  uint64_t getLastStackId(IndexCall &Call);
+  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
+  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
+                       unsigned CloneNo) const;
+
+  // Saves mapping from function summaries containing memprof records back to
+  // its VI, for use in checking and debugging.
+  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
+
+  const ModuleSummaryIndex &Index;
+};
+
 namespace {
 
 struct FieldSeparator {
@@ -475,6 +527,20 @@ raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
   return OS << FS.Sep;
 }
 
+// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
+// type we should actually use on the corresponding allocation.
+// If we can't clone a node that has NotCold+Cold alloc type, we will fall
+// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
+// from NotCold.
+AllocationType allocTypeToUse(uint8_t AllocTypes) {
+  assert(AllocTypes != (uint8_t)AllocationType::None);
+  if (AllocTypes ==
+      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
+    return AllocationType::NotCold;
+  else
+    return (AllocationType)AllocTypes;
+}
+
 } // end anonymous namespace
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
@@ -1118,6 +1184,20 @@ uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
   return CallsiteContext.back();
 }
 
+uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
+  assert(Call.is<CallsiteInfo *>());
+  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
+      CallsiteContext(Call.dyn_cast<CallsiteInfo *>());
+  // Need to convert index into stack id.
+  return Index.getStackIdAtIndex(CallsiteContext.back());
+}
+
+static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
+  if (!CloneNo)
+    return Base.str();
+  return (Base + ".memprof." + Twine(CloneNo)).str();
+}
+
 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
                                                  const Instruction *Call,
                                                  unsigned CloneNo) const {
@@ -1126,6 +1206,22 @@ std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
       .str();
 }
 
+std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
+                                                const IndexCall &Call,
+                                                unsigned CloneNo) const {
+  auto VI = FSToVIMap.find(Func);
+  assert(VI != FSToVIMap.end());
+  if (Call.is<AllocInfo *>())
+    return (VI->second.name() + " -> alloc").str();
+  else {
+    auto *Callsite = Call.dyn_cast<CallsiteInfo *>();
+    return (VI->second.name() + " -> " +
+            getMemProfFuncName(Callsite->Callee.name(),
+                               Callsite->Clones[CloneNo]))
+        .str();
+  }
+}
+
 std::vector<uint64_t>
 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
     Instruction *Call) {
@@ -1135,6 +1231,16 @@ ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
       CallsiteContext);
 }
 
+std::vector<uint64_t>
+IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
+  assert(Call.is<CallsiteInfo *>());
+  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
+      CallsiteContext(Call.dyn_cast<CallsiteInfo *>());
+  return getStackIdsWithContextNodes<CallsiteInfo,
+                                     SmallVector<unsigned>::const_iterator>(
+      CallsiteContext);
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 template <class NodeT, class IteratorT>
 std::vector<uint64_t>
@@ -1207,6 +1313,84 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) {
       Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
 }
 
+IndexCallsiteContextGraph::IndexCallsiteContextGraph(
+    ModuleSummaryIndex &Index,
+    function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+        isPrevailing)
+    : Index(Index) {
+  for (auto &I : Index) {
+    auto VI = Index.getValueInfo(I);
+    for (auto &S : VI.getSummaryList()) {
+      // We should only add the prevailing nodes. Otherwise we may try to clone
+      // in a weak copy that won't be linked (and may be different than the
+      // prevailing version).
+      // We only keep the memprof summary on the prevailing copy now when
+      // building the combined index, as a space optimization, however don't
+      // rely on this optimization. The linker doesn't resolve local linkage
+      // values so don't check whether those are prevailing.
+      if (!GlobalValue::isLocalLinkage(S->linkage()) &&
+          !isPrevailing(VI.getGUID(), S.get()))
+        continue;
+      auto *FS = dyn_cast<FunctionSummary>(S.get());
+      if (!FS)
+        continue;
+      std::vector<CallInfo> CallsWithMetadata;
+      if (!FS->allocs().empty()) {
+        for (auto &AN : FS->mutableAllocs()) {
+          // This can happen because of recursion elimination handling that
+          // currently exists in ModuleSummaryAnalysis. Skip these for now.
+          // We still added them to the summary because we need to be able to
+          // correlate properly in applyImport in the backends.
+          if (AN.MIBs.empty())
+            continue;
+          CallsWithMetadata.push_back({&AN});
+          auto *AllocNode = addAllocNode({&AN}, FS);
+          // Pass an empty CallStack to the CallsiteContext (second)
+          // parameter, since for ThinLTO we already collapsed out the inlined
+          // stack ids on the allocation call during ModuleSummaryAnalysis.
+          CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
+              EmptyContext;
+          // Now add all of the MIBs and their stack nodes.
+          for (auto &MIB : AN.MIBs) {
+            CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
+                StackContext(&MIB);
+            addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
+                AllocNode, StackContext, EmptyContext, MIB.AllocType);
+          }
+          assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
+          // Initialize version 0 on the summary alloc node to the current alloc
+          // type, unless it has both types, in which case make it default, so
+          // that when we aren't able to clone, the original version always
+          // ends up with the default allocation behavior.
+          AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
+        }
+      }
+      // For callsite metadata, add to list for this function for later use.
+      if (!FS->callsites().empty())
+        for (auto &SN : FS->mutableCallsites())
+          CallsWithMetadata.push_back({&SN});
+
+      if (!CallsWithMetadata.empty())
+        FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata});
+
+      if (!FS->allocs().empty() || !FS->callsites().empty())
+        FSToVIMap[FS] = VI;
+    }
+  }
+
+  if (DumpCCG) {
+    dbgs() << "CCG before updating call stack chains:\n";
+    dbgs() << *this;
+  }
+
+  if (ExportToDot)
+    exportToDot("prestackupdate");
+
+  updateStackNodes();
+
+  handleCallsitesWithMultipleTargets();
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy,
                           CallTy>::handleCallsitesWithMultipleTargets() {
@@ -1251,6 +1435,12 @@ uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
   return IdOrIndex;
 }
 
+uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
+  // In the Index case this is an index into the stack id list in the summary
+  // index, convert it to an Id.
+  return Index.getStackIdAtIndex(IdOrIndex);
+}
+
 bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call,
                                                    const Function *Func) {
   auto *CB = dyn_cast<CallBase>(Call);
@@ -1264,6 +1454,23 @@ bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call,
   return Alias && Alias->getAliasee() == Func;
 }
 
+bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call,
+                                                  const FunctionSummary *Func) {
+  ValueInfo Callee = Call.dyn_cast<CallsiteInfo *>()->Callee;
+  // If there is no summary list then this is a call to an externally defined
+  // symbol.
+  AliasSummary *Alias =
+      Callee.getSummaryList().empty()
+          ? nullptr
+          : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
+  assert(FSToVIMap.count(Func));
+  return Callee == FSToVIMap[Func] ||
+         // If callee is an alias, check the aliasee, since only function
+         // summary base objects will contain the stack node summaries and thus
+         // get a context node.
+         (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]);
+}
+
 static std::string getAllocTypeString(uint8_t AllocTypes) {
   if (!AllocTypes)
     return "None";
@@ -1581,3 +1788,11 @@ PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
+
+void MemProfContextDisambiguation::run(
+    ModuleSummaryIndex &Index,
+    function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
+        isPrevailing) {
+  IndexCallsiteContextGraph CCG(Index, isPrevailing);
+  CCG.process();
+}

diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
new file mode 100644
index 0000000000000..d8c78d270f277
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -0,0 +1,157 @@
+;; Test callsite context graph generation for simple call graph with
+;; two memprof contexts and no inlining.
+;;
+;; Original code looks like:
+;;
+;; char *bar() {
+;;   return new char[10];
+;; }
+;;
+;; char *baz() {
+;;   return bar();
+;; }
+;;
+;; char *foo() {
+;;   return baz();
+;; }
+;;
+;; int main(int argc, char **argv) {
+;;   char *x = foo();
+;;   char *y = foo();
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   delete[] x;
+;;   sleep(10);
+;;   delete[] y;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:	-r=%t.o,main,plx \
+; RUN:	-r=%t.o,_ZdaPv, \
+; RUN:	-r=%t.o,sleep, \
+; RUN:	-r=%t.o,_Znam, \
+; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+
+
+source_filename = "memprof-basic.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() {
+entry:
+  %call = call ptr @_Z3foov(), !callsite !0
+  %call1 = call ptr @_Z3foov(), !callsite !1
+  ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+define internal ptr @_Z3barv() {
+entry:
+  %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7
+  ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define internal ptr @_Z3bazv() {
+entry:
+  %call = call ptr @_Z3barv(), !callsite !8
+  ret ptr null
+}
+
+define internal ptr @_Z3foov() {
+entry:
+  %call = call ptr @_Z3bazv(), !callsite !9
+  ret ptr null
+}
+
+; uselistorder directives
+uselistorder ptr @_Z3foov, { 1, 0 }
+
+!0 = !{i64 8632435727821051414}
+!1 = !{i64 -3421689549917153178}
+!2 = !{!3, !5}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!5 = !{!6, !"cold"}
+!6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+!7 = !{i64 9086428284934609951}
+!8 = !{i64 -5964873800580613432}
+!9 = !{i64 2732490490862098848}
+
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[BAR:0x[a-z0-9]+]]
+; DUMP: 	Versions: 1 MIB:
+; DUMP: 		AllocType 1 StackIds: 2, 3, 0
+; DUMP: 		AllocType 2 StackIds: 2, 3, 1
+; DUMP: 	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[BAZ:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+
+; DUMP: Node [[BAZ]]
+; DUMP: 	Callee: 9832687305761716512 (_Z3barv) Clones: 0 StackIds: 2	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[BAZ]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[BAZ]] to Caller: [[FOO:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+
+; DUMP: Node [[FOO]]
+; DUMP: 	Callee: 5878270615442837395 (_Z3bazv) Clones: 0 StackIds: 3	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAZ]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+
+; DUMP: Node [[MAIN1]]
+; DUMP: 	Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 0	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 1
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[MAIN2]]
+; DUMP: 	Callee: 6731117468105397038 (_Z3foov) Clones: 0 StackIds: 1	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2
+; DUMP: 	CallerEdges:
+
+
+; DOT: digraph "postbuild" {
+; DOT: 	label="postbuild";
+; DOT: 	Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
+; DOT: 	Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 12481870273128938184\n_Z3bazv -\> _Z3barv}"];
+; DOT: 	Node[[BAZ]] -> Node[[BAR]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: 	Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\n_Z3foov -\> _Z3bazv}"];
+; DOT: 	Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: 	Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"];
+; DOT: 	Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"];
+; DOT: 	Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"];
+; DOT: 	Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"];
+; DOT: }

diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
new file mode 100644
index 0000000000000..772b319e0715e
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
@@ -0,0 +1,229 @@
+;; Test callsite context graph generation for call graph with MIBs
+;; that have pruned contexts that partially match multiple inlined
+;; callsite contexts, requiring duplication of context ids and nodes
+;; while matching callsite nodes onto the graph.
+;;
+;; Original code looks like:
+;;
+;; char *D() {
+;;   return new char[10];
+;; }
+;;
+;; char *F() {
+;;   return D();
+;; }
+;;
+;; char *C() {
+;;   return D();
+;; }
+;;
+;; char *B() {
+;;   return C();
+;; }
+;;
+;; char *E() {
+;;   return C();
+;; }
+;; int main(int argc, char **argv) {
+;;   char *x = B(); // cold
+;;   char *y = E(); // cold
+;;   char *z = F(); // default
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   memset(z, 0, 10);
+;;   delete[] z;
+;;   sleep(10);
+;;   delete[] x;
+;;   delete[] y;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The code below was created by forcing inlining of C into both B and E.
+;; Since both allocation contexts via C are cold, the matched memprof
+;; metadata has the context pruned above C's callsite. This requires
+;; matching the stack node for C to callsites where it was inlined (i.e.
+;; the callsites in B and E that have callsite metadata that includes C's).
+;; It also requires duplication of that node in the graph as well as the
+;; duplication of the context ids along that path through the graph,
+;; so that we can represent the duplicated (via inlining) C callsite.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; RUN:  cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
+; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
+
+
+source_filename = "duplicate-context-ids.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal ptr @_Z1Dv() {
+entry:
+  %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
+  ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define internal ptr @_Z1Fv() {
+entry:
+  %call = call ptr @_Z1Dv(), !callsite !6
+  ret ptr null
+}
+
+define internal ptr @_Z1Cv() {
+entry:
+  %call = call ptr @_Z1Dv(), !callsite !7
+  ret ptr null
+}
+
+define internal ptr @_Z1Bv() {
+entry:
+  %call.i = call ptr @_Z1Dv(), !callsite !8
+  ret ptr null
+}
+
+define internal ptr @_Z1Ev() {
+entry:
+  %call.i = call ptr @_Z1Dv(), !callsite !9
+  ret ptr null
+}
+
+declare i32 @main()
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+!0 = !{!1, !3}
+!1 = !{!2, !"cold"}
+!2 = !{i64 6541423618768552252, i64 -6270142974039008131}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 6541423618768552252, i64 -4903163940066524832}
+!5 = !{i64 6541423618768552252}
+!6 = !{i64 -4903163940066524832}
+!7 = !{i64 -6270142974039008131}
+!8 = !{i64 -6270142974039008131, i64 -184525619819294889}
+!9 = !{i64 -6270142974039008131, i64 1905834578520680781}
+
+
+;; After adding only the alloc node memprof metadata, we only have 2 contexts.
+
+; DUMP: CCG before updating call stack chains:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[D:0x[a-z0-9]+]]
+; DUMP: 	Versions: 1 MIB:
+; DUMP: 		AllocType 2 StackIds: 0
+; DUMP: 		AllocType 1 StackIds: 1
+; DUMP: 	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[D]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1
+; DUMP: 		Edge from Callee [[D]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
+
+; DUMP: Node [[C]]
+; DUMP: 	null Call
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 1
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[D]] to Caller: [[C]] AllocTypes: Cold ContextIds: 1
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[F]]
+; DUMP: 	null Call
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2
+; DUMP: 	CallerEdges:
+
+;; After updating for callsite metadata, we should have generated context ids 3 and 4,
+;; along with 2 new nodes for those callsites. All have the same allocation type
+;; behavior as the original C node.
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[D]]
+; DUMP: 	Versions: 1 MIB:
+; DUMP: 		AllocType 2 StackIds: 0
+; DUMP: 		AllocType 1 StackIds: 1
+; DUMP: 	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2 3 4
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2
+; DUMP: 		Edge from Callee [[D]] to Caller: [[C2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 3
+; DUMP: 		Edge from Callee [[D]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
+; DUMP: 		Edge from Callee [[D]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 1
+
+; DUMP: Node [[F]]
+; DUMP: 	Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 1	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[D]] to Caller: [[F]] AllocTypes: NotCold ContextIds: 2
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[C2]]
+; DUMP: 	Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 3
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[D]] to Caller: [[C2]] AllocTypes: Cold ContextIds: 3
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[B]]
+; DUMP: 	Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 2	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 4
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[D]] to Caller: [[B]] AllocTypes: Cold ContextIds: 4
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[E]]
+; DUMP: 	Callee: 4881081444663423788 (_Z1Dv) Clones: 0 StackIds: 0, 3	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 1
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[D]] to Caller: [[E]] AllocTypes: Cold ContextIds: 1
+; DUMP: 	CallerEdges:
+
+
+; DOTPRE: digraph "prestackupdate" {
+; DOTPRE: 	label="prestackupdate";
+; DOTPRE: 	Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
+; DOTPRE: 	Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12176601099670543485\nnull call (external)}"];
+; DOTPRE: 	Node[[C]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"];
+; DOTPRE: 	Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\nnull call (external)}"];
+; DOTPRE: 	Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"];
+; DOTPRE: }
+
+
+; DOTPOST:digraph "postbuild" {
+; DOTPOST:	label="postbuild";
+; DOTPOST:	Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
+; DOTPOST:	Node[[F:0x[a-z0-9]+]] [shape=record,tooltip="N[[F]] ContextIds: 2",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 13543580133643026784\n_Z1Fv -\> _Z1Dv}"];
+; DOTPOST:	Node[[F]] -> Node[[D]][tooltip="ContextIds: 2",fillcolor="brown1"];
+; DOTPOST:	Node[[C:0x[a-z0-9]+]] [shape=record,tooltip="N[[C]] ContextIds: 3",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Cv -\> _Z1Dv}"];
+; DOTPOST:	Node[[C]] -> Node[[D]][tooltip="ContextIds: 3",fillcolor="cyan"];
+; DOTPOST:	Node[[B:0x[a-z0-9]+]] [shape=record,tooltip="N[[B]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Bv -\> _Z1Dv}"];
+; DOTPOST:	Node[[B]] -> Node[[D]][tooltip="ContextIds: 4",fillcolor="cyan"];
+; DOTPOST:	Node[[E:0x[a-z0-9]+]] [shape=record,tooltip="N[[E]] ContextIds: 1",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 0\n_Z1Ev -\> _Z1Dv}"];
+; DOTPOST:	Node[[E]] -> Node[[D]][tooltip="ContextIds: 1",fillcolor="cyan"];
+; DOTPOST:}

diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll
new file mode 100644
index 0000000000000..af7dece9421a9
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids2.ll
@@ -0,0 +1,390 @@
+;; Test callsite context graph generation for call graph with MIBs
+;; that have pruned contexts that partially match multiple inlined
+;; callsite contexts, requiring duplication of context ids and nodes
+;; while matching callsite nodes onto the graph. This test requires more
+;; complex duplication due to multiple contexts for different allocations
+;; that share some of the same callsite nodes.
+;;
+;; Original code looks like:
+;;
+;; char *D(bool Call1) {
+;;   if (Call1)
+;;     return new char[10];
+;;   else
+;;     return new char[10];
+;; }
+;;
+;; char *C(bool Call1) {
+;;   return D(Call1);
+;; }
+;;
+;; char *B(bool Call1) {
+;;   if (Call1)
+;;     return C(true);
+;;   else
+;;     return C(false);
+;; }
+;;
+;; char *A(bool Call1) {
+;;   return B(Call1);
+;; }
+;;
+;; char *A1() {
+;;   return A(true);
+;; }
+;;
+;; char *A2() {
+;;   return A(true);
+;; }
+;;
+;; char *A3() {
+;;   return A(false);
+;; }
+;;
+;; char *A4() {
+;;   return A(false);
+;; }
+;;
+;; char *E() {
+;;   return B(true);
+;; }
+;;
+;; char *F() {
+;;   return B(false);
+;; }
+;;
+;; int main(int argc, char **argv) {
+;;   char *a1 = A1(); // cold
+;;   char *a2 = A2(); // cold
+;;   char *e = E(); // default
+;;   char *a3 = A3(); // default
+;;   char *a4 = A4(); // default
+;;   char *f = F(); // cold
+;;   memset(a1, 0, 10);
+;;   memset(a2, 0, 10);
+;;   memset(e, 0, 10);
+;;   memset(a3, 0, 10);
+;;   memset(a4, 0, 10);
+;;   memset(f, 0, 10);
+;;   delete[] a3;
+;;   delete[] a4;
+;;   delete[] e;
+;;   sleep(10);
+;;   delete[] a1;
+;;   delete[] a2;
+;;   delete[] f;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The code below was created by forcing inlining of A into its callers,
+;; without any other inlining or optimizations. Since both allocation contexts
+;; via A for each allocation in D have the same allocation type (cold via
+;; A1 and A2 for the first new in D, and non-cold via A3 and A4 for the second
+;; new in D), the contexts for those respective allocations are pruned above A.
+;; The allocations via E and F are to ensure we don't prune above B.
+;;
+;; The matching onto the inlined A[1234]->A sequences will require duplication
+;; of the context id assigned to the context from A for each allocation in D.
+;; This test ensures that we do this correctly in the presence of callsites
+;; shared by the different duplicated context ids (i.e. callsite in C).
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Z1Db,plx \
+; RUN:  -r=%t.o,_Z1Cb,plx \
+; RUN:  -r=%t.o,_Z1Bb,plx \
+; RUN:  -r=%t.o,_Z1Ab,plx \
+; RUN:  -r=%t.o,_Z2A1v,plx \
+; RUN:  -r=%t.o,_Z2A2v,plx \
+; RUN:  -r=%t.o,_Z2A3v,plx \
+; RUN:  -r=%t.o,_Z2A4v,plx \
+; RUN:  -r=%t.o,_Z1Ev,plx \
+; RUN:  -r=%t.o,_Z1Fv,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z1Db(i1 %Call1) {
+entry:
+  %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
+  br label %return
+
+if.else:                                          ; No predecessors!
+  %call1 = call ptr @_Znam(i64 0), !memprof !6, !callsite !11
+  br label %return
+
+return:                                           ; preds = %if.else, %entry
+  ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define ptr @_Z1Cb(i1 %Call1) {
+entry:
+  %call = call ptr @_Z1Db(i1 false), !callsite !12
+  ret ptr null
+}
+
+define ptr @_Z1Bb(i1 %Call1) {
+entry:
+  %call = call ptr @_Z1Cb(i1 false), !callsite !13
+  br label %return
+
+if.else:                                          ; No predecessors!
+  %call1 = call ptr @_Z1Cb(i1 false), !callsite !14
+  br label %return
+
+return:                                           ; preds = %if.else, %entry
+  ret ptr null
+}
+
+define ptr @_Z1Ab() {
+entry:
+  %call = call ptr @_Z1Bb(i1 false), !callsite !15
+  ret ptr null
+}
+
+define ptr @_Z2A1v() {
+entry:
+  %call.i = call ptr @_Z1Bb(i1 false), !callsite !16
+  ret ptr null
+}
+
+define ptr @_Z2A2v() {
+entry:
+  %call.i = call ptr @_Z1Bb(i1 false), !callsite !17
+  ret ptr null
+}
+
+define ptr @_Z2A3v() {
+entry:
+  %call.i = call ptr @_Z1Bb(i1 false), !callsite !18
+  ret ptr null
+}
+
+define ptr @_Z2A4v() {
+entry:
+  %call.i = call ptr @_Z1Bb(i1 false), !callsite !19
+  ret ptr null
+}
+
+define ptr @_Z1Ev() {
+entry:
+  %call = call ptr @_Z1Bb(i1 false), !callsite !20
+  ret ptr null
+}
+
+define ptr @_Z1Fv() {
+entry:
+  %call = call ptr @_Z1Bb(i1 false), !callsite !21
+  ret ptr null
+}
+
+declare i32 @main()
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0 }
+
+!0 = !{!1, !3}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 1905834578520680781}
+!3 = !{!4, !"cold"}
+!4 = !{i64 4854880825882961848, i64 -904694911315397047, i64 6532298921261778285, i64 -6528110295079665978}
+!5 = !{i64 4854880825882961848}
+!6 = !{!7, !9}
+!7 = !{!8, !"notcold"}
+!8 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -6528110295079665978}
+!9 = !{!10, !"cold"}
+!10 = !{i64 -8775068539491628272, i64 -904694911315397047, i64 7859682663773658275, i64 -4903163940066524832}
+!11 = !{i64 -8775068539491628272}
+!12 = !{i64 -904694911315397047}
+!13 = !{i64 6532298921261778285}
+!14 = !{i64 7859682663773658275}
+!15 = !{i64 -6528110295079665978}
+!16 = !{i64 -6528110295079665978, i64 5747919905719679568}
+!17 = !{i64 -6528110295079665978, i64 -5753238080028016843}
+!18 = !{i64 -6528110295079665978, i64 1794685869326395337}
+!19 = !{i64 -6528110295079665978, i64 5462047985461644151}
+!20 = !{i64 1905834578520680781}
+!21 = !{i64 -4903163940066524832}
+
+
+;; After adding only the alloc node memprof metadata, we only have 4 contexts (we only
+;; match the interesting parts of the pre-update graph here).
+
+; DUMP: CCG before updating call stack chains:
+; DUMP: Callsite Context Graph:
+
+; DUMP: Node [[D1:0x[a-z0-9]+]]
+; DUMP: Versions: 1 MIB:
+; DUMP:                 AllocType 1 StackIds: 0, 1, 2
+; DUMP:                 AllocType 2 StackIds: 0, 1, 3
+; DUMP:         (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+
+; DUMP: Node [[C:0x[a-z0-9]+]]
+; DUMP:         null Call
+; DUMP:         AllocTypes: NotColdCold
+; DUMP:         ContextIds: 1 2 3 4
+; DUMP:         CalleeEdges:
+; DUMP:                 Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP:                 Edge from Callee [[D2:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4
+
+; DUMP: Node [[D2]]
+; DUMP: Versions: 1 MIB:
+; DUMP:                 AllocType 1 StackIds: 0, 4, 3
+; DUMP:                 AllocType 2 StackIds: 0, 4, 5
+; DUMP:         (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 3 4
+
+
+;; After updating for callsite metadata, we should have duplicated the context
+;; ids coming from node A (2 and 3) 4 times, for the 4 different callers of A,
+;; and used those on new nodes for those callers. Note that while in reality
+;; we only have cold edges coming from A1 and A2 and noncold from A3 and A4,
+;; due to the pruning we have lost this information and thus end up duplicating
+;; both of A's contexts to all of the new nodes (which could result in some
+;; unnecessary cloning).
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[D1]]
+; DUMP: Versions: 1 MIB:
+; DUMP:                 AllocType 1 StackIds: 0, 1, 2
+; DUMP:                 AllocType 2 StackIds: 0, 1, 3
+; DUMP:         (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2 5 7 9 11
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11
+
+; DUMP: Node [[C]]
+; DUMP: 	Callee: 11485875876353461977 (_Z1Db) Clones: 0 StackIds: 0      (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2 3 4 5 6 7 8 9 10 11 12
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[D1]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11
+; DUMP: 		Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[C]] to Caller: [[B1:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11
+; DUMP: 		Edge from Callee [[C]] to Caller: [[B2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12
+
+; DUMP: Node [[B1]]
+; DUMP: 	Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 1      (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2 5 7 9 11
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[C]] to Caller: [[B1]] AllocTypes: NotColdCold ContextIds: 1 2 5 7 9 11
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[E:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 5
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A3:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 7
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A1:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 9
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 11
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+
+; DUMP: Node [[E]]
+; DUMP: 	Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 2       (clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 1
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[E]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[D2]]
+; DUMP: Versions: 1 MIB:
+; DUMP:                 AllocType 1 StackIds: 0, 4, 3
+; DUMP:                 AllocType 2 StackIds: 0, 4, 5
+; DUMP:         (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 3 4 6 8 10 12
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[D2]] to Caller: [[C]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12
+
+; DUMP: Node [[B2]]
+; DUMP: 	Callee: 15062806102884567440 (_Z1Cb) Clones: 0 StackIds: 4      (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 3 4 6 8 10 12
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[C]] to Caller: [[B2]] AllocTypes: NotColdCold ContextIds: 3 4 6 8 10 12
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[F:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3
+
+; DUMP: Node [[F]]
+; DUMP: 	Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 5       (clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 4
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[F]] AllocTypes: Cold ContextIds: 4
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[A2]]
+; DUMP: 	Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 7	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 5 6
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A2]] AllocTypes: Cold ContextIds: 5
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A2]] AllocTypes: NotCold ContextIds: 6
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[A3]]
+; DUMP: 	Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 8    (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 7 8
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A3]] AllocTypes: Cold ContextIds: 7
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A3]] AllocTypes: NotCold ContextIds: 8
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[A1]]
+; DUMP: 	Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 9 10
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A1]] AllocTypes: Cold ContextIds: 9
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A1]] AllocTypes: NotCold ContextIds: 10
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[A4]]
+; DUMP: 	Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 9    (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 11 12
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A4]] AllocTypes: Cold ContextIds: 11
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A4]] AllocTypes: NotCold ContextIds: 12
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[A]]
+; DUMP: 	Callee: 9116113196563097487 (_Z1Bb) Clones: 0 StackIds: 3, 6    (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 2 3
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[B1]] to Caller: [[A]] AllocTypes: Cold ContextIds: 2
+; DUMP: 		Edge from Callee [[B2]] to Caller: [[A]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 	CallerEdges:

diff  --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
new file mode 100644
index 0000000000000..30c8bd27f37b7
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
@@ -0,0 +1,266 @@
+;; Tests callsite context graph generation for call graph containing indirect
+;; calls. Currently this should result in conservative behavior, such that the
+;; indirect call receives a null call in its graph node, to prevent subsequent
+;; cloning.
+;;
+;; Original code looks like:
+;;
+;; char *foo() {
+;;   return new char[10];
+;; }
+;; class A {
+;; public:
+;;     virtual char *x() { return foo(); }
+;; };
+;; class B : public A {
+;; public:
+;;     char *x() final { return foo(); }
+;; };
+;; char *bar(A *a) {
+;;   return a->x();
+;; }
+;; int main(int argc, char **argv) {
+;;   char *x = foo();
+;;   char *y = foo();
+;;   B b;
+;;   char *z = bar(&b);
+;;   char *w = bar(&b);
+;;   A a;
+;;   char *r = bar(&a);
+;;   char *s = bar(&a);
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   memset(z, 0, 10);
+;;   memset(w, 0, 10);
+;;   memset(r, 0, 10);
+;;   memset(s, 0, 10);
+;;   delete[] x;
+;;   delete[] w;
+;;   delete[] r;
+;;   sleep(10);
+;;   delete[] y;
+;;   delete[] z;
+;;   delete[] s;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; Compiled without optimization to prevent inlining and devirtualization.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \
+; RUN:  -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+
+
+source_filename = "indirectcall.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr
+@_ZTVN10__cxxabiv117__class_type_infoE = external global ptr
+
+define internal ptr @_Z3barP1A(ptr %a) {
+entry:
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = call ptr @_Z3foov(), !callsite !0
+  %call1 = call ptr @_Z3foov(), !callsite !1
+  %call2 = call ptr @_Z3barP1A(ptr null), !callsite !2
+  %call3 = call ptr @_Z3barP1A(ptr null), !callsite !3
+  %call4 = call ptr @_Z3barP1A(ptr null), !callsite !4
+  %call5 = call ptr @_Z3barP1A(ptr null), !callsite !5
+  ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+define internal ptr @_ZN1A1xEv() {
+entry:
+  %call = call ptr @_Z3foov(), !callsite !6
+  ret ptr null
+}
+
+define internal ptr @_ZN1B1xEv() {
+entry:
+  %call = call ptr @_Z3foov(), !callsite !7
+  ret ptr null
+}
+
+define internal ptr @_Z3foov() {
+entry:
+  %call = call ptr @_Znam(i64 0), !memprof !8, !callsite !21
+  ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+; uselistorder directives
+uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
+
+!0 = !{i64 8632435727821051414}
+!1 = !{i64 -3421689549917153178}
+!2 = !{i64 6792096022461663180}
+!3 = !{i64 -2709642582978494015}
+!4 = !{i64 748269490701775343}
+!5 = !{i64 -5747251260480066785}
+!6 = !{i64 8256774051149711748}
+!7 = !{i64 -4831879094954754638}
+!8 = !{!9, !11, !13, !15, !17, !19}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 748269490701775343}
+!11 = !{!12, !"cold"}
+!12 = !{i64 2732490490862098848, i64 8256774051149711748, i64 -4820244510750103755, i64 -5747251260480066785}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 2732490490862098848, i64 8632435727821051414}
+!15 = !{!16, !"cold"}
+!16 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 6792096022461663180}
+!17 = !{!18, !"notcold"}
+!18 = !{i64 2732490490862098848, i64 -4831879094954754638, i64 -4820244510750103755, i64 -2709642582978494015}
+!19 = !{!20, !"cold"}
+!20 = !{i64 2732490490862098848, i64 -3421689549917153178}
+!21 = !{i64 2732490490862098848}
+
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[FOO:0x[a-z0-9]+]]
+; DUMP: 	Versions: 1 MIB:
+; DUMP: 		AllocType 1 StackIds: 6, 8, 4
+; DUMP: 		AllocType 2 StackIds: 6, 8, 5
+; DUMP: 		AllocType 1 StackIds: 0
+; DUMP: 		AllocType 2 StackIds: 7, 8, 2
+; DUMP: 		AllocType 1 StackIds: 7, 8, 3
+; DUMP: 		AllocType 2 StackIds: 1
+; DUMP: 	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2 3 4 5 6
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[AX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[BX:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 4 5
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 6
+
+; DUMP: Node [[AX]]
+; DUMP: 	Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 6	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[AX]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[AX]] to Caller: [[BAR:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+
+;; Bar contains an indirect call, with multiple targets. Its call should be null.
+; DUMP: Node [[BAR]]
+; DUMP: 	null Call
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2 4 5
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[AX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: 		Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN3:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN4:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN5:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN6:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 5
+
+; DUMP: Node [[MAIN3]]
+; DUMP: 	Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 4	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 1
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN3]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[MAIN4]]
+; DUMP: 	Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 5	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN4]] AllocTypes: Cold ContextIds: 2
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[MAIN1]]
+; DUMP: 	Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 0	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 3
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[BX]]
+; DUMP: 	Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 7	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 4 5
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[BX]] AllocTypes: NotColdCold ContextIds: 4 5
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[BX]] to Caller: [[BAR]] AllocTypes: NotColdCold ContextIds: 4 5
+
+; DUMP: Node [[MAIN5]]
+; DUMP: 	Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 2	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 4
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN5]] AllocTypes: Cold ContextIds: 4
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[MAIN6]]
+; DUMP: 	Callee: 4095956691517954349 (_Z3barP1A) Clones: 0 StackIds: 3	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 5
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN6]] AllocTypes: NotCold ContextIds: 5
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[MAIN2]]
+; DUMP: 	Callee: 12914368124089294956 (_Z3foov) Clones: 0 StackIds: 1	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 6
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 6
+; DUMP: 	CallerEdges:
+
+
+; DOT: digraph "postbuild" {
+; DOT: 	label="postbuild";
+; DOT: 	Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"];
+; DOT: 	Node[[AX:0x[a-z0-9]+]] [shape=record,tooltip="N[[AX]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 8256774051149711748\n_ZN1A1xEv -\> _Z3foov}"];
+; DOT: 	Node[[AX]] -> Node[[FOO]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: 	Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13626499562959447861\nnull call (external)}"];
+; DOT: 	Node[[BAR]] -> Node[[AX]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: 	Node[[BAR]] -> Node[[BX:0x[a-z0-9]+]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"];
+; DOT: 	Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 748269490701775343\nmain -\> _Z3barP1A}"];
+; DOT: 	Node[[MAIN1]] -> Node[[BAR]][tooltip="ContextIds: 1",fillcolor="brown1"];
+; DOT: 	Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 12699492813229484831\nmain -\> _Z3barP1A}"];
+; DOT: 	Node[[MAIN2]] -> Node[[BAR]][tooltip="ContextIds: 2",fillcolor="cyan"];
+; DOT: 	Node[[MAIN3:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN3]] ContextIds: 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"];
+; DOT: 	Node[[MAIN3]] -> Node[[FOO]][tooltip="ContextIds: 3",fillcolor="brown1"];
+; DOT: 	Node[[BX]] [shape=record,tooltip="N[[BX]] ContextIds: 4 5",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 13614864978754796978\n_ZN1B1xEv -\> _Z3foov}"];
+; DOT: 	Node[[BX]] -> Node[[FOO]][tooltip="ContextIds: 4 5",fillcolor="mediumorchid1"];
+; DOT: 	Node[[MAIN4:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN4]] ContextIds: 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 6792096022461663180\nmain -\> _Z3barP1A}"];
+; DOT: 	Node[[MAIN4]] -> Node[[BAR]][tooltip="ContextIds: 4",fillcolor="cyan"];
+; DOT: 	Node[[MAIN5:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN5]] ContextIds: 5",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 15737101490731057601\nmain -\> _Z3barP1A}"];
+; DOT: 	Node[[MAIN5]] -> Node[[BAR]][tooltip="ContextIds: 5",fillcolor="brown1"];
+; DOT: 	Node[[MAIN6:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN6]] ContextIds: 6",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"];
+; DOT: 	Node[[MAIN6]] -> Node[[FOO]][tooltip="ContextIds: 6",fillcolor="cyan"];
+; DOT: }

diff  --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll
new file mode 100644
index 0000000000000..89cd878e99fb4
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll
@@ -0,0 +1,186 @@
+;; Test callsite context graph generation for call graph with two memprof
+;; contexts and partial inlining, requiring generation of a new fused node to
+;; represent the inlined sequence while matching callsite nodes onto the graph.
+;;
+;; Original code looks like:
+;;
+;; char *bar() {
+;;   return new char[10];
+;; }
+;;
+;; char *baz() {
+;;   return bar();
+;; }
+;;
+;; char *foo() {
+;;   return baz();
+;; }
+;;
+;; int main(int argc, char **argv) {
+;;   char *x = foo();
+;;   char *y = foo();
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   delete[] x;
+;;   sleep(10);
+;;   delete[] y;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The code below was created by forcing inlining of baz into foo, and
+;; bar into baz. Due to the inlining of bar we will initially have two
+;; allocation nodes in the graph. This tests that we correctly match
+;; foo (with baz inlined) onto the graph nodes first, and generate a new
+;; fused node for it. We should then not match baz (with bar inlined) as that
+;; is not reached by the MIB contexts (since all calls from main will look
+;; like main -> foo(+baz) -> bar after the inlining reflected in this IR).
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:	-r=%t.o,main,plx \
+; RUN:	-r=%t.o,_ZdaPv, \
+; RUN:	-r=%t.o,sleep, \
+; RUN:	-r=%t.o,_Znam, \
+; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+
+
+source_filename = "inlined.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal ptr @_Z3barv() {
+entry:
+  %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
+  ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+define internal ptr @_Z3bazv() {
+entry:
+  %call.i = call ptr @_Znam(i64 0), !memprof !0, !callsite !6
+  ret ptr null
+}
+
+define internal ptr @_Z3foov() {
+entry:
+  %call.i = call ptr @_Z3barv(), !callsite !7
+  ret ptr null
+}
+
+define i32 @main() {
+entry:
+  %call = call ptr @_Z3foov(), !callsite !8
+  %call1 = call ptr @_Z3foov(), !callsite !9
+  ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+!0 = !{!1, !3}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!3 = !{!4, !"cold"}
+!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+!5 = !{i64 9086428284934609951}
+!6 = !{i64 9086428284934609951, i64 -5964873800580613432}
+!7 = !{i64 -5964873800580613432, i64 2732490490862098848}
+!8 = !{i64 8632435727821051414}
+!9 = !{i64 -3421689549917153178}
+
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+
+; DUMP: Node [[BAZ:0x[a-z0-9]+]]
+; DUMP: 	Versions: 1 MIB:
+; DUMP: 		AllocType 1 StackIds: 1, 2
+; DUMP: 		AllocType 2 StackIds: 1, 3
+; DUMP: 	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[BAZ]] to Caller: [[FOO2:0x[a-z0-9]+]] AllocTypes: NotColdCold ContextIds: 1 2
+
+;; This is leftover from the MIB on the alloc inlined into baz. It is not
+;; matched with any call, since there is no such node in the IR. Due to the
+;; null call it will not participate in any context transformations.
+; DUMP: Node [[FOO2]]
+; DUMP: 	null Call
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAZ]] to Caller: [[FOO2]] AllocTypes: NotColdCold ContextIds: 1 2
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[FOO2]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 		Edge from Callee [[FOO2]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+
+; DUMP: Node [[MAIN1]]
+; DUMP: 	Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 2	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 1 3
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[FOO2]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 		Edge from Callee [[FOO:0x[a-z0-9]+]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[MAIN2]]
+; DUMP: 	Callee: 2229562716906371625 (_Z3foov) Clones: 0 StackIds: 3	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 2 4
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[FOO2]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[BAR:0x[a-z0-9]+]]
+; DUMP: 	Versions: 1 MIB:
+; DUMP: 		AllocType 1 StackIds: 0, 1, 2
+; DUMP: 		AllocType 2 StackIds: 0, 1, 3
+; DUMP: 	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 3 4
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4
+
+;; This is the node synthesized for the call to bar in foo that was created
+;; by inlining baz into foo.
+; DUMP: Node [[FOO]]
+; DUMP: 	Callee: 16064618363798697104 (_Z3barv) Clones: 0 StackIds: 0, 1	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 3 4
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[FOO]] AllocTypes: NotColdCold ContextIds: 3 4
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 		Edge from Callee [[FOO]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 4
+
+
+; DOT: digraph "postbuild" {
+; DOT: 	label="postbuild";
+; DOT: 	Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"];
+; DOT: 	Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 2732490490862098848\nnull call (external)}"];
+; DOT: 	Node[[FOO]] -> Node[[BAZ]][tooltip="ContextIds: 1 2",fillcolor="mediumorchid1"];
+; DOT: 	Node[[MAIN1:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN1]] ContextIds: 1 3",fillcolor="brown1",style="filled",style="filled",label="{OrigId: 8632435727821051414\nmain -\> _Z3foov}"];
+; DOT: 	Node[[MAIN1]] -> Node[[FOO]][tooltip="ContextIds: 1",fillcolor="brown1"];
+; DOT: 	Node[[MAIN1]] -> Node[[FOO2:0x[a-z0-9]+]][tooltip="ContextIds: 3",fillcolor="brown1"];
+; DOT: 	Node[[MAIN2:0x[a-z0-9]+]] [shape=record,tooltip="N[[MAIN2]] ContextIds: 2 4",fillcolor="cyan",style="filled",style="filled",label="{OrigId: 15025054523792398438\nmain -\> _Z3foov}"];
+; DOT: 	Node[[MAIN2]] -> Node[[FOO]][tooltip="ContextIds: 2",fillcolor="cyan"];
+; DOT: 	Node[[MAIN2]] -> Node[[FOO2]][tooltip="ContextIds: 4",fillcolor="cyan"];
+; DOT: 	Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc2\n_Z3barv -\> alloc}"];
+; DOT: 	Node[[FOO2]] [shape=record,tooltip="N[[FOO2]] ContextIds: 3 4",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: 0\n_Z3foov -\> _Z3barv}"];
+; DOT: 	Node[[FOO2]] -> Node[[BAR]][tooltip="ContextIds: 3 4",fillcolor="mediumorchid1"];
+; DOT: }

diff  --git a/llvm/test/ThinLTO/X86/memprof-inlined2.ll b/llvm/test/ThinLTO/X86/memprof-inlined2.ll
new file mode 100644
index 0000000000000..1ffae8cd59cef
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-inlined2.ll
@@ -0,0 +1,124 @@
+;; Test callsite context graph generation for call graph with two memprof
+;; contexts and multiple levels of inlining, requiring generation of new
+;; fused nodes to represent the inlined sequence while matching callsite
+;; nodes onto the graph. In particular this tests the case where a function
+;; has inlined a callee containing an inlined callee.
+;;
+;; Original code looks like:
+;;
+;; char *bar() __attribute__((noinline)) {
+;;   return new char[10];
+;; }
+;;
+;; char *baz() {
+;;   return bar();
+;; }
+;;
+;; char *foo() {
+;;   return baz();
+;; }
+;;
+;; int main(int argc, char **argv) {
+;;   char *x = foo();
+;;   char *y = foo();
+;;   memset(x, 0, 10);
+;;   memset(y, 0, 10);
+;;   delete[] x;
+;;   sleep(10);
+;;   delete[] y;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; Both foo and baz are inlined into main, at both foo callsites.
+;; We should update the graph for new fused nodes for both of those inlined
+;; callsites to bar.
+;;
+;; Note that foo and baz are both dead due to the inlining, but have been left
+;; in the input IR to ensure that the MIB call chain is matched to the longer
+;; inline sequences from main.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_Z3barv,plx \
+; RUN:  -r=%t.o,_Z3bazv,plx \
+; RUN:  -r=%t.o,_Z3foov,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define ptr @_Z3barv() {
+entry:
+  %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
+  ret ptr null
+}
+
+declare ptr @_Znam(i64)
+
+declare ptr @_Z3bazv()
+
+declare ptr @_Z3foov()
+
+define i32 @main() {
+delete.end5:
+  %call.i.i = call ptr @_Z3barv(), !callsite !6
+  %call.i.i8 = call ptr @_Z3barv(), !callsite !7
+  ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+!0 = !{!1, !3}
+!1 = !{!2, !"notcold"}
+!2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!3 = !{!4, !"cold"}
+!4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+!5 = !{i64 9086428284934609951}
+!6 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!7 = !{i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+
+
+; DUMP: CCG before cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[BAR:0x[a-z0-9]+]]
+; DUMP: 	Versions: 1 MIB:
+; DUMP: 		AllocType 1 StackIds: 0, 1, 2
+; DUMP: 		AllocType 2 StackIds: 0, 1, 3
+; DUMP: 	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 2
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN1:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN2:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 2
+
+;; This is the node synthesized for the first inlined call chain of main->foo->baz
+; DUMP: Node [[MAIN1]]
+; DUMP: 	Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 2	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 1
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN1]] AllocTypes: NotCold ContextIds: 1
+; DUMP: 	CallerEdges:
+
+;; This is the node synthesized for the second inlined call chain of main->foo->baz
+; DUMP: Node [[MAIN2]]
+; DUMP: 	Callee: 17377440600225628772 (_Z3barv) Clones: 0 StackIds: 0, 1, 3	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 2
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[BAR]] to Caller: [[MAIN2]] AllocTypes: Cold ContextIds: 2
+; DUMP: 	CallerEdges:


        


More information about the llvm-commits mailing list