[llvm] bf6ff4f - [MemProf] Context disambiguation cloning pass [patch 3/4]

Teresa Johnson via llvm-commits llvm-commits at lists.llvm.org
Wed May 3 13:43:27 PDT 2023


Author: Teresa Johnson
Date: 2023-05-03T13:34:00-07:00
New Revision: bf6ff4fd4b735afffc65f92a4a79f6610e7174c3

URL: https://github.com/llvm/llvm-project/commit/bf6ff4fd4b735afffc65f92a4a79f6610e7174c3
DIFF: https://github.com/llvm/llvm-project/commit/bf6ff4fd4b735afffc65f92a4a79f6610e7174c3.diff

LOG: [MemProf] Context disambiguation cloning pass [patch 3/4]

Applies cloning decisions to the IR, cloning functions and updating
calls. For regular LTO, the IR is updated directly during function
assignment, whereas for ThinLTO the decisions are recorded in the
summary index (a subsequent patch will apply them to the IR via the
index during the ThinLTO backend).

Function assignment and cloning proceed greedily, and we create new
clones as needed when we find an incompatible assignment of function
clones to callsite clones (i.e., when different callers need to invoke
different combinations of callsite clones).
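
To illustrate the incompatible case (a hypothetical example, not taken
from the patch): suppose a function contains two callsites X and Y that
were each cloned, and two callers need different combinations of those
callsite clones:

  caller A -> needs (X clone 0, Y clone 1)
  caller B -> needs (X clone 1, Y clone 0)

No single copy of the function can contain both combinations, so the
assignment creates an additional function clone (named with the
".memprof.<N>" suffix introduced below) and redirects caller B to it.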

Depends on D140949.

Differential Revision: https://reviews.llvm.org/D141077

Added: 
    llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
    llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll

Modified: 
    llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
    llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
    llvm/test/ThinLTO/X86/memprof-basic.ll
    llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
    llvm/test/ThinLTO/X86/memprof-indirectcall.ll
    llvm/test/ThinLTO/X86/memprof-inlined.ll
    llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
    llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll
    llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
    llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index 475ea48cca932..13f3a7eb7ce3f 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -25,11 +25,14 @@ namespace llvm {
 class GlobalValueSummary;
 class Module;
 class ModuleSummaryIndex;
+class OptimizationRemarkEmitter;
 
 class MemProfContextDisambiguation
     : public PassInfoMixin<MemProfContextDisambiguation> {
   /// Run the context disambiguator on \p M, returns true if any changes made.
-  bool processModule(Module &M);
+  bool processModule(
+      Module &M,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 
 public:
   MemProfContextDisambiguation() {}
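
For reference, the new OREGetter parameter is constructed from the
function analysis manager in the pass's run() method (see the .cpp
changes below); a caller would do roughly:

  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };
  processModule(M, OREGetter);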

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 5c8aaddfe3bb3..658633e331118 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -27,8 +27,10 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -39,6 +41,7 @@
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include <sstream>
 #include <vector>
 using namespace llvm;
@@ -46,6 +49,13 @@ using namespace llvm::memprof;
 
 #define DEBUG_TYPE "memprof-context-disambiguation"
 
+STATISTIC(FunctionClonesAnalysis,
+          "Number of function clones created during whole program analysis");
+STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
+                            "cloned) during whole program analysis");
+STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
+                         "during whole program analysis");
+
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
     cl::value_desc("filename"),
@@ -95,6 +105,13 @@ class CallsiteContextGraph {
   /// behavior of an allocation based on its context.
   void identifyClones();
 
+  /// Assign callsite clones to functions, cloning functions as needed to
+  /// accommodate the combinations of their callsite clones reached by callers.
+  /// For regular LTO this clones functions and callsites in the IR, but for
+  /// ThinLTO the cloning decisions are noted in the summaries and applied
+  /// later.
+  bool assignFunctions();
+
   void dump() const;
   void print(raw_ostream &OS) const;
 
@@ -375,6 +392,28 @@ class CallsiteContextGraph {
     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
   }
 
+  /// Update the allocation call to record type of allocated memory.
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
+    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
+    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
+  }
+
+  /// Update non-allocation call to invoke (possibly cloned) function
+  /// CalleeFunc.
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
+    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
+  }
+
+  /// Clone the given function for the given callsite, recording the mapping of
+  /// all of the function's tracked calls to their new versions in the CallMap.
+  /// Assigns new clones to clone number CloneNo.
+  FuncInfo cloneFunctionForCallsite(
+      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
+        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
+  }
+
   /// Gets a label to use in the dot graph for the given call clone in the given
   /// function.
   std::string getLabel(const FuncTy *Func, const CallTy Call,
@@ -469,7 +508,9 @@ class ModuleCallsiteContextGraph
     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                   Instruction *> {
 public:
-  ModuleCallsiteContextGraph(Module &M);
+  ModuleCallsiteContextGraph(
+      Module &M,
+      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 
 private:
   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
@@ -479,10 +520,19 @@ class ModuleCallsiteContextGraph
   bool calleeMatchesFunc(Instruction *Call, const Function *Func);
   uint64_t getLastStackId(Instruction *Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
+  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+                       Instruction *>::FuncInfo
+  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
+                           std::map<CallInfo, CallInfo> &CallMap,
+                           std::vector<CallInfo> &CallsWithMetadataInFunc,
+                           unsigned CloneNo);
   std::string getLabel(const Function *Func, const Instruction *Call,
                        unsigned CloneNo) const;
 
   const Module &Mod;
+  function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
 };
 
 /// Represents a call in the summary index graph, which can either be an
@@ -527,6 +577,14 @@ class IndexCallsiteContextGraph
   bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
   uint64_t getLastStackId(IndexCall &Call);
   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
+  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
+  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
+  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                       IndexCall>::FuncInfo
+  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
+                           std::map<CallInfo, CallInfo> &CallMap,
+                           std::vector<CallInfo> &CallsWithMetadataInFunc,
+                           unsigned CloneNo);
   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                        unsigned CloneNo) const;
 
@@ -1282,10 +1340,14 @@ uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
   return Index.getStackIdAtIndex(CallsiteContext.back());
 }
 
+static const std::string MemProfCloneSuffix = ".memprof.";
+
 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
+  // We use CloneNo == 0 to refer to the original version, which doesn't get
+  // renamed with a suffix.
   if (!CloneNo)
     return Base.str();
-  return (Base + ".memprof." + Twine(CloneNo)).str();
+  return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
 }
 
 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
@@ -1347,7 +1409,9 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
   return StackIds;
 }
 
-ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(Module &M) : Mod(M) {
+ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
+    Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
+    : Mod(M), OREGetter(OREGetter) {
   for (auto &F : M) {
     std::vector<CallInfo> CallsWithMetadata;
     for (auto &BB : F) {
@@ -1661,7 +1725,7 @@ static void checkEdge(
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
-                      bool CheckEdges = false) {
+                      bool CheckEdges = true) {
   if (Node->isRemoved())
     return;
   // Node's context ids should be the union of both its callee and caller edge
@@ -1701,7 +1765,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
   for (const auto Node : nodes<GraphType>(this)) {
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
     for (auto &Edge : Node->CallerEdges)
       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
   }
@@ -1925,12 +1989,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     NewEdge->Callee->CallerEdges.push_back(NewEdge);
   }
   if (VerifyCCG) {
-    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee);
-    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee);
+    checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
+    checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
-      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee);
+      checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
+                                            /*CheckEdges=*/false);
     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
-      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee);
+      checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
+                                            /*CheckEdges=*/false);
   }
 }
 
@@ -1945,7 +2011,7 @@ template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
     ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
   if (VerifyNodes)
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/true);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
   assert(!Node->CloneOf);
 
   // If Node has a null call, then either it wasn't found in the module (regular
@@ -2099,7 +2165,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
   for (auto *Clone : Node->Clones) {
     removeNoneTypeCalleeEdges(Clone);
     if (VerifyNodes)
-      checkNode<DerivedCCG, FuncTy, CallTy>(Clone, /*CheckEdges=*/true);
+      checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
   }
   // We should still have some context ids on the original Node.
   assert(!Node->ContextIds.empty());
@@ -2120,7 +2186,581 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
                        }));
 
   if (VerifyNodes)
-    checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/true);
+    checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+}
+
+static std::string getAllocTypeAttributeString(AllocationType Type) {
+  switch (Type) {
+  case AllocationType::NotCold:
+    return "notcold";
+    break;
+  case AllocationType::Cold:
+    return "cold";
+    break;
+  default:
+    dbgs() << "Unexpected alloc type " << (uint8_t)Type;
+    assert(false);
+  }
+  llvm_unreachable("invalid alloc type");
+}
+
+void ModuleCallsiteContextGraph::updateAllocationCall(
+    CallInfo &Call, AllocationType AllocType) {
+  std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
+  auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
+                                "memprof", AllocTypeString);
+  cast<CallBase>(Call.call())->addFnAttr(A);
+  OREGetter(Call.call()->getFunction())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
+            << ore::NV("AllocationCall", Call.call()) << " in clone "
+            << ore::NV("Caller", Call.call()->getFunction())
+            << " marked with memprof allocation attribute "
+            << ore::NV("Attribute", AllocTypeString));
+}
+
+void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
+                                                     AllocationType AllocType) {
+  auto *AI = Call.call().dyn_cast<AllocInfo *>();
+  assert(AI);
+  assert(AI->Versions.size() > Call.cloneNo());
+  AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
+}
+
+void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+                                            FuncInfo CalleeFunc) {
+  if (CalleeFunc.cloneNo() > 0)
+    cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
+  OREGetter(CallerCall.call()->getFunction())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
+            << ore::NV("Call", CallerCall.call()) << " in clone "
+            << ore::NV("Caller", CallerCall.call()->getFunction())
+            << " assigned to call function clone "
+            << ore::NV("Callee", CalleeFunc.func()));
+}
+
+void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
+                                           FuncInfo CalleeFunc) {
+  auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
+  assert(CI &&
+         "Caller cannot be an allocation which should not have profiled calls");
+  assert(CI->Clones.size() > CallerCall.cloneNo());
+  CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
+}
+
+CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
+                     Instruction *>::FuncInfo
+ModuleCallsiteContextGraph::cloneFunctionForCallsite(
+    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+  // Use existing LLVM facilities for cloning and obtaining Call in clone
+  ValueToValueMapTy VMap;
+  auto *NewFunc = CloneFunction(Func.func(), VMap);
+  std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
+  assert(!Func.func()->getParent()->getFunction(Name));
+  NewFunc->setName(Name);
+  for (auto &Inst : CallsWithMetadataInFunc) {
+    // This map always has the initial version in it.
+    assert(Inst.cloneNo() == 0);
+    CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
+  }
+  OREGetter(Func.func())
+      .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
+            << "created clone " << ore::NV("NewFunction", NewFunc));
+  return {NewFunc, CloneNo};
+}
+
+CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
+                     IndexCall>::FuncInfo
+IndexCallsiteContextGraph::cloneFunctionForCallsite(
+    FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
+    std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
+  // Check how many clones we have of Call (and therefore function).
+  // The next clone number is the current size of versions array.
+  // Confirm this matches the CloneNo provided by the caller, which is based on
+  // the number of function clones we have.
+  assert(CloneNo ==
+         (Call.call().is<AllocInfo *>()
+              ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
+              : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
+  // Walk all the instructions in this function. Create a new version for
+  // each (by adding an entry to the Versions/Clones summary array), and copy
+  // over the version being called for the function clone being cloned here.
+  // Additionally, add an entry to the CallMap for the new function clone,
+  // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
+  // to the new call clone.
+  for (auto &Inst : CallsWithMetadataInFunc) {
+    // This map always has the initial version in it.
+    assert(Inst.cloneNo() == 0);
+    if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
+      assert(AI->Versions.size() == CloneNo);
+      // We assign the allocation type later (in updateAllocationCall), just add
+      // an entry for it here.
+      AI->Versions.push_back(0);
+    } else {
+      auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
+      assert(CI && CI->Clones.size() == CloneNo);
+      // We assign the clone number later (in updateCall), just add an entry for
+      // it here.
+      CI->Clones.push_back(0);
+    }
+    CallMap[Inst] = {Inst.call(), CloneNo};
+  }
+  return {Func.func(), CloneNo};
+}
+
+// This method assigns cloned callsites to functions, cloning the functions as
+// needed. The assignment is greedy and proceeds roughly as follows:
+//
+// For each function Func:
+//   For each call with graph Node having clones:
+//     Initialize ClonesWorklist to Node and its clones
+//     Initialize NodeCloneCount to 0
+//     While ClonesWorklist is not empty:
+//        Clone = pop front ClonesWorklist
+//        NodeCloneCount++
+//        If Func has been cloned less than NodeCloneCount times:
+//           If NodeCloneCount is 1:
+//             Assign Clone to original Func
+//             Continue
+//           Create a new function clone
+//           If other callers not assigned to call a function clone yet:
+//              Assign them to call new function clone
+//              Continue
+//           Assign any other caller calling the cloned version to new clone
+//
+//        For each caller of Clone:
+//           If caller is assigned to call a specific function clone:
+//             If we cannot assign Clone to that function clone:
+//               Create new callsite Clone NewClone
+//               Add NewClone to ClonesWorklist
+//               Continue
+//             Assign Clone to existing caller's called function clone
+//           Else:
+//             If Clone not already assigned to a function clone:
+//                Assign to first function clone without assignment
+//             Assign caller to selected function clone
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
+  bool Changed = false;
+
+  // Keep track of the assignment of nodes (callsites) to function clones they
+  // call.
+  DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
+
+  // Update caller node to call function version CalleeFunc, by recording the
+  // assignment in CallsiteToCalleeFuncCloneMap.
+  auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
+                                        const FuncInfo &CalleeFunc) {
+    assert(Caller->hasCall());
+    CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
+  };
+
+  // Walk all functions for which we saw calls with memprof metadata, and handle
+  // cloning for each of its calls.
+  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
+    FuncInfo OrigFunc(Func);
+    // Map from each clone of OrigFunc to a map of remappings of each call of
+    // interest (from original uncloned call to the corresponding cloned call in
+    // that function clone).
+    std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
+    for (auto &Call : CallsWithMetadata) {
+      ContextNode *Node = getNodeForInst(Call);
+      // Skip call if we do not have a node for it (all uses of its stack ids
+      // were either on inlined chains or pruned from the MIBs), or if we did
+      // not create any clones for it.
+      if (!Node || Node->Clones.empty())
+        continue;
+      assert(Node->hasCall() &&
+             "Not having a call should have prevented cloning");
+
+      // Track the assignment of function clones to clones of the current
+      // callsite Node being handled.
+      std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
+
+      // Assign callsite version CallsiteClone to function version FuncClone,
+      // and also assign (possibly cloned) Call to CallsiteClone.
+      auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
+                                                CallInfo &Call,
+                                                ContextNode *CallsiteClone,
+                                                bool IsAlloc) {
+        // Record the clone of callsite node assigned to this function clone.
+        FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
+
+        assert(FuncClonesToCallMap.count(FuncClone));
+        std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
+        CallInfo CallClone(Call);
+        if (CallMap.count(Call))
+          CallClone = CallMap[Call];
+        CallsiteClone->setCall(CallClone);
+      };
+
+      // Keep track of the clones of callsite Node that need to be assigned to
+      // function clones. This list may be expanded in the loop body below if we
+      // find additional cloning is required.
+      std::deque<ContextNode *> ClonesWorklist;
+      // Ignore original Node if we moved all of its contexts to clones.
+      if (!Node->ContextIds.empty())
+        ClonesWorklist.push_back(Node);
+      ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
+                            Node->Clones.end());
+
+      // Now walk through all of the clones of this callsite Node that we need,
+      // and determine the assignment to a corresponding clone of the current
+      // function (creating new function clones as needed).
+      unsigned NodeCloneCount = 0;
+      while (!ClonesWorklist.empty()) {
+        ContextNode *Clone = ClonesWorklist.front();
+        ClonesWorklist.pop_front();
+        NodeCloneCount++;
+        if (VerifyNodes)
+          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
+
+        // Need to create a new function clone if we have more callsite clones
+        // than existing function clones, which would have been assigned to an
+        // earlier clone in the list (we assign callsite clones to function
+        // clones greedily).
+        if (FuncClonesToCallMap.size() < NodeCloneCount) {
+          // If this is the first callsite copy, assign to original function.
+          if (NodeCloneCount == 1) {
+            // Since FuncClonesToCallMap is empty in this case, no clones have
+            // been created for this function yet, and no callers should have
+            // been assigned a function clone for this callee node yet.
+            assert(llvm::none_of(
+                Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
+                  return CallsiteToCalleeFuncCloneMap.count(E->Caller);
+                }));
+            // Initialize with empty call map, assign Clone to original function
+            // and its callers, and skip to the next clone.
+            FuncClonesToCallMap[OrigFunc] = {};
+            AssignCallsiteCloneToFuncClone(
+                OrigFunc, Call, Clone,
+                AllocationCallToContextNodeMap.count(Call));
+            for (auto &CE : Clone->CallerEdges) {
+              // Ignore any caller that does not have a recorded callsite Call.
+              if (!CE->Caller->hasCall())
+                continue;
+              RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
+            }
+            continue;
+          }
+
+          // First locate which copy of OrigFunc to clone again. If a caller
+          // of this callsite clone was already assigned to call a particular
+          // function clone, we need to redirect all of those callers to the
+          // new function clone, and update their other callees within this
+          // function.
+          FuncInfo PreviousAssignedFuncClone;
+          auto EI = llvm::find_if(
+              Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
+                return CallsiteToCalleeFuncCloneMap.count(E->Caller);
+              });
+          bool CallerAssignedToCloneOfFunc = false;
+          if (EI != Clone->CallerEdges.end()) {
+            const std::shared_ptr<ContextEdge> &Edge = *EI;
+            PreviousAssignedFuncClone =
+                CallsiteToCalleeFuncCloneMap[Edge->Caller];
+            CallerAssignedToCloneOfFunc = true;
+          }
+
+          // Clone function and save it along with the CallInfo map created
+          // during cloning in the FuncClonesToCallMap.
+          std::map<CallInfo, CallInfo> NewCallMap;
+          unsigned CloneNo = FuncClonesToCallMap.size();
+          assert(CloneNo > 0 && "Clone 0 is the original function, which "
+                                "should already exist in the map");
+          FuncInfo NewFuncClone = cloneFunctionForCallsite(
+              OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
+          FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
+          FunctionClonesAnalysis++;
+          Changed = true;
+
+          // If no caller callsites were already assigned to a clone of this
+          // function, we can simply assign this clone to the new func clone
+          // and update all callers to it, then skip to the next clone.
+          if (!CallerAssignedToCloneOfFunc) {
+            AssignCallsiteCloneToFuncClone(
+                NewFuncClone, Call, Clone,
+                AllocationCallToContextNodeMap.count(Call));
+            for (auto &CE : Clone->CallerEdges) {
+              // Ignore any caller that does not have a recorded callsite Call.
+              if (!CE->Caller->hasCall())
+                continue;
+              RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
+            }
+            continue;
+          }
+
+          // We may need to do additional node cloning in this case.
+          // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
+          // that were previously assigned to call PreviousAssignedFuncClone,
+          // to record that they now call NewFuncClone.
+          for (auto CE : Clone->CallerEdges) {
+            // Ignore any caller that does not have a recorded callsite Call.
+            if (!CE->Caller->hasCall())
+              continue;
+
+            if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
+                // We subsequently fall through to later handling that
+                // will perform any additional cloning required for
+                // callers that were calling other function clones.
+                CallsiteToCalleeFuncCloneMap[CE->Caller] !=
+                    PreviousAssignedFuncClone)
+              continue;
+
+            RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
+
+            // If we are cloning a function that was already assigned to some
+            // callers, then essentially we are creating new callsite clones
+            // of the other callsites in that function that are reached by those
+            // callers. Clone the other callees of the current callsite's caller
+            // that were already assigned to PreviousAssignedFuncClone
+            // accordingly. This is important since we subsequently update the
+            // calls from the nodes in the graph and their assignments to callee
+            // functions recorded in CallsiteToCalleeFuncCloneMap.
+            for (auto CalleeEdge : CE->Caller->CalleeEdges) {
+              // Skip any that have been removed on an earlier iteration when
+              // cleaning up newly None type callee edges.
+              if (!CalleeEdge)
+                continue;
+              ContextNode *Callee = CalleeEdge->Callee;
+              // Skip the current callsite, we are looking for other
+              // callsites Caller calls, as well as any that does not have a
+              // recorded callsite Call.
+              if (Callee == Clone || !Callee->hasCall())
+                continue;
+              ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
+              removeNoneTypeCalleeEdges(NewClone);
+              // Moving the edge may have resulted in some none type
+              // callee edges on the original Callee.
+              removeNoneTypeCalleeEdges(Callee);
+              assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
+              // If the Callee node was already assigned to call a specific
+              // function version, make sure its new clone is assigned to call
+              // that same function clone.
+              if (CallsiteToCalleeFuncCloneMap.count(Callee))
+                RecordCalleeFuncOfCallsite(
+                    NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
+              // Update NewClone with the new Call clone of this callsite's Call
+              // created for the new function clone created earlier.
+              // Recall that we have already ensured when building the graph
+              // that each caller can only call callsites within the same
+              // function, so we are guaranteed that Callee Call is in the
+              // current OrigFunc.
+              // CallMap is set up as indexed by original Call at clone 0.
+              CallInfo OrigCall(Callee->getOrigNode()->Call);
+              OrigCall.setCloneNo(0);
+              std::map<CallInfo, CallInfo> &CallMap =
+                  FuncClonesToCallMap[NewFuncClone];
+              assert(CallMap.count(OrigCall));
+              CallInfo NewCall(CallMap[OrigCall]);
+              assert(NewCall);
+              NewClone->setCall(NewCall);
+            }
+          }
+          // Fall through to handling below to perform the recording of the
+          // function for this callsite clone. This enables handling of cases
+          // where the callers were assigned to different clones of a function.
+        }
+
+        // See if we can use existing function clone. Walk through
+        // all caller edges to see if any have already been assigned to
+        // a clone of this callsite's function. If we can use it, do so. If not,
+        // because that function clone is already assigned to a different clone
+        // of this callsite, then we need to clone again.
+        // Basically, this checking is needed to handle the case where different
+        // caller functions/callsites may need versions of this function
+        // containing different mixes of callsite clones across the different
+        // callsites within the function. If that happens, we need to create
+        // additional function clones to handle the various combinations.
+        //
+        // Keep track of any new clones of this callsite created by the
+        // following loop, as well as any existing clone that we decided to
+        // assign this clone to.
+        std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
+        FuncInfo FuncCloneAssignedToCurCallsiteClone;
+        // We need to be able to remove Edge from CallerEdges, so need to adjust
+        // iterator in the loop.
+        for (auto EI = Clone->CallerEdges.begin();
+             EI != Clone->CallerEdges.end();) {
+          auto Edge = *EI;
+          // Ignore any caller that does not have a recorded callsite Call.
+          if (!Edge->Caller->hasCall()) {
+            EI++;
+            continue;
+          }
+          // If this caller already assigned to call a version of OrigFunc, need
+          // to ensure we can assign this callsite clone to that function clone.
+          if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
+            FuncInfo FuncCloneCalledByCaller =
+                CallsiteToCalleeFuncCloneMap[Edge->Caller];
+            // First we need to confirm that this function clone is available
+            // for use by this callsite node clone.
+            //
+            // While FuncCloneToCurNodeCloneMap is built only for this Node and
+            // its callsite clones, one of those callsite clones X could have
+            // been assigned to the same function clone called by Edge's caller
+            // - if Edge's caller calls another callsite within Node's original
+            // function, and that callsite has another caller reaching clone X.
+            // We need to clone Node again in this case.
+            if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
+                 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
+                     Clone) ||
+                // Detect when we have multiple callers of this callsite that
+                // have already been assigned to specific, and different, clones
+                // of OrigFunc (due to other unrelated callsites in Func they
+                // reach via call contexts). Is this Clone of callsite Node
+                // assigned to a different clone of OrigFunc? If so, clone Node
+                // again.
+                (FuncCloneAssignedToCurCallsiteClone &&
+                 FuncCloneAssignedToCurCallsiteClone !=
+                     FuncCloneCalledByCaller)) {
+              // We need to use a different newly created callsite clone, in
+              // order to assign it to another new function clone on a
+              // subsequent iteration over the Clones array (adjusted below).
+              // Note we specifically do not reset the
+              // CallsiteToCalleeFuncCloneMap entry for this caller, so that
+              // when this new clone is processed later we know which version of
+              // the function to copy (so that other callsite clones we have
+              // assigned to that function clone are properly cloned over). See
+              // comments in the function cloning handling earlier.
+
+              // Check if we already have cloned this callsite again while
+              // walking through caller edges, for a caller calling the same
+              // function clone. If so, we can move this edge to that new clone
+              // rather than creating yet another new clone.
+              if (FuncCloneToNewCallsiteCloneMap.count(
+                      FuncCloneCalledByCaller)) {
+                ContextNode *NewClone =
+                    FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
+                moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
+                // Cleanup any none type edges cloned over.
+                removeNoneTypeCalleeEdges(NewClone);
+              } else {
+                // Create a new callsite clone.
+                ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
+                removeNoneTypeCalleeEdges(NewClone);
+                FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
+                    NewClone;
+                // Add to list of clones and process later.
+                ClonesWorklist.push_back(NewClone);
+                assert(EI == Clone->CallerEdges.end() ||
+                       Clone->AllocTypes != (uint8_t)AllocationType::None);
+                assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
+              }
+              // Moving the caller edge may have resulted in some none type
+              // callee edges.
+              removeNoneTypeCalleeEdges(Clone);
+              // We will handle the newly created callsite clone in a subsequent
+              // iteration over this Node's Clones. Continue here since we
+              // already adjusted iterator EI while moving the edge.
+              continue;
+            }
+
+            // Otherwise, we can use the function clone already assigned to this
+            // caller.
+            if (!FuncCloneAssignedToCurCallsiteClone) {
+              FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
+              // Assign Clone to FuncCloneCalledByCaller
+              AssignCallsiteCloneToFuncClone(
+                  FuncCloneCalledByCaller, Call, Clone,
+                  AllocationCallToContextNodeMap.count(Call));
+            } else
+              // Don't need to do anything - callsite is already calling this
+              // function clone.
+              assert(FuncCloneAssignedToCurCallsiteClone ==
+                     FuncCloneCalledByCaller);
+
+          } else {
+            // We have not already assigned this caller to a version of
+            // OrigFunc. Do the assignment now.
+
+            // First check if we have already assigned this callsite clone to a
+            // clone of OrigFunc for another caller during this iteration over
+            // its caller edges.
+            if (!FuncCloneAssignedToCurCallsiteClone) {
+              // Find first function in FuncClonesToCallMap without an assigned
+              // clone of this callsite Node. We should always have one
+              // available at this point due to the earlier cloning when the
+              // FuncClonesToCallMap size was smaller than the clone number.
+              for (auto &CF : FuncClonesToCallMap) {
+                if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
+                  FuncCloneAssignedToCurCallsiteClone = CF.first;
+                  break;
+                }
+              }
+              assert(FuncCloneAssignedToCurCallsiteClone);
+              // Assign Clone to FuncCloneAssignedToCurCallsiteClone
+              AssignCallsiteCloneToFuncClone(
+                  FuncCloneAssignedToCurCallsiteClone, Call, Clone,
+                  AllocationCallToContextNodeMap.count(Call));
+            } else
+              assert(FuncCloneToCurNodeCloneMap
+                         [FuncCloneAssignedToCurCallsiteClone] == Clone);
+            // Update callers to record function version called.
+            RecordCalleeFuncOfCallsite(Edge->Caller,
+                                       FuncCloneAssignedToCurCallsiteClone);
+          }
+
+          EI++;
+        }
+      }
+      if (VerifyCCG) {
+        checkNode<DerivedCCG, FuncTy, CallTy>(Node);
+        for (const auto &PE : Node->CalleeEdges)
+          checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
+        for (const auto &CE : Node->CallerEdges)
+          checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
+        for (auto *Clone : Node->Clones) {
+          checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
+          for (const auto &PE : Clone->CalleeEdges)
+            checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
+          for (const auto &CE : Clone->CallerEdges)
+            checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
+        }
+      }
+    }
+  }
+
+  auto UpdateCalls = [&](ContextNode *Node,
+                         DenseSet<const ContextNode *> &Visited,
+                         auto &&UpdateCalls) {
+    auto Inserted = Visited.insert(Node);
+    if (!Inserted.second)
+      return;
+
+    for (auto *Clone : Node->Clones)
+      UpdateCalls(Clone, Visited, UpdateCalls);
+
+    for (auto &Edge : Node->CallerEdges)
+      UpdateCalls(Edge->Caller, Visited, UpdateCalls);
+
+    // Skip if either no call to update, or if we ended up with no context ids
+    // (we moved all edges onto other clones).
+    if (!Node->hasCall() || Node->ContextIds.empty())
+      return;
+
+    if (Node->IsAllocation) {
+      updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
+      return;
+    }
+
+    if (!CallsiteToCalleeFuncCloneMap.count(Node))
+      return;
+
+    auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
+    updateCall(Node->Call, CalleeFunc);
+  };
+
+  // Performs DFS traversal starting from allocation nodes to update calls to
+  // reflect cloning decisions recorded earlier. For regular LTO this will
+  // update the actual calls in the IR to call the appropriate function clone
+  // (and add attributes to allocation calls), whereas for ThinLTO the decisions
+  // are recorded in the summary entries.
+  DenseSet<const ContextNode *> Visited;
+  for (auto &Entry : AllocationCallToContextNodeMap)
+    UpdateCalls(Entry.second, Visited, UpdateCalls);
+
+  return Changed;
 }
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
@@ -2149,13 +2789,24 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
   if (ExportToDot)
     exportToDot("cloned");
 
-  return false;
+  bool Changed = assignFunctions();
+
+  if (DumpCCG) {
+    dbgs() << "CCG after assigning function clones:\n";
+    dbgs() << *this;
+  }
+  if (ExportToDot)
+    exportToDot("clonefuncassign");
+
+  return Changed;
 }
 
-bool MemProfContextDisambiguation::processModule(Module &M) {
+bool MemProfContextDisambiguation::processModule(
+    Module &M,
+    function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
   bool Changed = false;
 
-  ModuleCallsiteContextGraph CCG(M);
+  ModuleCallsiteContextGraph CCG(M, OREGetter);
   Changed = CCG.process();
 
   return Changed;
@@ -2163,7 +2814,11 @@ bool MemProfContextDisambiguation::processModule(Module &M) {
 
 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
                                                     ModuleAnalysisManager &AM) {
-  if (!processModule(M))
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+  };
+  if (!processModule(M, OREGetter))
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
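
To summarize the regular-LTO IR effects above (a sketch using the naming
and attribute scheme from getMemProfFuncName and updateAllocationCall;
"foo" stands in for an arbitrary function name): clone 0 keeps the
original name, later clones get a ".memprof.<N>" suffix, and each
allocation call is tagged with a "memprof" string attribute:

  getMemProfFuncName("foo", 0); // -> "foo"
  getMemProfFuncName("foo", 2); // -> "foo.memprof.2"
  // updateAllocationCall then adds, e.g.:
  //   cast<CallBase>(Call.call())->addFnAttr(
  //       Attribute::get(Ctx, "memprof", "cold"));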

diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index eaac271fbef2c..06dc6b1c87807 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -39,13 +39,35 @@
 ; RUN:	-r=%t.o,_Znam, \
 ; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:	-stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:	--check-prefix=STATS
 
 ; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
 ; RUN:	cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
 
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:	-r=%t.o,main,plx \
+; RUN:	-r=%t.o,_ZdaPv, \
+; RUN:	-r=%t.o,sleep, \
+; RUN:	-r=%t.o,_Znam, \
+; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:	-stats -pass-remarks=memprof-context-disambiguation \
+; RUN:	-o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:	--check-prefix=STATS
+
+; RUN:	cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should have cloned bar, baz, and foo, for the cold memory allocation.
+; RUN:	cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
 source_filename = "memprof-basic.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -227,6 +249,11 @@ uselistorder ptr @_Z3foov, { 1, 0 }
 ; DUMP:		Clone of [[BAR]]
 
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: 	label="postbuild";
 ; DOT: 	Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
@@ -258,3 +285,9 @@ uselistorder ptr @_Z3foov, { 1, 0 }
 ; DOTCLONED: 	Node[[BAZ2]] -> Node[[BAR2:0x[a-z0-9]+]][tooltip="ContextIds: 2",fillcolor="cyan"];
 ; DOTCLONED: 	Node[[BAR2]] [shape=record,tooltip="N[[BAR2]] ContextIds: 2",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z3barv -\> alloc}"];
 ; DOTCLONED: }
+
+
+; DISTRIB: ^[[BAZ:[0-9]+]] = gv: (guid: 5878270615442837395, {{.*}} callsites: ((callee: ^[[BAR:[0-9]+]], clones: (0, 1)
+; DISTRIB: ^[[FOO:[0-9]+]] = gv: (guid: 6731117468105397038, {{.*}} callsites: ((callee: ^[[BAZ]], clones: (0, 1)
+; DISTRIB: ^[[BAR]] = gv: (guid: 9832687305761716512, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[MAIN:[0-9]+]] = gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^[[FOO]], clones: (0), {{.*}} (callee: ^[[FOO]], clones: (1)
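
Reading the DISTRIB lines above (per the summary update code in the .cpp
changes, which fills in AllocInfo::Versions and CallsiteInfo::Clones):
each "clones:" list is indexed by the clone number of the function
containing the callsite and names the callee function clone that version
should call, while each "versions:" list records the allocation type
assigned to each clone of the allocating function. For example, main's
two calls to foo are assigned to foo clones 0 and 1 respectively, and
bar's allocation is notcold in the original and cold in its clone.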

diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
index 6f89b369897ec..d4aceabfa8aaf 100644
--- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
@@ -1,7 +1,8 @@
 ;; Test callsite context graph generation for call graph with MIBs
 ;; that have pruned contexts that partially match multiple inlined
 ;; callsite contexts, requiring duplication of context ids and nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -60,7 +61,9 @@
 ; RUN:  -r=%t.o,_Znam, \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
 
 ; RUN:  cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
 ; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -68,6 +71,27 @@
 ; RUN:  cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
 
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+; RUN:  cat %t2.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
+; RUN:  cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
+;; We should clone D once for the cold allocations via C.
+; RUN:  cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
+;; Check distributed index
+; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
+
 source_filename = "duplicate-context-ids.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -104,7 +128,13 @@ entry:
   ret ptr null
 }
 
-declare i32 @main()
+define i32 @main() {
+entry:
+  call ptr @_Z1Bv()
+  call ptr @_Z1Ev()
+  call ptr @_Z1Fv()
+  ret i32 0
+}
 
 declare void @_ZdaPv()
 
@@ -268,6 +298,11 @@ declare i32 @sleep()
 ; DUMP:         Clone of [[D]]
 
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOTPRE: digraph "prestackupdate" {
 ; DOTPRE: 	label="prestackupdate";
 ; DOTPRE: 	Node[[D:0x[a-z0-9]+]] [shape=record,tooltip="N[[D]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
@@ -305,3 +340,9 @@ declare i32 @sleep()
 ; DOTCLONED: 	Node[[E]] -> Node[[D2]][tooltip="ContextIds: 1",fillcolor="cyan"];
 ; DOTCLONED: 	Node[[D2]] [shape=record,tooltip="N[[D2]] ContextIds: 1 3 4",fillcolor="cyan",style="filled",color="blue",style="filled,bold,dashed",label="{OrigId: Alloc0\n_Z1Dv -\> alloc}"];
 ; DOTCLONED: }
+
+; DISTRIB: ^[[C:[0-9]+]] = gv: (guid: 1643923691937891493, {{.*}} callsites: ((callee: ^[[D:[0-9]+]], clones: (1)
+; DISTRIB: ^[[D]] = gv: (guid: 4881081444663423788, {{.*}} allocs: ((versions: (notcold, cold)
+; DISTRIB: ^[[B:[0-9]+]] = gv: (guid: 14590037969532473829, {{.*}} callsites: ((callee: ^[[D]], clones: (1)
+; DISTRIB: ^[[F:[0-9]+]] = gv: (guid: 17035303613541779335, {{.*}} callsites: ((callee: ^[[D]], clones: (0)
+; DISTRIB: ^[[E:[0-9]+]] = gv: (guid: 17820708772846654376, {{.*}} callsites: ((callee: ^[[D]], clones: (1)

diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
new file mode 100644
index 0000000000000..59d1b1be156db
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
@@ -0,0 +1,232 @@
+;; Test context disambiguation for a callgraph containing multiple memprof
+;; contexts and no inlining, where we need to perform additional cloning
+;; during function assignment/cloning to handle the combination of contexts
+;; to 2 different allocations.
+;;
+;; void E(char **buf1, char **buf2) {
+;;   *buf1 = new char[10];
+;;   *buf2 = new char[10];
+;; }
+;;
+;; void B(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; void C(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; void D(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;; int main(int argc, char **argv) {
+;;   char *cold1, *cold2, *default1, *default2, *default3, *default4;
+;;   B(&default1, &default2);
+;;   C(&default3, &cold1);
+;;   D(&cold2, &default4);
+;;   memset(cold1, 0, 10);
+;;   memset(cold2, 0, 10);
+;;   memset(default1, 0, 10);
+;;   memset(default2, 0, 10);
+;;   memset(default3, 0, 10);
+;;   memset(default4, 0, 10);
+;;   delete[] default1;
+;;   delete[] default2;
+;;   delete[] default3;
+;;   delete[] default4;
+;;   sleep(10);
+;;   delete[] cold1;
+;;   delete[] cold2;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+
+; RUN: opt -thinlto-bc %s >%t.o
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+
+source_filename = "funcassigncloning.ll"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline optnone
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) {
+entry:
+  %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
+  %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
+  ret void
+}
+
+declare ptr @_Znam(i64)
+
+define internal void @_Z1BPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16
+  ret void
+}
+
+define internal void @_Z1CPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17
+  ret void
+}
+
+define internal void @_Z1DPPcS0_() {
+entry:
+  call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18
+  ret void
+}
+
+; Function Attrs: noinline optnone
+define i32 @main() {
+entry:
+  call void @_Z1BPPcS0_()
+  call void @_Z1CPPcS0_()
+  call void @_Z1DPPcS0_()
+  ret i32 0
+}
+
+declare void @_ZdaPv()
+
+declare i32 @sleep()
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0 }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+
+
+;; Originally we create a single clone of each call to new from E, since each
+;; allocates cold memory for a single caller.
+
+; DUMP: CCG after cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]]
+; DUMP:         Versions: 1 MIB:
+; DUMP:                 AllocType 2 StackIds: 0
+; DUMP:                 AllocType 1 StackIds: 1
+; DUMP:                 AllocType 1 StackIds: 2
+; DUMP:         (clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 2 3
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
+; DUMP: 		Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 	Clones: [[ENEW1CLONE:0x[a-z0-9]+]]
+
+; DUMP: Node [[D:0x[a-z0-9]+]]
+; DUMP: 	Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 0 (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 1 6
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: 		Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[C]]
+; DUMP: 	Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 1 (clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 2 5
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2
+; DUMP: 		Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[B]]
+; DUMP: 	Callee: 10758063066234039248 (_Z1EPPcS0_) Clones: 0 StackIds: 2 (clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 3 4
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 		Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[ENEW2ORIG]]
+; DUMP:         Versions: 1 MIB:
+; DUMP:                 AllocType 1 StackIds: 2
+; DUMP:                 AllocType 2 StackIds: 1
+; DUMP:                 AllocType 1 StackIds: 0
+; DUMP:         (clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 4 6
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: 		Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: 	Clones: [[ENEW2CLONE]]
+
+; DUMP: Node [[ENEW1CLONE]]
+; DUMP:         Versions: 1 MIB:
+; DUMP:                 AllocType 2 StackIds: 0
+; DUMP:                 AllocType 1 StackIds: 1
+; DUMP:                 AllocType 1 StackIds: 2
+; DUMP:         (clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 1
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: 	Clone of [[ENEW1ORIG]]
+
+; DUMP: Node [[ENEW2CLONE]]
+; DUMP:         Versions: 1 MIB:
+; DUMP:                 AllocType 1 StackIds: 2
+; DUMP:                 AllocType 2 StackIds: 1
+; DUMP:                 AllocType 1 StackIds: 0
+; DUMP:         (clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 5
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: 	Clone of [[ENEW2ORIG]]
+
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis

diff  --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
index 8ba958a1ccd76..a27b4b639d772 100644
--- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
+++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
@@ -1,7 +1,7 @@
 ;; Tests callsite context graph generation for call graph containing indirect
 ;; calls. Currently this should result in conservative behavior, such that the
 ;; indirect call receives a null call in its graph node, to prevent subsequent
-;; cloning.
+;; cloning. Also tests graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -61,7 +61,9 @@
 ; RUN:  -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
 
 ; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should only create a single clone of foo, for the direct call
@@ -69,6 +71,26 @@
 ; RUN:  cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
 
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -r=%t.o,_ZTVN10__cxxabiv120__si_class_type_infoE, \
+; RUN:  -r=%t.o,_ZTVN10__cxxabiv117__class_type_infoE, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+; RUN:  cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should only create a single clone of foo, for the direct call
+;; from main allocating cold memory.
+; RUN:  cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
 source_filename = "indirectcall.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -359,6 +381,11 @@ uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
 ; DUMP:		Clone of [[FOO]]
 
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: 	label="postbuild";
 ; DOT: 	Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> alloc}"];

diff  --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll
index d6fa0d39a9cea..ea4c9c7a4dc88 100644
--- a/llvm/test/ThinLTO/X86/memprof-inlined.ll
+++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll
@@ -1,6 +1,7 @@
 ;; Test callsite context graph generation for call graph with two memprof
 ;; contexts and partial inlining, requiring generation of a new fused node to
 ;; represent the inlined sequence while matching callsite nodes onto the graph.
+;; Also tests graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -48,7 +49,9 @@
 ; RUN:	-r=%t.o,_Znam, \
 ; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
+; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
 
 ; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should create clones for foo and bar for the call from main to allocate
@@ -56,6 +59,24 @@
 ; RUN:	cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
 
+;; Try again but with distributed ThinLTO
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
+; RUN:  -thinlto-distributed-indexes \
+; RUN:  -r=%t.o,main,plx \
+; RUN:  -r=%t.o,_ZdaPv, \
+; RUN:  -r=%t.o,sleep, \
+; RUN:  -r=%t.o,_Znam, \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:  --check-prefix=STATS
+
+; RUN:	cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
+;; We should create clones for foo and bar for the call from main to allocate
+;; cold memory.
+; RUN:	cat %t2.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
+
 source_filename = "inlined.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -257,6 +278,11 @@ declare i32 @sleep()
 ; DUMP:         Clone of [[BAR]]
 
 
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: 	label="postbuild";
 ; DOT: 	Node[[BAZ:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAZ]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3bazv -\> alloc}"];

diff  --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
index 27e65219b8c60..d7fdaaef0f03a 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
@@ -1,5 +1,5 @@
 ;; Test callsite context graph generation for simple call graph with
-;; two memprof contexts and no inlining.
+;; two memprof contexts and no inlining, as well as graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -34,7 +34,9 @@
 ; RUN: opt -passes=memprof-context-disambiguation \
 ; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:	%s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:	-stats -pass-remarks=memprof-context-disambiguation \
+; RUN:	%s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN:	--check-prefix=STATS --check-prefix=REMARKS
 
 ; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
@@ -222,6 +224,48 @@ attributes #6 = { builtin }
 ; DUMP:		Clone of [[BAR]]
 
 
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: created clone _Z3bazv.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1
+; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov assigned to call function clone _Z3bazv
+; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+
+
+; IR: define {{.*}} @main
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR:   call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR:   call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv()
+; IR:   call {{.*}} @_Z3barv()
+; IR: define internal {{.*}} @_Z3foov()
+; IR:   call {{.*}} @_Z3bazv()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv.memprof.1()
+; IR:   call {{.*}} @_Z3barv.memprof.1()
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR:   call {{.*}} @_Z3bazv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: 	label="postbuild";
 ; DOT: 	Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];

diff  --git a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll
index 193b31b4a705a..4108461793a60 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/duplicate-context-ids.ll
@@ -1,7 +1,8 @@
 ;; Test callsite context graph generation for call graph with MIBs
 ;; that have pruned contexts that partially match multiple inlined
 ;; callsite contexts, requiring duplication of context ids and nodes
-;; while matching callsite nodes onto the graph.
+;; while matching callsite nodes onto the graph. Also tests graph and IR
+;; cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -55,7 +56,9 @@
 ; RUN: opt -passes=memprof-context-disambiguation \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN:  --check-prefix=STATS --check-prefix=REMARKS
 
 ; RUN:  cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
 ; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
@@ -263,6 +266,39 @@ attributes #6 = { builtin }
 ; DUMP: 		Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
 ; DUMP:         Clone of [[D]]
 
+; REMARKS: created clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Cv assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Fv assigned to call function clone _Z1Dv
+; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold
+
+
+;; The allocation via F does not allocate cold memory. It should call the
+;; original D, which ultimately calls the original allocation decorated
+;; with a "notcold" attribute.
+; IR: define internal {{.*}} @_Z1Dv()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z1Fv()
+; IR:   call {{.*}} @_Z1Dv()
+;; The allocations via B and E allocate cold memory. They should call the
+;; cloned D, which ultimately calls the cloned allocation decorated with a
+;; "cold" attribute.
+; IR: define internal {{.*}} @_Z1Bv()
+; IR:   call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Ev()
+; IR:   call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Dv.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
 
 ; DOTPRE: digraph "prestackupdate" {
 ; DOTPRE: 	label="prestackupdate";

diff  --git a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll
new file mode 100644
index 0000000000000..8786d57d03ceb
--- /dev/null
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning.ll
@@ -0,0 +1,244 @@
+;; Test context disambiguation for a call graph containing multiple memprof
+;; contexts and no inlining, where we need to perform additional cloning
+;; during function assignment/cloning to handle the combination of contexts
+;; to 2 different allocations.
+;;
+;; void E(char **buf1, char **buf2) {
+;;   *buf1 = new char[10];
+;;   *buf2 = new char[10];
+;; }
+;;
+;; void B(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; void C(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; void D(char **buf1, char **buf2) {
+;;   E(buf1, buf2);
+;; }
+;;
+;; int main(int argc, char **argv) {
+;;   char *cold1, *cold2, *default1, *default2, *default3, *default4;
+;;   B(&default1, &default2);
+;;   C(&default3, &cold1);
+;;   D(&cold2, &default4);
+;;   memset(cold1, 0, 10);
+;;   memset(cold2, 0, 10);
+;;   memset(default1, 0, 10);
+;;   memset(default2, 0, 10);
+;;   memset(default3, 0, 10);
+;;   memset(default4, 0, 10);
+;;   delete[] default1;
+;;   delete[] default2;
+;;   delete[] default3;
+;;   delete[] default4;
+;;   sleep(10);
+;;   delete[] cold1;
+;;   delete[] cold2;
+;;   return 0;
+;; }
+;;
+;; Code compiled with -mllvm -memprof-min-lifetime-cold-threshold=5 so that the
+;; memory freed after sleep(10) results in cold lifetimes.
+;;
+;; The IR was then reduced using llvm-reduce with the expected FileCheck input.
+
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN:  --check-prefix=STATS --check-prefix=REMARKS
+
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
+entry:
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7
+  %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15
+  ret void
+}
+
+declare ptr @_Znam(i64) #1
+
+define internal void @_Z1BPPcS0_(ptr %0, ptr %1) {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16
+  ret void
+}
+
+; Function Attrs: noinline
+define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17
+  ret void
+}
+
+define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 {
+entry:
+  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4
+
+declare i32 @sleep() #5
+
+; uselistorder directives
+uselistorder ptr @_Znam, { 1, 0 }
+
+attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" }
+attributes #1 = { "no-trapping-math"="true" }
+attributes #2 = { noinline }
+attributes #3 = { "frame-pointer"="all" }
+attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #5 = { "disable-tail-calls"="true" }
+attributes #6 = { builtin }
+
+!0 = !{!1, !3, !5}
+!1 = !{!2, !"cold"}
+!2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
+!3 = !{!4, !"notcold"}
+!4 = !{i64 -3461278137325233666, i64 -3483158674395044949}
+!5 = !{!6, !"notcold"}
+!6 = !{i64 -3461278137325233666, i64 -2441057035866683071}
+!7 = !{i64 -3461278137325233666}
+!8 = !{!9, !11, !13}
+!9 = !{!10, !"notcold"}
+!10 = !{i64 -1415475215210681400, i64 -2441057035866683071}
+!11 = !{!12, !"cold"}
+!12 = !{i64 -1415475215210681400, i64 -3483158674395044949}
+!13 = !{!14, !"notcold"}
+!14 = !{i64 -1415475215210681400, i64 -7799663586031895603}
+!15 = !{i64 -1415475215210681400}
+!16 = !{i64 -2441057035866683071}
+!17 = !{i64 -3483158674395044949}
+!18 = !{i64 -7799663586031895603}
+
+
+;; Originally we create a single clone of each call to new from E, since each
+;; allocates cold memory for a single caller.
+
+; DUMP: CCG after cloning:
+; DUMP: Callsite Context Graph:
+; DUMP: Node [[ENEW1ORIG:0x[a-z0-9]+]]
+; DUMP: 	  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 2 3
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[ENEW1ORIG]] to Caller: [[C:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 2
+; DUMP: 		Edge from Callee [[ENEW1ORIG]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 	Clones: [[ENEW1CLONE:0x[a-z0-9]+]]
+
+; DUMP: Node [[D:0x[a-z0-9]+]]
+; DUMP:           call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1) (clone 0)
+; DUMP:         AllocTypes: NotColdCold
+; DUMP:         ContextIds: 1 6
+; DUMP:         CalleeEdges:
+; DUMP:                 Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP:                 Edge from Callee [[ENEW2ORIG:0x[a-z0-9]+]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP:         CallerEdges:
+
+; DUMP: Node [[C]]
+; DUMP: 	  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1)	(clone 0)
+; DUMP: 	AllocTypes: NotColdCold
+; DUMP: 	ContextIds: 2 5
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[ENEW1ORIG]] to Caller: [[C]] AllocTypes: NotCold ContextIds: 2
+; DUMP: 		Edge from Callee [[ENEW2CLONE:0x[a-z0-9]+]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[B]]
+; DUMP: 	  call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1)	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 3 4
+; DUMP: 	CalleeEdges:
+; DUMP: 		Edge from Callee [[ENEW1ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 3
+; DUMP: 		Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: 	CallerEdges:
+
+; DUMP: Node [[ENEW2ORIG]]
+; DUMP: 	  %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6	(clone 0)
+; DUMP: 	AllocTypes: NotCold
+; DUMP: 	ContextIds: 4 6
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[ENEW2ORIG]] to Caller: [[B]] AllocTypes: NotCold ContextIds: 4
+; DUMP: 		Edge from Callee [[ENEW2ORIG]] to Caller: [[D]] AllocTypes: NotCold ContextIds: 6
+; DUMP: 	Clones: [[ENEW2CLONE]]
+
+; DUMP: Node [[ENEW1CLONE]]
+; DUMP: 	  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 1
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[ENEW1CLONE]] to Caller: [[D]] AllocTypes: Cold ContextIds: 1
+; DUMP: 	Clone of [[ENEW1ORIG]]
+
+; DUMP: Node [[ENEW2CLONE]]
+; DUMP: 	  %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6	(clone 0)
+; DUMP: 	AllocTypes: Cold
+; DUMP: 	ContextIds: 5
+; DUMP: 	CalleeEdges:
+; DUMP: 	CallerEdges:
+; DUMP: 		Edge from Callee [[ENEW2CLONE]] to Caller: [[C]] AllocTypes: Cold ContextIds: 5
+; DUMP: 	Clone of [[ENEW2ORIG]]
+
+
+;; We greedily create a clone of E that is initially used by the clones of the
+;; first call to new. However, we end up with an incompatible set of callers
+;; given the second call to new, which has clones with a different combination
+;; of callers. Eventually, we create 2 more clones, and the first clone becomes
+;; dead.
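+;;
+;; As a rough source-level sketch (illustrative only: the C++ names here are
+;; hypothetical, and the authoritative checks are the IR lines further down),
+;; the surviving clones of E behave as if:
+;;
+;;   // Original E, kept for B: both allocations notcold.
+;;   void E(char **buf1, char **buf2) {
+;;     *buf1 = new char[10]; // "memprof"="notcold"
+;;     *buf2 = new char[10]; // "memprof"="notcold"
+;;   }
+;;   // Clone called from D (_Z1EPPcS0_.memprof.2): first new is cold.
+;;   void E_memprof_2(char **buf1, char **buf2) {
+;;     *buf1 = new char[10]; // "memprof"="cold"
+;;     *buf2 = new char[10]; // "memprof"="notcold"
+;;   }
+;;   // Clone called from C (_Z1EPPcS0_.memprof.3): second new is cold.
+;;   void E_memprof_3(char **buf1, char **buf2) {
+;;     *buf1 = new char[10]; // "memprof"="notcold"
+;;     *buf2 = new char[10]; // "memprof"="cold"
+;;   }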
+; REMARKS: created clone _Z1EPPcS0_.memprof.1
+; REMARKS: created clone _Z1EPPcS0_.memprof.2
+; REMARKS: created clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1BPPcS0_ assigned to call function clone _Z1EPPcS0_
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+
+
+;; Original version of E is used for the non-cold allocations, both from B.
+; IR: define internal {{.*}} @_Z1EPPcS0_(
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1BPPcS0_(
+; IR:   call {{.*}} @_Z1EPPcS0_(
+;; C calls a clone of E with the first new allocating non-cold memory and the
+;; second allocating cold memory.
+; IR: define internal {{.*}} @_Z1CPPcS0_(
+; IR:   call {{.*}} @_Z1EPPcS0_.memprof.3(
+;; D calls a clone of E with the first new allocating cold memory and the
+;; second allocating non-cold memory.
+; IR: define internal {{.*}} @_Z1DPPcS0_(
+; IR:   call {{.*}} @_Z1EPPcS0_.memprof.2(
+;; Transient clone that will get removed as it ends up with no callers.
+;; Its calls to new never get updated with a memprof attribute as a result.
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.1(
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT:[0-9]+]]
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[DEFAULT]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2(
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3(
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[DEFAULT]] = { builtin }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
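+;;
+;; (These counts line up with the remarks and IR above: 2 cold news, one in
+;; each of .memprof.2 and .memprof.3; 4 notcold news, two in the original E
+;; plus one in each of those clones, with the dead .memprof.1 not counted;
+;; and 3 clones of E created.)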

diff  --git a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
index f28435f7ee3a6..5daaa966b0b54 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/indirectcall.ll
@@ -1,7 +1,7 @@
 ;; Tests callsite context graph generation for call graph containing indirect
 ;; calls. Currently this should result in conservative behavior, such that the
 ;; indirect call receives a null call in its graph node, to prevent subsequent
-;; cloning.
+;; cloning. Also tests graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -54,7 +54,9 @@
 ; RUN: opt -passes=memprof-context-disambiguation \
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN:  --check-prefix=STATS --check-prefix=REMARKS
 
 ; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should only create a single clone of foo, for the direct call
@@ -340,6 +342,41 @@ attributes #7 = { builtin }
 ; DUMP:		Clone of [[FOO]]
 
 
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _ZN1A1xEv assigned to call function clone _Z3foov
+; REMARKS: call in clone _ZN1B1xEv assigned to call function clone _Z3foov
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold
+
+
+; IR: define {{.*}} @main(
+; IR:   call {{.*}} @_Z3foov()
+;; Only the second call to foo, which allocates cold memory via direct calls,
+;; is replaced with a call to a clone that calls a cold allocation.
+; IR:   call {{.*}} @_Z3foov.memprof.1()
+; IR:   call {{.*}} @_Z3barP1A(
+; IR:   call {{.*}} @_Z3barP1A(
+; IR:   call {{.*}} @_Z3barP1A(
+; IR:   call {{.*}} @_Z3barP1A(
+; IR: define internal {{.*}} @_ZN1A1xEv(
+; IR:   call {{.*}} @_Z3foov()
+; IR: define internal {{.*}} @_ZN1B1xEv(
+; IR:   call {{.*}} @_Z3foov()
+; IR: define internal {{.*}} @_Z3foov()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
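+;;
+;; (The calls reached through _Z3barP1A stay on the original _Z3foov: the
+;; indirect call in bar conservatively blocks cloning, so only main's direct
+;; cold call is redirected to the clone.)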
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: 	label="postbuild";
 ; DOT: 	Node[[FOO:0x[a-z0-9]+]] [shape=record,tooltip="N[[FOO]] ContextIds: 1 2 3 4 5 6",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3foov -\> _Znam}"];

diff  --git a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
index 81f52638ee935..208fa19bf3d4c 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/inlined.ll
@@ -1,6 +1,7 @@
 ;; Test callsite context graph generation for call graph with two memprof
 ;; contexts and partial inlining, requiring generation of a new fused node to
 ;; represent the inlined sequence while matching callsite nodes onto the graph.
+;; Also tests graph and IR cloning.
 ;;
 ;; Original code looks like:
 ;;
@@ -43,7 +44,9 @@
 ; RUN: opt -passes=memprof-context-disambiguation \
 ; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
-; RUN:	%s -S 2>&1 | FileCheck %s --check-prefix=DUMP
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:	%s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
+; RUN:  --check-prefix=STATS --check-prefix=REMARKS
 
 ; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should create clones for foo and bar for the call from main to allocate
@@ -251,6 +254,42 @@ attributes #7 = { builtin }
 ; DUMP:         Clone of [[BAR]]
 
 
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone main assigned to call function clone _Z3foov
+; REMARKS: call in clone _Z3foov assigned to call function clone _Z3barv
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3bazv marked with memprof allocation attribute notcold
+
+
+; IR: define internal {{.*}} @_Z3barv()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov()
+; IR:   call {{.*}} @_Z3barv()
+; IR: define {{.*}} @main()
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR:   call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR:   call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR:   call {{.*}} @_Z3barv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { builtin "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { builtin "memprof"="cold" }
+
+
+; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
+
+
 ; DOT: digraph "postbuild" {
 ; DOT: 	label="postbuild";
 ; DOT: 	Node[[BAR:0x[a-z0-9]+]] [shape=record,tooltip="N[[BAR]] ContextIds: 1 2",fillcolor="mediumorchid1",style="filled",style="filled",label="{OrigId: Alloc0\n_Z3barv -\> _Znam}"];


        

