[llvm] cfad2d3 - [MemProf] Context disambiguation cloning pass [patch 4/4]

Teresa Johnson via llvm-commits llvm-commits at lists.llvm.org
Fri May 5 16:26:47 PDT 2023


Author: Teresa Johnson
Date: 2023-05-05T16:26:32-07:00
New Revision: cfad2d3a3d62fee089ad2ac1e87029bb3d00f17f

URL: https://github.com/llvm/llvm-project/commit/cfad2d3a3d62fee089ad2ac1e87029bb3d00f17f
DIFF: https://github.com/llvm/llvm-project/commit/cfad2d3a3d62fee089ad2ac1e87029bb3d00f17f.diff

LOG: [MemProf] Context disambiguation cloning pass [patch 4/4]

Applies ThinLTO cloning decisions made during the thin link and
recorded in the summary index to the IR during the ThinLTO backend.

Depends on D141077.

Differential Revision: https://reviews.llvm.org/D149117
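
For reference, the distributed-backend handling added here can be exercised
directly via opt, as the updated tests below do (the index and module paths
are placeholders for the distributed index and bitcode produced by the thin
link):

  opt -passes=memprof-context-disambiguation \
      -memprof-import-summary=<module>.thinlto.bc \
      -stats -pass-remarks=memprof-context-disambiguation \
      <module>.o -S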

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
    llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
    llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
    llvm/lib/Passes/PassBuilderPipelines.cpp
    llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
    llvm/test/ThinLTO/X86/memprof-basic.ll
    llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
    llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
    llvm/test/ThinLTO/X86/memprof-indirectcall.ll
    llvm/test/ThinLTO/X86/memprof-inlined.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
index 9a809171e8709..e36dea58cec45 100644
--- a/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
+++ b/llvm/include/llvm/Analysis/ModuleSummaryAnalysis.h
@@ -99,6 +99,10 @@ class ImmutableModuleSummaryIndexWrapperPass : public ImmutablePass {
 ImmutablePass *
 createImmutableModuleSummaryIndexWrapperPass(const ModuleSummaryIndex *Index);
 
+/// Returns true if the instruction could have memprof metadata, used to ensure
+/// consistency between summary analysis and the ThinLTO backend processing.
+bool mayHaveMemprofSummary(const CallBase *CB);
+
 } // end namespace llvm
 
 #endif // LLVM_ANALYSIS_MODULESUMMARYANALYSIS_H

diff --git a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
index 13f3a7eb7ce3f..f4c20a5749f0b 100644
--- a/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
+++ b/llvm/include/llvm/Transforms/IPO/MemProfContextDisambiguation.h
@@ -18,13 +18,13 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/IR/PassManager.h"
 #include <functional>
 
 namespace llvm {
 class GlobalValueSummary;
 class Module;
-class ModuleSummaryIndex;
 class OptimizationRemarkEmitter;
 
 class MemProfContextDisambiguation
@@ -34,8 +34,19 @@ class MemProfContextDisambiguation
       Module &M,
       function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
 
+  /// In the ThinLTO backend, apply the cloning decisions in ImportSummary to
+  /// the IR.
+  bool applyImport(Module &M);
+
+  /// Import summary containing cloning decisions for the ThinLTO backend.
+  const ModuleSummaryIndex *ImportSummary;
+
+  // Owns the import summary specified by internal options for testing the
+  // ThinLTO backend via opt (to simulate distributed ThinLTO).
+  std::unique_ptr<ModuleSummaryIndex> ImportSummaryForTesting;
+
 public:
-  MemProfContextDisambiguation() {}
+  MemProfContextDisambiguation(const ModuleSummaryIndex *Summary = nullptr);
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 

diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index d84cfbfe80f56..3830edc453255 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -284,6 +284,10 @@ static void computeFunctionSummary(
   std::vector<CallsiteInfo> Callsites;
   std::vector<AllocInfo> Allocs;
 
+#ifndef NDEBUG
+  DenseSet<const CallBase *> CallsThatMayHaveMemprofSummary;
+#endif
+
   bool HasInlineAsmMaybeReferencingInternal = false;
   bool HasIndirBranchToBlockAddress = false;
   bool HasUnknownCall = false;
@@ -427,6 +431,10 @@ static void computeFunctionSummary(
               .updateHotness(getHotness(Candidate.Count, PSI));
       }
 
+      // Summarize memprof related metadata. This is only needed for ThinLTO.
+      if (!IsThinLTO)
+        continue;
+
       // TODO: Skip indirect calls for now. Need to handle these better, likely
       // by creating multiple Callsites, one per target, then speculatively
       // devirtualize while applying clone info in the ThinLTO backends. This
@@ -437,6 +445,14 @@ static void computeFunctionSummary(
       if (!CalledFunction)
         continue;
 
+      // Ensure we keep this analysis in sync with the handling in the ThinLTO
+      // backend (see MemProfContextDisambiguation::applyImport). Save this call
+      // so that we can skip it in checking the reverse case later.
+      assert(mayHaveMemprofSummary(CB));
+#ifndef NDEBUG
+      CallsThatMayHaveMemprofSummary.insert(CB);
+#endif
+
       // Compute the list of stack ids first (so we can trim them from the stack
       // ids on any MIBs).
       CallStack<MDNode, MDNode::op_iterator> InstCallsite(
@@ -546,6 +562,25 @@ static void computeFunctionSummary(
             ? CalleeInfo::HotnessType::Cold
             : CalleeInfo::HotnessType::Critical);
 
+#ifndef NDEBUG
+  // Make sure that mayHaveMemprofSummary returns false for all calls we
+  // decided above could not have memprof summaries, to ensure that this
+  // handling remains in sync with the ThinLTO backend handling.
+  if (IsThinLTO) {
+    for (const BasicBlock &BB : F) {
+      for (const Instruction &I : BB) {
+        const auto *CB = dyn_cast<CallBase>(&I);
+        if (!CB)
+          continue;
+        // We already checked these above.
+        if (CallsThatMayHaveMemprofSummary.count(CB))
+          continue;
+        assert(!mayHaveMemprofSummary(CB));
+      }
+    }
+  }
+#endif
+
   bool NonRenamableLocal = isNonRenamableLocal(F);
   bool NotEligibleForImport = NonRenamableLocal ||
                               HasInlineAsmMaybeReferencingInternal ||
@@ -1042,3 +1077,36 @@ ImmutablePass *llvm::createImmutableModuleSummaryIndexWrapperPass(
 
 INITIALIZE_PASS(ImmutableModuleSummaryIndexWrapperPass, "module-summary-info",
                 "Module summary info", false, true)
+
+bool llvm::mayHaveMemprofSummary(const CallBase *CB) {
+  if (!CB)
+    return false;
+  if (CB->isDebugOrPseudoInst())
+    return false;
+  auto *CI = dyn_cast<CallInst>(CB);
+  auto *CalledValue = CB->getCalledOperand();
+  auto *CalledFunction = CB->getCalledFunction();
+  if (CalledValue && !CalledFunction) {
+    CalledValue = CalledValue->stripPointerCasts();
+    // Stripping pointer casts can reveal a called function.
+    CalledFunction = dyn_cast<Function>(CalledValue);
+  }
+  // Check if this is an alias to a function. If so, get the
+  // called aliasee for the checks below.
+  if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
+    assert(!CalledFunction &&
+           "Expected null called function in callsite for alias");
+    CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
+  }
+  // Check if this is a direct call to a known function or a known
+  // intrinsic, or an indirect call with profile data.
+  if (CalledFunction) {
+    if (CI && CalledFunction->isIntrinsic())
+      return false;
+  } else {
+    // TODO: For now skip indirect calls. See comments in
+    // computeFunctionSummary for what is needed to handle this.
+    return false;
+  }
+  return true;
+}
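
As a rough illustration (not part of the patch), both computeFunctionSummary
above and the backend's applyImport walk a function's instructions and gate
the memprof handling on this shared predicate, which is exactly what the
NDEBUG checks are verifying:

  // Sketch of the shared filtering pattern; assumes an llvm::Function F in
  // scope and the usual LLVM headers and namespace.
  for (const BasicBlock &BB : F) {
    for (const Instruction &I : BB) {
      const auto *CB = dyn_cast<CallBase>(&I);
      if (!mayHaveMemprofSummary(CB))
        continue; // Skipped identically by summary analysis and the backend.
      // ...consume the next alloc/callsite summary record for this call...
    }
  }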

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 3f929906313ab..66b49da572075 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1531,6 +1531,11 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
   ModulePassManager MPM;
 
   if (ImportSummary) {
+    // For ThinLTO we must apply the context disambiguation decisions early, to
+    // ensure we can correctly match the callsites to summary data.
+    if (EnableMemProfContextDisambiguation)
+      MPM.addPass(MemProfContextDisambiguation(ImportSummary));
+
     // These passes import type identifier resolutions for whole-program
     // devirtualization and CFI. They must run early because other passes may
     // disturb the specific instruction patterns that these passes look for,
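
A minimal sketch of what this adds to the backend pipeline, assuming a Module
M, a ModuleAnalysisManager MAM, and the distributed or combined index for the
module (ImportSummary); this mirrors the code above rather than introducing
new API:

  ModulePassManager MPM;
  if (EnableMemProfContextDisambiguation)
    // ImportSummary is a const ModuleSummaryIndex *; passing nullptr instead
    // selects the whole-program (regular LTO) mode of the pass.
    MPM.addPass(MemProfContextDisambiguation(ImportSummary));
  MPM.run(M, MAM);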

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index d9a433e79b1be..44aea71c31925 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
@@ -52,10 +53,30 @@ using namespace llvm::memprof;
 
 STATISTIC(FunctionClonesAnalysis,
           "Number of function clones created during whole program analysis");
+STATISTIC(FunctionClonesThinBackend,
+          "Number of function clones created during ThinLTO backend");
+STATISTIC(FunctionsClonedThinBackend,
+          "Number of functions that had clones created during ThinLTO backend");
 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                             "cloned) during whole program analysis");
 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                          "during whole program analysis");
+STATISTIC(AllocTypeNotColdThinBackend,
+          "Number of not cold static allocations (possibly cloned) during "
+          "ThinLTO backend");
+STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
+                                    "(possibly cloned) during ThinLTO backend");
+STATISTIC(OrigAllocsThinBackend,
+          "Number of original (not cloned) allocations with memprof profiles "
+          "during ThinLTO backend");
+STATISTIC(
+    AllocVersionsThinBackend,
+    "Number of allocation versions (including clones) during ThinLTO backend");
+STATISTIC(MaxAllocVersionsThinBackend,
+          "Maximum number of allocation versions created for an original "
+          "allocation during ThinLTO backend");
+STATISTIC(UnclonableAllocsThinBackend,
+          "Number of unclonable ambigous allocations during ThinLTO backend");
 
 static cl::opt<std::string> DotFilePathPrefix(
     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
@@ -78,6 +99,11 @@ static cl::opt<bool>
     VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                 cl::desc("Perform frequent verification checks on nodes."));
 
+static cl::opt<std::string> MemProfImportSummary(
+    "memprof-import-summary",
+    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
+    cl::Hidden);
+
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
 ///
 /// The graph represents the call contexts in all memprof metadata on allocation
@@ -109,8 +135,8 @@ class CallsiteContextGraph {
   /// Assign callsite clones to functions, cloning functions as needed to
   /// accommodate the combinations of their callsite clones reached by callers.
   /// For regular LTO this clones functions and callsites in the IR, but for
-  /// ThinLTO the cloning decisions are noted in the summaries and applied
-  /// later.
+  /// ThinLTO the cloning decisions are noted in the summaries and later applied
+  /// in applyImport.
   bool assignFunctions();
 
   void dump() const;
@@ -2779,6 +2805,358 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
   return Changed;
 }
 
+static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
+    Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
+    std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
+        &FuncToAliasMap) {
+  // The first "clone" is the original copy, we should only call this if we
+  // needed to create new clones.
+  assert(NumClones > 1);
+  SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+  VMaps.reserve(NumClones - 1);
+  FunctionsClonedThinBackend++;
+  for (unsigned I = 1; I < NumClones; I++) {
+    VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
+    auto *NewF = CloneFunction(&F, *VMaps.back());
+    FunctionClonesThinBackend++;
+    // Strip memprof and callsite metadata from clone as they are no longer
+    // needed.
+    for (auto &BB : *NewF) {
+      for (auto &Inst : BB) {
+        Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
+        Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
+      }
+    }
+    std::string Name = getMemProfFuncName(F.getName(), I);
+    auto *PrevF = M.getFunction(Name);
+    if (PrevF) {
+      // We might have created this when adjusting a callsite in another
+      // function. It should be a declaration.
+      assert(PrevF->isDeclaration());
+      NewF->takeName(PrevF);
+      PrevF->replaceAllUsesWith(NewF);
+      PrevF->eraseFromParent();
+    } else
+      NewF->setName(Name);
+    ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
+             << "created clone " << ore::NV("NewFunction", NewF));
+
+    // Now handle aliases to this function, and clone those as well.
+    if (!FuncToAliasMap.count(&F))
+      continue;
+    for (auto *A : FuncToAliasMap[&F]) {
+      std::string Name = getMemProfFuncName(A->getName(), I);
+      auto *PrevA = M.getNamedAlias(Name);
+      auto *NewA = GlobalAlias::create(A->getValueType(),
+                                       A->getType()->getPointerAddressSpace(),
+                                       A->getLinkage(), Name, NewF);
+      NewA->copyAttributesFrom(A);
+      if (PrevA) {
+        // We might have created this when adjusting a callsite in another
+        // function. It should be a declaration.
+        assert(PrevA->isDeclaration());
+        NewA->takeName(PrevA);
+        PrevA->replaceAllUsesWith(NewA);
+        PrevA->eraseFromParent();
+      }
+    }
+  }
+  return VMaps;
+}
+
+// Locate the summary for F. This is complicated by the fact that it might
+// have been internalized or promoted.
+static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
+                                      const ModuleSummaryIndex *ImportSummary) {
+  // FIXME: Ideally we would retain the original GUID in some fashion on the
+  // function (e.g. as metadata), but for now do our best to locate the
+  // summary without that information.
+  ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
+  if (!TheFnVI)
+    // See if the function was internalized, by checking the index directly
+    // with its original name (this avoids the name adjustment done by
+    // getGUID() for internal symbols).
+    TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
+  if (TheFnVI)
+    return TheFnVI;
+  // Now query with the original name before any promotion was performed.
+  StringRef OrigName =
+      ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
+  std::string OrigId = GlobalValue::getGlobalIdentifier(
+      OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
+  TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
+  if (TheFnVI)
+    return TheFnVI;
+  // Could be a promoted local imported from another module. We need to pass
+  // down more info here to find the original module id. For now, try with
+  // the OrigName which might have been stored in the OidGuidMap in the
+  // index. This would not work if there were same-named locals in multiple
+  // modules, however.
+  auto OrigGUID =
+      ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
+  if (OrigGUID)
+    TheFnVI = ImportSummary->getValueInfo(OrigGUID);
+  return TheFnVI;
+}
+
+bool MemProfContextDisambiguation::applyImport(Module &M) {
+  assert(ImportSummary);
+  bool Changed = false;
+
+  auto IsMemProfClone = [](const Function &F) {
+    return F.getName().contains(MemProfCloneSuffix);
+  };
+
+  // We also need to clone any aliases that reference cloned functions, because
+  // the modified callsites may invoke via the alias. Keep track of the aliases
+  // for each function.
+  std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
+      FuncToAliasMap;
+  for (auto &A : M.aliases()) {
+    auto *Aliasee = A.getAliaseeObject();
+    if (auto *F = dyn_cast<Function>(Aliasee))
+      FuncToAliasMap[F].insert(&A);
+  }
+
+  for (auto &F : M) {
+    if (F.isDeclaration() || IsMemProfClone(F))
+      continue;
+
+    OptimizationRemarkEmitter ORE(&F);
+
+    SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+    bool ClonesCreated = false;
+    unsigned NumClonesCreated = 0;
+    auto CloneFuncIfNeeded = [&](unsigned NumClones) {
+      // We should at least have version 0 which is the original copy.
+      assert(NumClones > 0);
+      // If only one copy is needed, use the original.
+      if (NumClones == 1)
+        return;
+      // If we already performed cloning of this function, confirm that the
+      // requested number of clones matches (the thin link should ensure the
+      // number of clones for each constituent callsite is consistent within
+      // each function), before returning.
+      if (ClonesCreated) {
+        assert(NumClonesCreated == NumClones);
+        return;
+      }
+      VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
+      // The first "clone" is the original copy, which doesn't have a VMap.
+      assert(VMaps.size() == NumClones - 1);
+      Changed = true;
+      ClonesCreated = true;
+      NumClonesCreated = NumClones;
+    };
+
+    // Locate the summary for F.
+    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
+    // If not found, this could be an imported local (see comment in
+    // findValueInfoForFunc). Skip for now as it will be cloned in its original
+    // module (where it would have been promoted to global scope, so it
+    // should satisfy any reference in this module).
+    if (!TheFnVI)
+      continue;
+
+    auto *GVSummary =
+        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
+    if (!GVSummary)
+      // Must have been imported; use the first summary (there might be
+      // multiple if this was a linkonce_odr).
+      GVSummary = TheFnVI.getSummaryList().front().get();
+
+    // If this was an imported alias skip it as we won't have the function
+    // summary, and it should be cloned in the original module.
+    if (isa<AliasSummary>(GVSummary))
+      continue;
+
+    auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
+
+    if (FS->allocs().empty() && FS->callsites().empty())
+      continue;
+
+    auto SI = FS->callsites().begin();
+    auto AI = FS->allocs().begin();
+
+    // Assume for now that the instructions are in the exact same order
+    // as when the summary was created, but confirm this is correct by
+    // matching the stack ids.
+    for (auto &BB : F) {
+      for (auto &I : BB) {
+        auto *CB = dyn_cast<CallBase>(&I);
+        // Same handling as when creating module summary.
+        if (!mayHaveMemprofSummary(CB))
+          continue;
+
+        CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
+            I.getMetadata(LLVMContext::MD_callsite));
+        auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
+
+        // Include allocs that were already assigned a memprof function
+        // attribute in the statistics.
+        if (CB->getAttributes().hasFnAttr("memprof")) {
+          assert(!MemProfMD);
+          CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
+              ? AllocTypeColdThinBackend++
+              : AllocTypeNotColdThinBackend++;
+          OrigAllocsThinBackend++;
+          AllocVersionsThinBackend++;
+          if (!MaxAllocVersionsThinBackend)
+            MaxAllocVersionsThinBackend = 1;
+          // Remove any remaining callsite metadata; we can then skip the rest
+          // of the handling for this instruction, since no cloning is needed.
+          I.setMetadata(LLVMContext::MD_callsite, nullptr);
+          continue;
+        }
+
+        if (MemProfMD) {
+          // Consult the next alloc node.
+          assert(AI != FS->allocs().end());
+          auto &AllocNode = *(AI++);
+
+          // Sanity check that the MIB stack ids match between the summary and
+          // instruction metadata.
+          auto MIBIter = AllocNode.MIBs.begin();
+          for (auto &MDOp : MemProfMD->operands()) {
+            assert(MIBIter != AllocNode.MIBs.end());
+            auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
+            auto *MIBMD = cast<const MDNode>(MDOp);
+            MDNode *StackMDNode = getMIBStackNode(MIBMD);
+            assert(StackMDNode);
+            SmallVector<unsigned> StackIdsFromMetadata;
+            CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
+            for (auto ContextIter =
+                     StackContext.beginAfterSharedPrefix(CallsiteContext);
+                 ContextIter != StackContext.end(); ++ContextIter) {
+              // If this is a direct recursion, simply skip the duplicate
+              // entries, to be consistent with how the summary ids were
+              // generated during ModuleSummaryAnalysis.
+              if (!StackIdsFromMetadata.empty() &&
+                  StackIdsFromMetadata.back() == *ContextIter)
+                continue;
+              assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
+              assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
+                     *ContextIter);
+              StackIdIndexIter++;
+            }
+            MIBIter++;
+          }
+
+          // Perform cloning if not yet done.
+          CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
+
+          OrigAllocsThinBackend++;
+          AllocVersionsThinBackend += AllocNode.Versions.size();
+          if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
+            MaxAllocVersionsThinBackend = AllocNode.Versions.size();
+
+          // If there is only one version, that means we didn't end up
+          // considering this function for cloning, and in that case the alloc
+          // will still be None type or should have gotten the default NotCold.
+          // We only skip it after calling the clone helper, since that helper
+          // does some sanity checks confirming we haven't yet decided that we
+          // need cloning.
+          if (AllocNode.Versions.size() == 1) {
+            assert((AllocationType)AllocNode.Versions[0] ==
+                       AllocationType::NotCold ||
+                   (AllocationType)AllocNode.Versions[0] ==
+                       AllocationType::None);
+            UnclonableAllocsThinBackend++;
+            continue;
+          }
+
+          // All versions should have a singular allocation type.
+          assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
+            return Type == ((uint8_t)AllocationType::NotCold |
+                            (uint8_t)AllocationType::Cold);
+          }));
+
+          // Update the allocation types per the summary info.
+          for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
+            // Ignore any that didn't get an assigned allocation type.
+            if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
+              continue;
+            AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
+            AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
+                                            : AllocTypeNotColdThinBackend++;
+            std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
+            auto A = llvm::Attribute::get(F.getContext(), "memprof",
+                                          AllocTypeString);
+            CallBase *CBClone;
+            // Copy 0 is the original function.
+            if (!J)
+              CBClone = CB;
+            else
+              // Since VMaps are only created for new clones, we index with
+              // clone J-1 (J==0 is the original clone and does not have a VMaps
+              // entry).
+              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+            CBClone->addFnAttr(A);
+            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
+                     << ore::NV("AllocationCall", CBClone) << " in clone "
+                     << ore::NV("Caller", CBClone->getFunction())
+                     << " marked with memprof allocation attribute "
+                     << ore::NV("Attribute", AllocTypeString));
+          }
+        } else if (!CallsiteContext.empty()) {
+          // Consult the next callsite node.
+          assert(SI != FS->callsites().end());
+          auto &StackNode = *(SI++);
+
+#ifndef NDEBUG
+          // Sanity check that the stack ids match between the summary and
+          // instruction metadata.
+          auto StackIdIndexIter = StackNode.StackIdIndices.begin();
+          for (auto StackId : CallsiteContext) {
+            assert(StackIdIndexIter != StackNode.StackIdIndices.end());
+            assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
+                   StackId);
+            StackIdIndexIter++;
+          }
+#endif
+
+          // Perform cloning if not yet done.
+          CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
+
+          // Should have skipped indirect calls via mayHaveMemprofSummary.
+          assert(CB->getCalledFunction());
+          assert(!IsMemProfClone(*CB->getCalledFunction()));
+
+          // Update the calls per the summary info.
+          // Save orig name since it gets updated in the first iteration
+          // below.
+          auto CalleeOrigName = CB->getCalledFunction()->getName();
+          for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
+            // Do nothing if this version calls the original version of its
+            // callee.
+            if (!StackNode.Clones[J])
+              continue;
+            auto NewF = M.getOrInsertFunction(
+                getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
+                CB->getCalledFunction()->getFunctionType());
+            CallBase *CBClone;
+            // Copy 0 is the original function.
+            if (!J)
+              CBClone = CB;
+            else
+              CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
+            CBClone->setCalledFunction(NewF);
+            ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
+                     << ore::NV("Call", CBClone) << " in clone "
+                     << ore::NV("Caller", CBClone->getFunction())
+                     << " assigned to call function clone "
+                     << ore::NV("Callee", NewF.getCallee()));
+          }
+        }
+        // Memprof and callsite metadata on memory allocations are no longer
+        // needed.
+        I.setMetadata(LLVMContext::MD_memprof, nullptr);
+        I.setMetadata(LLVMContext::MD_callsite, nullptr);
+      }
+    }
+  }
+
+  return Changed;
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
   if (DumpCCG) {
@@ -2820,12 +3198,46 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
 bool MemProfContextDisambiguation::processModule(
     Module &M,
     function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
-  bool Changed = false;
+
+  // If we have an import summary, then the cloning decisions were made during
+  // the thin link on the index. Apply them and return.
+  if (ImportSummary)
+    return applyImport(M);
 
   ModuleCallsiteContextGraph CCG(M, OREGetter);
-  Changed = CCG.process();
+  return CCG.process();
+}
 
-  return Changed;
+MemProfContextDisambiguation::MemProfContextDisambiguation(
+    const ModuleSummaryIndex *Summary)
+    : ImportSummary(Summary) {
+  if (ImportSummary) {
+    // The MemProfImportSummary should only be used for testing ThinLTO
+    // distributed backend handling via opt, in which case we don't have a
+    // summary from the pass pipeline.
+    assert(MemProfImportSummary.empty());
+    return;
+  }
+  if (MemProfImportSummary.empty())
+    return;
+
+  auto ReadSummaryFile =
+      errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
+  if (!ReadSummaryFile) {
+    logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
+                          "Error loading file '" + MemProfImportSummary +
+                              "': ");
+    return;
+  }
+  auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
+  if (!ImportSummaryForTestingOrErr) {
+    logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
+                          "Error parsing file '" + MemProfImportSummary +
+                              "': ");
+    return;
+  }
+  ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
+  ImportSummary = ImportSummaryForTesting.get();
 }
 
 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,

diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index 11378cf5bef47..ac3119fff2ae5 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -44,12 +44,14 @@
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:	-stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:	--check-prefix=STATS
+; RUN:	--check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
 
 ; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
 ; RUN:	cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
 
 ;; Try again but with distributed ThinLTO
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -71,11 +73,18 @@
 ;; Check distributed index
 ; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:	-memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS
+
 source_filename = "memprof-basic.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !0
   %call1 = call ptr @_Z3foov(), !callsite !1
@@ -86,7 +95,7 @@ declare void @_ZdaPv()
 
 declare i32 @sleep()
 
-define internal ptr @_Z3barv() {
+define internal ptr @_Z3barv() #0 {
 entry:
   %call = call ptr @_Znam(i64 0), !memprof !2, !callsite !7
   ret ptr null
@@ -94,13 +103,13 @@ entry:
 
 declare ptr @_Znam(i64)
 
-define internal ptr @_Z3bazv() {
+define internal ptr @_Z3bazv() #0 {
 entry:
   %call = call ptr @_Z3barv(), !callsite !8
   ret ptr null
 }
 
-define internal ptr @_Z3foov() {
+define internal ptr @_Z3foov() #0 {
 entry:
   %call = call ptr @_Z3bazv(), !callsite !9
   ret ptr null
@@ -109,6 +118,8 @@ entry:
 ; uselistorder directives
 uselistorder ptr @_Z3foov, { 1, 0 }
 
+attributes #0 = { noinline optnone }
+
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
@@ -252,9 +263,50 @@ uselistorder ptr @_Z3foov, { 1, 0 }
 ; DUMP:		Clone of [[BAR]]
 
 
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _Z3bazv.memprof.1
+; REMARKS: call in clone _Z3bazv.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3bazv.memprof.1
+
+
+; IR: define {{.*}} @main
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR:   call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR:   call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv()
+; IR:   call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv()
+; IR:   call {{.*}} @_Z3barv()
+; IR: define internal {{.*}} @_Z3foov()
+; IR:   call {{.*}} @_Z3bazv()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3bazv.memprof.1()
+; IR:   call {{.*}} @_Z3barv.memprof.1()
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR:   call {{.*}} @_Z3bazv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+
 ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 3 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
 
 
 ; DOT: digraph "postbuild" {

diff --git a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
index 7f7447eaf58e4..55aab19241084 100644
--- a/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
+++ b/llvm/test/ThinLTO/X86/memprof-duplicate-context-ids.ll
@@ -66,13 +66,15 @@
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
 
 ; RUN:  cat %t.ccg.prestackupdate.dot | FileCheck %s --check-prefix=DOTPRE
 ; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOTPOST
 ;; We should clone D once for the cold allocations via C.
 ; RUN:  cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
 
 ;; Try again but with distributed ThinLTO
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -95,11 +97,18 @@
 ;; Check distributed index
 ; RUN: llvm-dis %t.o.thinlto.bc -o - | FileCheck %s --check-prefix=DISTRIB
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS
+
 source_filename = "duplicate-context-ids.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define internal ptr @_Z1Dv() {
+define internal ptr @_Z1Dv() #0 {
 entry:
   %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
   ret ptr null
@@ -107,31 +116,31 @@ entry:
 
 declare ptr @_Znam(i64)
 
-define internal ptr @_Z1Fv() {
+define internal ptr @_Z1Fv() #0 {
 entry:
   %call = call ptr @_Z1Dv(), !callsite !6
   ret ptr null
 }
 
-define internal ptr @_Z1Cv() {
+define internal ptr @_Z1Cv() #0 {
 entry:
   %call = call ptr @_Z1Dv(), !callsite !7
   ret ptr null
 }
 
-define internal ptr @_Z1Bv() {
+define internal ptr @_Z1Bv() #0 {
 entry:
   %call.i = call ptr @_Z1Dv(), !callsite !8
   ret ptr null
 }
 
-define internal ptr @_Z1Ev() {
+define internal ptr @_Z1Ev() #0 {
 entry:
   %call.i = call ptr @_Z1Dv(), !callsite !9
   ret ptr null
 }
 
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   call ptr @_Z1Bv()
   call ptr @_Z1Ev()
@@ -143,6 +152,8 @@ declare void @_ZdaPv()
 
 declare i32 @sleep()
 
+attributes #0 = { noinline optnone }
+
 !0 = !{!1, !3}
 !1 = !{!2, !"cold"}
 !2 = !{i64 6541423618768552252, i64 -6270142974039008131}
@@ -300,10 +311,43 @@ declare i32 @sleep()
 ; DUMP: 		Edge from Callee [[D2]] to Caller: [[B:0x[a-z0-9]+]] AllocTypes: Cold ContextIds: 4
 ; DUMP:         Clone of [[D]]
 
+; REMARKS: created clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Dv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1Dv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1Bv assigned to call function clone _Z1Dv.memprof.1
+; REMARKS: call in clone _Z1Ev assigned to call function clone _Z1Dv.memprof.1
+
+
+;; The allocation via F does not allocate cold memory. It should call the
+;; original D, which ultimately calls the original allocation decorated
+;; with a "notcold" attribute.
+; IR: define internal {{.*}} @_Z1Dv()
+; IR:   call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z1Fv()
+; IR:   call {{.*}} @_Z1Dv()
+;; The allocations via B and E allocate cold memory. They should call the
+;; cloned D, which ultimately calls the cloned allocation decorated with a
+;; "cold" attribute.
+; IR: define internal {{.*}} @_Z1Bv()
+; IR:   call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Ev()
+; IR:   call {{.*}} @_Z1Dv.memprof.1()
+; IR: define internal {{.*}} @_Z1Dv.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
 
 ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 1 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
 
 
 ; DOTPRE: digraph "prestackupdate" {

diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
index 54aad0dc94ac0..25e974e98f45c 100644
--- a/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
+++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning.ll
@@ -58,7 +58,9 @@
 ; RUN:  -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
+
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
 
 
 ;; Try again but with distributed ThinLTO
@@ -73,13 +75,20 @@
 ; RUN:  -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
 ; RUN:  --check-prefix=STATS
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS
+
 
 source_filename = "funcassigncloning.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; Function Attrs: noinline optnone
-define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) {
+define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 {
 entry:
   %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7
   %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15
@@ -107,7 +116,7 @@ entry:
 }
 
 ; Function Attrs: noinline optnone
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   call void @_Z1BPPcS0_()
   call void @_Z1CPPcS0_()
@@ -122,6 +131,8 @@ declare i32 @sleep()
 ; uselistorder directives
 uselistorder ptr @_Znam, { 1, 0 }
 
+attributes #0 = { noinline optnone }
+
 !0 = !{!1, !3, !5}
 !1 = !{!2, !"cold"}
 !2 = !{i64 -3461278137325233666, i64 -7799663586031895603}
@@ -230,6 +241,54 @@ uselistorder ptr @_Znam, { 1, 0 }
 ; DUMP: 	Clone of [[ENEW2ORIG]]
 
 
+;; We greedily create a clone of E that is initially used by the clones of the
+;; first call to new. However, we end up with an incompatible set of callers
+;; given the second call to new which has clones with a 
diff erent combination of
+;; callers. Eventually, we create 2 more clones, and the first clone becomes dead.
+; REMARKS: created clone _Z1EPPcS0_.memprof.1
+; REMARKS: created clone _Z1EPPcS0_.memprof.2
+; REMARKS: created clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_ marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.2 marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z1EPPcS0_.memprof.3 marked with memprof allocation attribute cold
+; REMARKS: call in clone _Z1CPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.3
+; REMARKS: call in clone _Z1DPPcS0_ assigned to call function clone _Z1EPPcS0_.memprof.2
+
+
+;; Original version of E is used for the non-cold allocations, both from B.
+; IR: define internal {{.*}} @_Z1EPPcS0_(
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD:[0-9]+]]
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1BPPcS0_(
+; IR:   call {{.*}} @_Z1EPPcS0_(
+;; C calls a clone of E with the first new allocating cold memory and the
+;; second allocating non-cold memory.
+; IR: define internal {{.*}} @_Z1CPPcS0_(
+; IR:   call {{.*}} @_Z1EPPcS0_.memprof.3(
+;; D calls a clone of E with the first new allocating non-cold memory and the
+;; second allocating cold memory.
+; IR: define internal {{.*}} @_Z1DPPcS0_(
+; IR:   call {{.*}} @_Z1EPPcS0_.memprof.2(
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.2(
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD:[0-9]+]]
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR: define internal {{.*}} @_Z1EPPcS0_.memprof.3(
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[NOTCOLD]]
+; IR:   call {{.*}} @_Znam(i64 noundef 10) #[[COLD]]
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+
 ; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE: 8 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 4 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend

diff --git a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
index c311d6243688f..8b9b64f0c1920 100644
--- a/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
+++ b/llvm/test/ThinLTO/X86/memprof-indirectcall.ll
@@ -66,13 +66,15 @@
 ; RUN:  -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:  -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
 
 ; RUN:  cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should only create a single clone of foo, for the direct call
 ;; from main allocating cold memory.
 ; RUN:  cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
 
 ;; Try again but with distributed ThinLTO
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -94,6 +96,13 @@
 ;; from main allocating cold memory.
 ; RUN:  cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=REMARKS
+
 source_filename = "indirectcall.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -101,12 +110,12 @@ target triple = "x86_64-unknown-linux-gnu"
 @_ZTVN10__cxxabiv120__si_class_type_infoE = external global ptr
 @_ZTVN10__cxxabiv117__class_type_infoE = external global ptr
 
-define internal ptr @_Z3barP1A(ptr %a) {
+define internal ptr @_Z3barP1A(ptr %a) #0 {
 entry:
   ret ptr null
 }
 
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !0
   %call1 = call ptr @_Z3foov(), !callsite !1
@@ -121,19 +130,19 @@ declare void @_ZdaPv()
 
 declare i32 @sleep()
 
-define internal ptr @_ZN1A1xEv() {
+define internal ptr @_ZN1A1xEv() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !6
   ret ptr null
 }
 
-define internal ptr @_ZN1B1xEv() {
+define internal ptr @_ZN1B1xEv() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !7
   ret ptr null
 }
 
-define internal ptr @_Z3foov() {
+define internal ptr @_Z3foov() #0 {
 entry:
   %call = call ptr @_Znam(i64 0), !memprof !8, !callsite !21
   ret ptr null
@@ -144,6 +153,8 @@ declare ptr @_Znam(i64)
 ; uselistorder directives
 uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
 
+attributes #0 = { noinline optnone }
+
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{i64 6792096022461663180}
@@ -384,9 +395,39 @@ uselistorder ptr @_Z3foov, { 3, 2, 1, 0 }
 ; DUMP:		Clone of [[FOO]]
 
 
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3foov.memprof.1 marked with memprof allocation attribute cold
+
+
+; IR: define {{.*}} @main(
+; IR:   call {{.*}} @_Z3foov()
+;; Only the second call to foo, which allocates cold memory via direct calls,
+;; is replaced with a call to a clone that calls a cold allocation.
+; IR:   call {{.*}} @_Z3foov.memprof.1()
+; IR:   call {{.*}} @_Z3barP1A(
+; IR:   call {{.*}} @_Z3barP1A(
+; IR:   call {{.*}} @_Z3barP1A(
+; IR:   call {{.*}} @_Z3barP1A(
+; IR: define internal {{.*}} @_Z3foov()
+; IR:   call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+
 ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 1 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 1 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
 
 
 ; DOT: digraph "postbuild" {

diff --git a/llvm/test/ThinLTO/X86/memprof-inlined.ll b/llvm/test/ThinLTO/X86/memprof-inlined.ll
index 27eab8a5bcd20..bc537a5f3fbba 100644
--- a/llvm/test/ThinLTO/X86/memprof-inlined.ll
+++ b/llvm/test/ThinLTO/X86/memprof-inlined.ll
@@ -54,13 +54,16 @@
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:  -stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:  --check-prefix=STATS
+; RUN:  --check-prefix=STATS --check-prefix=STATS-BE \
+; RUN:  --check-prefix=STATS-INPROCESS-BE --check-prefix=REMARKS
 
 ; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should create clones for foo and bar for the call from main to allocate
 ;; cold memory.
 ; RUN:	cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR
+
 
 ;; Try again but with distributed ThinLTO
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
@@ -80,11 +83,19 @@
 ;; cold memory.
 ; RUN:	cat %t.ccg.cloned.dot | FileCheck %s --check-prefix=DOTCLONED
 
+;; Run ThinLTO backend
+; RUN: opt -passes=memprof-context-disambiguation \
+; RUN:  -memprof-import-summary=%t.o.thinlto.bc \
+; RUN:  -stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \
+; RUN:  --check-prefix=STATS-BE --check-prefix=STATS-DISTRIB-BE \
+; RUN:  --check-prefix=REMARKS
+
 source_filename = "inlined.ll"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define internal ptr @_Z3barv() {
+define internal ptr @_Z3barv() #0 {
 entry:
   %call = call ptr @_Znam(i64 0), !memprof !0, !callsite !5
   ret ptr null
@@ -92,19 +103,19 @@ entry:
 
 declare ptr @_Znam(i64)
 
-define internal ptr @_Z3bazv() {
+define internal ptr @_Z3bazv() #0 {
 entry:
   %call.i = call ptr @_Znam(i64 0), !memprof !0, !callsite !6
   ret ptr null
 }
 
-define internal ptr @_Z3foov() {
+define internal ptr @_Z3foov() #0 {
 entry:
   %call.i = call ptr @_Z3barv(), !callsite !7
   ret ptr null
 }
 
-define i32 @main() {
+define i32 @main() #0 {
 entry:
   %call = call ptr @_Z3foov(), !callsite !8
   %call1 = call ptr @_Z3foov(), !callsite !9
@@ -115,6 +126,8 @@ declare void @_ZdaPv()
 
 declare i32 @sleep()
 
+attributes #0 = { noinline optnone }
+
 !0 = !{!1, !3}
 !1 = !{!2, !"notcold"}
 !2 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
@@ -281,9 +294,50 @@ declare i32 @sleep()
 ; DUMP:         Clone of [[BAR]]
 
 
+; REMARKS: created clone _Z3barv.memprof.1
+; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
+; REMARKS: call in clone _Z3barv.memprof.1 marked with memprof allocation attribute cold
+; REMARKS: created clone _Z3foov.memprof.1
+; REMARKS: call in clone _Z3foov.memprof.1 assigned to call function clone _Z3barv.memprof.1
+; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
+
+
+; IR: define internal {{.*}} @_Z3barv()
+; IR:   call {{.*}} @_Znam(i64 0) #[[NOTCOLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov()
+; IR:   call {{.*}} @_Z3barv()
+; IR: define {{.*}} @main()
+;; The first call to foo does not allocate cold memory. It should call the
+;; original functions, which ultimately call the original allocation decorated
+;; with a "notcold" attribute.
+; IR:   call {{.*}} @_Z3foov()
+;; The second call to foo allocates cold memory. It should call cloned functions
+;; which ultimately call a cloned allocation decorated with a "cold" attribute.
+; IR:   call {{.*}} @_Z3foov.memprof.1()
+; IR: define internal {{.*}} @_Z3barv.memprof.1()
+; IR:   call {{.*}} @_Znam(i64 0) #[[COLD:[0-9]+]]
+; IR: define internal {{.*}} @_Z3foov.memprof.1()
+; IR:   call {{.*}} @_Z3barv.memprof.1()
+; IR: attributes #[[NOTCOLD]] = { "memprof"="notcold" }
+; IR: attributes #[[COLD]] = { "memprof"="cold" }
+
+
 ; STATS: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned)
+; STATS-BE: 1 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-INPROCESS-BE: 2 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
+;; The distributed backend hasn't yet eliminated the now-dead baz with
+;; the allocation from bar inlined, so it has one more allocation.
+; STATS-DISTRIB-BE: 3 memprof-context-disambiguation - Number of allocation versions (including clones) during ThinLTO backend
 ; STATS: 2 memprof-context-disambiguation - Number of function clones created during whole program analysis
+; STATS-BE: 2 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Number of functions that had clones created during ThinLTO backend
+; STATS-BE: 2 memprof-context-disambiguation - Maximum number of allocation versions created for an original allocation during ThinLTO backend
+; STATS-INPROCESS-BE: 1 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
+;; The distributed backend hasn't yet eliminated the now-dead baz with
+;; the allocation from bar inlined, so it has one more allocation.
+; STATS-DISTRIB-BE: 2 memprof-context-disambiguation - Number of original (not cloned) allocations with memprof profiles during ThinLTO backend
 
 
 ; DOT: digraph "postbuild" {


        

