[llvm] 9eacbba - Restore "[MemProf] ThinLTO summary support" with more fixes

Teresa Johnson via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 16 09:42:53 PST 2022


Author: Teresa Johnson
Date: 2022-11-16T09:42:41-08:00
New Revision: 9eacbba290d87ba23796e3ba6a57138d99aca9e5

URL: https://github.com/llvm/llvm-project/commit/9eacbba290d87ba23796e3ba6a57138d99aca9e5
DIFF: https://github.com/llvm/llvm-project/commit/9eacbba290d87ba23796e3ba6a57138d99aca9e5.diff

LOG: Restore "[MemProf] ThinLTO summary support" with more fixes

This restores commit 98ed423361de2f9dc0113a31be2aa04524489ca9 and
follow on fix 00c22351ba697dbddb4b5bf0ad94e4bcea4b316b, which were
reverted in 5d938eb6f79b16f55266dd23d5df831f552ea082 due to an
MSVC bot failure. I've included a fix for that failure.

Differential Revision: https://reviews.llvm.org/D135714

Added: 
    llvm/test/Assembler/thinlto-memprof-summary.ll
    llvm/test/ThinLTO/X86/memprof-summary.ll

Modified: 
    llvm/include/llvm/Analysis/MemoryProfileInfo.h
    llvm/include/llvm/AsmParser/LLParser.h
    llvm/include/llvm/AsmParser/LLToken.h
    llvm/include/llvm/AsmParser/Parser.h
    llvm/include/llvm/Bitcode/BitcodeReader.h
    llvm/include/llvm/Bitcode/LLVMBitCodes.h
    llvm/include/llvm/IR/ModuleSummaryIndex.h
    llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
    llvm/lib/Analysis/MemoryProfileInfo.cpp
    llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
    llvm/lib/AsmParser/LLLexer.cpp
    llvm/lib/AsmParser/LLParser.cpp
    llvm/lib/AsmParser/Parser.cpp
    llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
    llvm/lib/Bitcode/Reader/BitcodeReader.cpp
    llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
    llvm/lib/IR/AsmWriter.cpp
    llvm/lib/LTO/LTO.cpp
    llvm/unittests/Analysis/MemoryProfileInfoTest.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index 1b12e78eaeba..24956e781572 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -17,18 +17,12 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include <map>
 
 namespace llvm {
 namespace memprof {
 
-// Allocation type assigned to an allocation reached by a given context.
-// More can be added but initially this is just noncold and cold.
-// Values should be powers of two so that they can be ORed, in particular to
-// track allocations that have different behavior with different calling
-// contexts.
-enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };
-
 /// Return the allocation type for a given set of memory profile values.
 AllocationType getAllocType(uint64_t MaxAccessCount, uint64_t MinSize,
                             uint64_t MinLifetime);
@@ -106,6 +100,77 @@ class CallStackTrie {
   bool buildAndAttachMIBMetadata(CallBase *CI);
 };
 
+/// Helper class to iterate through stack ids in both metadata (memprof MIB and
+/// callsite) and the corresponding ThinLTO summary data structures
+/// (CallsiteInfo and MIBInfo). This simplifies implementation of client code
+/// which doesn't need to worry about whether we are operating with IR (Regular
+/// LTO), or summary (ThinLTO).
+template <class NodeT, class IteratorT> class CallStack {
+public:
+  CallStack(const NodeT *N = nullptr) : N(N) {}
+
+  // Implement minimum required methods for range-based for loop.
+  // The default implementation assumes we are operating on ThinLTO data
+  // structures, which have a vector of StackIdIndices. There are specialized
+  // versions provided to iterate through metadata.
+  struct CallStackIterator {
+    const NodeT *N = nullptr;
+    IteratorT Iter;
+    CallStackIterator(const NodeT *N, bool End);
+    uint64_t operator*();
+    bool operator==(const CallStackIterator &rhs) { return Iter == rhs.Iter; }
+    bool operator!=(const CallStackIterator &rhs) { return !(*this == rhs); }
+    void operator++() { ++Iter; }
+  };
+
+  bool empty() const { return N == nullptr; }
+
+  CallStackIterator begin() const;
+  CallStackIterator end() const { return CallStackIterator(N, /*End*/ true); }
+  CallStackIterator beginAfterSharedPrefix(CallStack &Other);
+
+private:
+  const NodeT *N = nullptr;
+};
+
+template <class NodeT, class IteratorT>
+CallStack<NodeT, IteratorT>::CallStackIterator::CallStackIterator(
+    const NodeT *N, bool End)
+    : N(N) {
+  if (!N)
+    return;
+  Iter = End ? N->StackIdIndices.end() : N->StackIdIndices.begin();
+}
+
+template <class NodeT, class IteratorT>
+uint64_t CallStack<NodeT, IteratorT>::CallStackIterator::operator*() {
+  assert(Iter != N->StackIdIndices.end());
+  return *Iter;
+}
+
+template <class NodeT, class IteratorT>
+typename CallStack<NodeT, IteratorT>::CallStackIterator
+CallStack<NodeT, IteratorT>::begin() const {
+  return CallStackIterator(N, /*End*/ false);
+}
+
+template <class NodeT, class IteratorT>
+typename CallStack<NodeT, IteratorT>::CallStackIterator
+CallStack<NodeT, IteratorT>::beginAfterSharedPrefix(CallStack &Other) {
+  CallStackIterator Cur = begin();
+  for (CallStackIterator OtherCur = Other.begin();
+       Cur != end() && OtherCur != Other.end(); ++Cur, ++OtherCur)
+    assert(*Cur == *OtherCur);
+  return Cur;
+}
+
+/// Specializations for iterating through IR metadata stack contexts.
+template <>
+CallStack<MDNode, MDNode::op_iterator>::CallStackIterator::CallStackIterator(
+    const MDNode *N, bool End);
+template <>
+uint64_t CallStack<MDNode, MDNode::op_iterator>::CallStackIterator::operator*();
+
 } // end namespace memprof
 } // end namespace llvm
 

diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index e9813c34ce37..d936c6f36b6f 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -406,6 +406,10 @@ namespace llvm {
     void addGlobalValueToIndex(std::string Name, GlobalValue::GUID,
                                GlobalValue::LinkageTypes Linkage, unsigned ID,
                                std::unique_ptr<GlobalValueSummary> Summary);
+    bool parseOptionalAllocs(std::vector<AllocInfo> &Allocs);
+    bool parseMemProfs(std::vector<MIBInfo> &MIBs);
+    bool parseAllocType(uint8_t &AllocType);
+    bool parseOptionalCallsites(std::vector<CallsiteInfo> &Callsites);
 
     // Type Parsing.
     bool parseType(Type *&Result, const Twine &Msg, bool AllowVoid = false);

diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index 5fc192f145aa..8a679007a504 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -406,6 +406,15 @@ enum Kind {
   kw_byte,
   kw_bit,
   kw_varFlags,
+  // The following are used by MemProf summary info.
+  kw_callsites,
+  kw_clones,
+  kw_stackIds,
+  kw_allocs,
+  kw_versions,
+  kw_memProf,
+  kw_notcold,
+  kw_notcoldandcold,
 
   // GV's with __attribute__((no_sanitize("address"))), or things in
   // -fsanitize-ignorelist when built with ASan.

diff --git a/llvm/include/llvm/AsmParser/Parser.h b/llvm/include/llvm/AsmParser/Parser.h
index 6710ae6e358d..336e95c2399f 100644
--- a/llvm/include/llvm/AsmParser/Parser.h
+++ b/llvm/include/llvm/AsmParser/Parser.h
@@ -105,6 +105,17 @@ ParsedModuleAndIndex parseAssemblyFileWithIndexNoUpgradeDebugInfo(
 std::unique_ptr<ModuleSummaryIndex>
 parseSummaryIndexAssemblyFile(StringRef Filename, SMDiagnostic &Err);
 
+/// The function is a secondary interface to the LLVM Assembly Parser. It parses
+/// an ASCII string that (presumably) contains LLVM Assembly code for a module
+/// summary. It returns a ModuleSummaryIndex with the corresponding features.
+/// Note that this does not verify that the generated Index is valid, so you
+/// should run the verifier after parsing the file to check that it is okay.
+/// Parse LLVM Assembly from a string
+/// \param AsmString The string containing assembly
+/// \param Err Error result info.
+std::unique_ptr<ModuleSummaryIndex>
+parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err);
+
 /// parseAssemblyFile and parseAssemblyString are wrappers around this function.
 /// Parse LLVM Assembly from a MemoryBuffer.
 /// \param F The MemoryBuffer containing assembly

diff --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h
index 39ea48c33fc3..bc1f88f73229 100644
--- a/llvm/include/llvm/Bitcode/BitcodeReader.h
+++ b/llvm/include/llvm/Bitcode/BitcodeReader.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Bitstream/BitCodeEnums.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorOr.h"
@@ -117,8 +118,10 @@ typedef llvm::function_ref<Optional<std::string>(StringRef)>
 
     /// Parse the specified bitcode buffer and merge its module summary index
     /// into CombinedIndex.
-    Error readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath,
-                      uint64_t ModuleId);
+    Error
+    readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath,
+                uint64_t ModuleId,
+                std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr);
   };
 
   struct BitcodeFileContents {

diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 74a51d5ce690..2b474b67425c 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -301,6 +301,22 @@ enum GlobalValueSummarySymtabCodes {
   // Range information for accessed offsets for every argument.
   // [n x (paramno, range, numcalls, numcalls x (callee_guid, paramno, range))]
   FS_PARAM_ACCESS = 25,
+  // Summary of per-module memprof callsite metadata.
+  // [valueid, n x stackidindex]
+  FS_PERMODULE_CALLSITE_INFO = 26,
+  // Summary of per-module allocation memprof metadata.
+  // [n x (alloc type, nummib, nummib x stackidindex)]
+  FS_PERMODULE_ALLOC_INFO = 27,
+  // Summary of combined index memprof callsite metadata.
+  // [valueid, numstackindices, numver,
+  //  numstackindices x stackidindex, numver x version]
+  FS_COMBINED_CALLSITE_INFO = 28,
+  // Summary of combined index allocation memprof metadata.
+  // [nummib, numver,
+  //  nummib x (alloc type, numstackids, numstackids x stackidindex),
+  //  numver x version]
+  FS_COMBINED_ALLOC_INFO = 29,
+  FS_STACK_IDS = 30,
 };
 
 enum MetadataCodes {

diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 9f25b5a1e550..4bcf21edfecb 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
@@ -284,6 +285,79 @@ template <> struct DenseMapInfo<ValueInfo> {
   static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.getRef(); }
 };
 
+/// Summary of memprof callsite metadata.
+struct CallsiteInfo {
+  // Actual callee function.
+  ValueInfo Callee;
+
+  // Used to record whole program analysis cloning decisions.
+  // The ThinLTO backend will need to create as many clones as there are entries
+  // in the vector (it is expected and should be confirmed that all such
+  // summaries in the same FunctionSummary have the same number of entries).
+  // Each index records version info for the corresponding clone of this
+  // function. The value is the callee clone it calls (becomes the appended
+  // suffix id). Index 0 is the original version, and a value of 0 calls the
+  // original callee.
+  SmallVector<unsigned> Clones{0};
+
+  // Represents stack ids in this context, recorded as indices into the
+  // StackIds vector in the summary index, which in turn holds the full 64-bit
+  // stack ids. This reduces memory as there are in practice far fewer unique
+  // stack ids than stack id references.
+  SmallVector<unsigned> StackIdIndices;
+
+  CallsiteInfo(ValueInfo Callee, SmallVector<unsigned> StackIdIndices)
+      : Callee(Callee), StackIdIndices(std::move(StackIdIndices)) {}
+  CallsiteInfo(ValueInfo Callee, SmallVector<unsigned> Clones,
+               SmallVector<unsigned> StackIdIndices)
+      : Callee(Callee), Clones(std::move(Clones)),
+        StackIdIndices(std::move(StackIdIndices)) {}
+};
+
+// Allocation type assigned to an allocation reached by a given context.
+// More can be added but initially this is just noncold and cold.
+// Values should be powers of two so that they can be ORed, in particular to
+// track allocations that have different behavior with different calling
+// contexts.
+enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };
+
+/// Summary of a single MIB in a memprof metadata on allocations.
+struct MIBInfo {
+  // The allocation type for this profiled context.
+  AllocationType AllocType;
+
+  // Represents stack ids in this context, recorded as indices into the
+  // StackIds vector in the summary index, which in turn holds the full 64-bit
+  // stack ids. This reduces memory as there are in practice far fewer unique
+  // stack ids than stack id references.
+  SmallVector<unsigned> StackIdIndices;
+
+  MIBInfo(AllocationType AllocType, SmallVector<unsigned> StackIdIndices)
+      : AllocType(AllocType), StackIdIndices(std::move(StackIdIndices)) {}
+};
+
+/// Summary of memprof metadata on allocations.
+struct AllocInfo {
+  // Used to record whole program analysis cloning decisions.
+  // The ThinLTO backend will need to create as many clones as there are entries
+  // in the vector (it is expected and should be confirmed that all such
+  // summaries in the same FunctionSummary have the same number of entries).
+  // Each index records version info for the corresponding clone of this
+  // function. The value is the allocation type of the corresponding allocation.
+  // Index 0 is the original version. Before cloning, index 0 may have more than
+  // one allocation type.
+  SmallVector<uint8_t> Versions;
+
+  // Vector of MIBs in this memprof metadata.
+  std::vector<MIBInfo> MIBs;
+
+  AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
+    Versions.push_back(0);
+  }
+  AllocInfo(SmallVector<uint8_t> Versions, std::vector<MIBInfo> MIBs)
+      : Versions(std::move(Versions)), MIBs(std::move(MIBs)) {}
+};
+
 /// Function and variable summary information to aid decisions and
 /// implementation of importing.
 class GlobalValueSummary {
@@ -678,7 +752,8 @@ class FunctionSummary : public GlobalValueSummary {
         std::vector<FunctionSummary::VFuncId>(),
         std::vector<FunctionSummary::ConstVCall>(),
         std::vector<FunctionSummary::ConstVCall>(),
-        std::vector<FunctionSummary::ParamAccess>());
+        std::vector<FunctionSummary::ParamAccess>(),
+        std::vector<CallsiteInfo>(), std::vector<AllocInfo>());
   }
 
   /// A dummy node to reference external functions that aren't in the index
@@ -706,6 +781,25 @@ class FunctionSummary : public GlobalValueSummary {
   using ParamAccessesTy = std::vector<ParamAccess>;
   std::unique_ptr<ParamAccessesTy> ParamAccesses;
 
+  /// Optional list of memprof callsite metadata summaries. The correspondence
+  /// between the callsite summary and the callsites in the function is implied
+  /// by the order in the vector (and can be validated by comparing the stack
+  /// ids in the CallsiteInfo to those in the instruction callsite metadata).
+  /// As a memory savings optimization, we only create these for the prevailing
+  /// copy of a symbol when creating the combined index during LTO.
+  using CallsitesTy = std::vector<CallsiteInfo>;
+  std::unique_ptr<CallsitesTy> Callsites;
+
+  /// Optional list of allocation memprof metadata summaries. The correspondence
+  /// between the alloc memprof summary and the allocation callsites in the
+  /// function is implied by the order in the vector (and can be validated by
+  /// comparing the stack ids in the AllocInfo to those in the instruction
+  /// memprof metadata).
+  /// As a memory savings optimization, we only create these for the prevailing
+  /// copy of a symbol when creating the combined index during LTO.
+  using AllocsTy = std::vector<AllocInfo>;
+  std::unique_ptr<AllocsTy> Allocs;
+
 public:
   FunctionSummary(GVFlags Flags, unsigned NumInsts, FFlags FunFlags,
                   uint64_t EntryCount, std::vector<ValueInfo> Refs,
@@ -715,7 +809,8 @@ class FunctionSummary : public GlobalValueSummary {
                   std::vector<VFuncId> TypeCheckedLoadVCalls,
                   std::vector<ConstVCall> TypeTestAssumeConstVCalls,
                   std::vector<ConstVCall> TypeCheckedLoadConstVCalls,
-                  std::vector<ParamAccess> Params)
+                  std::vector<ParamAccess> Params, CallsitesTy CallsiteList,
+                  AllocsTy AllocList)
       : GlobalValueSummary(FunctionKind, Flags, std::move(Refs)),
         InstCount(NumInsts), FunFlags(FunFlags), EntryCount(EntryCount),
         CallGraphEdgeList(std::move(CGEdges)) {
@@ -729,6 +824,10 @@ class FunctionSummary : public GlobalValueSummary {
                      std::move(TypeCheckedLoadConstVCalls)});
     if (!Params.empty())
       ParamAccesses = std::make_unique<ParamAccessesTy>(std::move(Params));
+    if (!CallsiteList.empty())
+      Callsites = std::make_unique<CallsitesTy>(std::move(CallsiteList));
+    if (!AllocList.empty())
+      Allocs = std::make_unique<AllocsTy>(std::move(AllocList));
   }
   // Gets the number of readonly and writeonly refs in RefEdgeList
   std::pair<unsigned, unsigned> specialRefCounts() const;
@@ -832,6 +931,18 @@ class FunctionSummary : public GlobalValueSummary {
 
   const TypeIdInfo *getTypeIdInfo() const { return TIdInfo.get(); };
 
+  ArrayRef<CallsiteInfo> callsites() const {
+    if (Callsites)
+      return *Callsites;
+    return {};
+  }
+
+  ArrayRef<AllocInfo> allocs() const {
+    if (Allocs)
+      return *Allocs;
+    return {};
+  }
+
   friend struct GraphTraits<ValueInfo>;
 };
 
@@ -1163,6 +1274,16 @@ class ModuleSummaryIndex {
   // the total number of basic blocks in the LTO unit in the combined index.
   uint64_t BlockCount;
 
+  // List of unique stack ids (hashes). We use a 4B index of the id in the
+  // stack id lists on the alloc and callsite summaries for memory savings,
+  // since the number of unique ids is in practice much smaller than the
+  // number of stack id references in the summaries.
+  std::vector<uint64_t> StackIds;
+
+  // Temporary map while building StackIds list. Clear when index is completely
+  // built via releaseTemporaryMemory.
+  std::map<uint64_t, unsigned> StackIdToIndex;
+
   // YAML I/O support.
   friend yaml::MappingTraits<ModuleSummaryIndex>;
 
@@ -1205,6 +1326,31 @@ class ModuleSummaryIndex {
   const_gvsummary_iterator end() const { return GlobalValueMap.end(); }
   size_t size() const { return GlobalValueMap.size(); }
 
+  const std::vector<uint64_t> &stackIds() const { return StackIds; }
+
+  unsigned addOrGetStackIdIndex(uint64_t StackId) {
+    auto Inserted = StackIdToIndex.insert({StackId, StackIds.size()});
+    if (Inserted.second)
+      StackIds.push_back(StackId);
+    return Inserted.first->second;
+  }
+
+  uint64_t getStackIdAtIndex(unsigned Index) const {
+    assert(StackIds.size() > Index);
+    return StackIds[Index];
+  }
+
+  // Facility to release memory from data structures only needed during index
+  // construction (including while building combined index). Currently this only
+  // releases the temporary map used while constructing a correspondence between
+  // stack ids and their index in the StackIds vector. Mostly impactful when
+  // building a large combined index.
+  void releaseTemporaryMemory() {
+    assert(StackIdToIndex.size() == StackIds.size());
+    StackIdToIndex.clear();
+    StackIds.shrink_to_fit();
+  }
+
   /// Convenience function for doing a DFS on a ValueInfo. Marks the function in
   /// the FunctionHasParent map.
   static void discoverNodes(ValueInfo V,

diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
index 74e92797f15c..33e57e5f2102 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -234,7 +234,8 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
           std::move(FSum.TypeCheckedLoadVCalls),
           std::move(FSum.TypeTestAssumeConstVCalls),
           std::move(FSum.TypeCheckedLoadConstVCalls),
-          ArrayRef<FunctionSummary::ParamAccess>{}));
+          ArrayRef<FunctionSummary::ParamAccess>{}, ArrayRef<CallsiteInfo>{},
+          ArrayRef<AllocInfo>{}));
     }
   }
   static void output(IO &io, GlobalValueSummaryMapTy &V) {

diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index f28ddbbfc849..f42fff5f195f 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -224,3 +224,21 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
   CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));
   return true;
 }
+
+template <>
+CallStack<MDNode, MDNode::op_iterator>::CallStackIterator::CallStackIterator(
+    const MDNode *N, bool End)
+    : N(N) {
+  if (!N)
+    return;
+  Iter = End ? N->op_end() : N->op_begin();
+}
+
+template <>
+uint64_t
+CallStack<MDNode, MDNode::op_iterator>::CallStackIterator::operator*() {
+  assert(Iter != N->op_end());
+  ConstantInt *StackIdCInt = mdconst::dyn_extract<ConstantInt>(*Iter);
+  assert(StackIdCInt);
+  return StackIdCInt->getZExtValue();
+}

diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 52827c210b5a..e8309f51a498 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
@@ -56,6 +57,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace llvm::memprof;
 
 #define DEBUG_TYPE "module-summary-analysis"
 
@@ -275,6 +277,9 @@ static void computeFunctionSummary(
   std::vector<const Instruction *> NonVolatileLoads;
   std::vector<const Instruction *> NonVolatileStores;
 
+  std::vector<CallsiteInfo> Callsites;
+  std::vector<AllocInfo> Allocs;
+
   bool HasInlineAsmMaybeReferencingInternal = false;
   bool HasIndirBranchToBlockAddress = false;
   bool HasUnknownCall = false;
@@ -417,6 +422,57 @@ static void computeFunctionSummary(
           CallGraphEdges[Index.getOrInsertValueInfo(Candidate.Value)]
               .updateHotness(getHotness(Candidate.Count, PSI));
       }
+
+      // TODO: Skip indirect calls for now. Need to handle these better, likely
+      // by creating multiple Callsites, one per target, then speculatively
+      // devirtualize while applying clone info in the ThinLTO backends. This
+      // will also be important because we will have a different set of clone
+      // versions per target. This handling needs to match that in the ThinLTO
+      // backend so we handle things consistently for matching of callsite
+      // summaries to instructions.
+      if (!CalledFunction)
+        continue;
+
+      // Compute the list of stack ids first (so we can trim them from the stack
+      // ids on any MIBs).
+      CallStack<MDNode, MDNode::op_iterator> InstCallsite(
+          I.getMetadata(LLVMContext::MD_callsite));
+      auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
+      if (MemProfMD) {
+        std::vector<MIBInfo> MIBs;
+        for (auto &MDOp : MemProfMD->operands()) {
+          auto *MIBMD = cast<const MDNode>(MDOp);
+          MDNode *StackNode = getMIBStackNode(MIBMD);
+          assert(StackNode);
+          SmallVector<unsigned> StackIdIndices;
+          CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
+          // Collapse out any on the allocation call (inlining).
+          for (auto ContextIter =
+                   StackContext.beginAfterSharedPrefix(InstCallsite);
+               ContextIter != StackContext.end(); ++ContextIter) {
+            unsigned StackIdIdx = Index.addOrGetStackIdIndex(*ContextIter);
+            // If this is a direct recursion, simply skip the duplicate
+            // entries. If this is mutual recursion, handling is left to
+            // the LTO link analysis client.
+            if (StackIdIndices.empty() || StackIdIndices.back() != StackIdIdx)
+              StackIdIndices.push_back(StackIdIdx);
+          }
+          MIBs.push_back(
+              MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices)));
+        }
+        Allocs.push_back(AllocInfo(std::move(MIBs)));
+      } else if (!InstCallsite.empty()) {
+        SmallVector<unsigned> StackIdIndices;
+        for (auto StackId : InstCallsite)
+          StackIdIndices.push_back(Index.addOrGetStackIdIndex(StackId));
+        // Use the original CalledValue, in case it was an alias. We want
+        // to record the call edge to the alias in that case. Eventually
+        // an alias summary will be created to associate the alias and
+        // aliasee.
+        auto CalleeValueInfo =
+            Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
+        Callsites.push_back({CalleeValueInfo, StackIdIndices});
+      }
     }
   }
   Index.addBlockCount(F.size());
@@ -508,7 +564,8 @@ static void computeFunctionSummary(
       CallGraphEdges.takeVector(), TypeTests.takeVector(),
       TypeTestAssumeVCalls.takeVector(), TypeCheckedLoadVCalls.takeVector(),
       TypeTestAssumeConstVCalls.takeVector(),
-      TypeCheckedLoadConstVCalls.takeVector(), std::move(ParamAccesses));
+      TypeCheckedLoadConstVCalls.takeVector(), std::move(ParamAccesses),
+      std::move(Callsites), std::move(Allocs));
   if (NonRenamableLocal)
     CantBePromoted.insert(F.getGUID());
   Index.addGlobalValueSummary(F, std::move(FuncSummary));
@@ -757,7 +814,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
                     ArrayRef<FunctionSummary::VFuncId>{},
                     ArrayRef<FunctionSummary::ConstVCall>{},
                     ArrayRef<FunctionSummary::ConstVCall>{},
-                    ArrayRef<FunctionSummary::ParamAccess>{});
+                    ArrayRef<FunctionSummary::ParamAccess>{},
+                    ArrayRef<CallsiteInfo>{}, ArrayRef<AllocInfo>{});
             Index.addGlobalValueSummary(*GV, std::move(Summary));
           } else {
             std::unique_ptr<GlobalVarSummary> Summary =

diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 2a171df168fc..c33dc9710f35 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -772,6 +772,14 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(byte);
   KEYWORD(bit);
   KEYWORD(varFlags);
+  KEYWORD(callsites);
+  KEYWORD(clones);
+  KEYWORD(stackIds);
+  KEYWORD(allocs);
+  KEYWORD(versions);
+  KEYWORD(memProf);
+  KEYWORD(notcold);
+  KEYWORD(notcoldandcold);
 
 #undef KEYWORD
 

diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 7fc51682533f..dde0672da8e4 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -8682,6 +8682,8 @@ bool LLParser::parseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
   FunctionSummary::TypeIdInfo TypeIdInfo;
   std::vector<FunctionSummary::ParamAccess> ParamAccesses;
   std::vector<ValueInfo> Refs;
+  std::vector<CallsiteInfo> Callsites;
+  std::vector<AllocInfo> Allocs;
   // Default is all-zeros (conservative values).
   FunctionSummary::FFlags FFlags = {};
   if (parseToken(lltok::colon, "expected ':' here") ||
@@ -8716,6 +8718,14 @@ bool LLParser::parseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
       if (parseOptionalParamAccesses(ParamAccesses))
         return true;
       break;
+    case lltok::kw_allocs:
+      if (parseOptionalAllocs(Allocs))
+        return true;
+      break;
+    case lltok::kw_callsites:
+      if (parseOptionalCallsites(Callsites))
+        return true;
+      break;
     default:
       return error(Lex.getLoc(), "expected optional function summary field");
     }
@@ -8731,7 +8741,7 @@ bool LLParser::parseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
       std::move(TypeIdInfo.TypeCheckedLoadVCalls),
       std::move(TypeIdInfo.TypeTestAssumeConstVCalls),
       std::move(TypeIdInfo.TypeCheckedLoadConstVCalls),
-      std::move(ParamAccesses));
+      std::move(ParamAccesses), std::move(Callsites), std::move(Allocs));
 
   FS->setModulePath(ModulePath);
 
@@ -9683,3 +9693,220 @@ bool LLParser::parseGVReference(ValueInfo &VI, unsigned &GVId) {
     VI.setWriteOnly();
   return false;
 }
+
+/// OptionalAllocs
+///   := 'allocs' ':' '(' Alloc [',' Alloc]* ')'
+/// Alloc ::= '(' 'versions' ':' '(' Version [',' Version]* ')'
+///              ',' MemProfs ')'
+/// Version ::= AllocType
+bool LLParser::parseOptionalAllocs(std::vector<AllocInfo> &Allocs) {
+  assert(Lex.getKind() == lltok::kw_allocs);
+  Lex.Lex();
+
+  if (parseToken(lltok::colon, "expected ':' in allocs") ||
+      parseToken(lltok::lparen, "expected '(' in allocs"))
+    return true;
+
+  // Parse each alloc: a list of per-version alloc types, then its MemProfs.
+  do {
+    if (parseToken(lltok::lparen, "expected '(' in alloc") ||
+        parseToken(lltok::kw_versions, "expected 'versions' in alloc") ||
+        parseToken(lltok::colon, "expected ':'") ||
+        parseToken(lltok::lparen, "expected '(' in versions"))
+      return true;
+
+    SmallVector<uint8_t> Versions;
+    do {
+      uint8_t V = 0;
+      if (parseAllocType(V))
+        return true;
+      Versions.push_back(V);
+    } while (EatIfPresent(lltok::comma));
+
+    if (parseToken(lltok::rparen, "expected ')' in versions") ||
+        parseToken(lltok::comma, "expected ',' in alloc"))
+      return true;
+
+    std::vector<MIBInfo> MIBs;
+    if (parseMemProfs(MIBs))
+      return true;
+
+    Allocs.push_back({Versions, MIBs});
+
+    if (parseToken(lltok::rparen, "expected ')' in alloc"))
+      return true;
+  } while (EatIfPresent(lltok::comma));
+
+  if (parseToken(lltok::rparen, "expected ')' in allocs"))
+    return true;
+
+  return false;
+}
+
+/// MemProfs
+///   := 'memProf' ':' '(' MemProf [',' MemProf]* ')'
+/// MemProf ::= '(' 'type' ':' AllocType
+///              ',' 'stackIds' ':' '(' StackId [',' StackId]* ')' ')'
+/// StackId ::= UInt64 (stored as an index from addOrGetStackIdIndex)
+bool LLParser::parseMemProfs(std::vector<MIBInfo> &MIBs) {
+  assert(Lex.getKind() == lltok::kw_memProf);
+  Lex.Lex();
+
+  if (parseToken(lltok::colon, "expected ':' in memprof") ||
+      parseToken(lltok::lparen, "expected '(' in memprof"))
+    return true;
+
+  // Parse each MIB: its alloc type followed by its list of stack ids.
+  do {
+    if (parseToken(lltok::lparen, "expected '(' in memprof") ||
+        parseToken(lltok::kw_type, "expected 'type' in memprof") ||
+        parseToken(lltok::colon, "expected ':'"))
+      return true;
+
+    uint8_t AllocType;
+    if (parseAllocType(AllocType))
+      return true;
+
+    if (parseToken(lltok::comma, "expected ',' in memprof") ||
+        parseToken(lltok::kw_stackIds, "expected 'stackIds' in memprof") ||
+        parseToken(lltok::colon, "expected ':'") ||
+        parseToken(lltok::lparen, "expected '(' in stackIds"))
+      return true;
+
+    SmallVector<unsigned> StackIdIndices;
+    do {
+      uint64_t StackId = 0;
+      if (parseUInt64(StackId))
+        return true;
+      StackIdIndices.push_back(Index->addOrGetStackIdIndex(StackId));
+    } while (EatIfPresent(lltok::comma));
+
+    if (parseToken(lltok::rparen, "expected ')' in stackIds"))
+      return true;
+
+    MIBs.push_back({(AllocationType)AllocType, StackIdIndices});
+
+    if (parseToken(lltok::rparen, "expected ')' in memprof"))
+      return true;
+  } while (EatIfPresent(lltok::comma));
+
+  if (parseToken(lltok::rparen, "expected ')' in memprof"))
+    return true;
+
+  return false;
+}
+
+/// AllocType
+///   := ('none'|'notcold'|'cold'|'notcoldandcold')
+bool LLParser::parseAllocType(uint8_t &AllocType) {
+  switch (Lex.getKind()) {
+  case lltok::kw_none:
+    AllocType = (uint8_t)AllocationType::None;
+    break;
+  case lltok::kw_notcold:
+    AllocType = (uint8_t)AllocationType::NotCold;
+    break;
+  case lltok::kw_cold:
+    AllocType = (uint8_t)AllocationType::Cold;
+    break;
+  case lltok::kw_notcoldandcold:
+    AllocType =
+        (uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold;
+    break;
+  default:
+    return error(Lex.getLoc(), "invalid alloc type");
+  }
+  Lex.Lex(); // Consume the keyword only after it has been recognized.
+  return false;
+}
+
+/// OptionalCallsites
+///   := 'callsites' ':' '(' Callsite [',' Callsite]* ')'
+/// Callsite ::= '(' 'callee' ':' ('null' | GVReference)
+///              ',' 'clones' ':' '(' Version [',' Version]* ')'
+///              ',' 'stackIds' ':' '(' StackId [',' StackId]* ')' ')'
+/// Version ::= UInt32
+/// StackId ::= UInt64
+bool LLParser::parseOptionalCallsites(std::vector<CallsiteInfo> &Callsites) {
+  assert(Lex.getKind() == lltok::kw_callsites);
+  Lex.Lex();
+
+  if (parseToken(lltok::colon, "expected ':' in callsites") ||
+      parseToken(lltok::lparen, "expected '(' in callsites"))
+    return true;
+
+  IdToIndexMapType IdToIndexMap;
+  // Parse each callsite: its callee, clone versions and stack ids.
+  do {
+    if (parseToken(lltok::lparen, "expected '(' in callsite") ||
+        parseToken(lltok::kw_callee, "expected 'callee' in callsite") ||
+        parseToken(lltok::colon, "expected ':'"))
+      return true;
+
+    ValueInfo VI;
+    unsigned GVId = 0;
+    LocTy Loc = Lex.getLoc(); // Callee location, saved for forward refs.
+    if (!EatIfPresent(lltok::kw_null)) { // 'null' keeps the default VI.
+      if (parseGVReference(VI, GVId))
+        return true;
+    }
+
+    if (parseToken(lltok::comma, "expected ',' in callsite") ||
+        parseToken(lltok::kw_clones, "expected 'clones' in callsite") ||
+        parseToken(lltok::colon, "expected ':'") ||
+        parseToken(lltok::lparen, "expected '(' in clones"))
+      return true;
+
+    SmallVector<unsigned> Clones;
+    do {
+      unsigned V = 0;
+      if (parseUInt32(V))
+        return true;
+      Clones.push_back(V);
+    } while (EatIfPresent(lltok::comma));
+
+    if (parseToken(lltok::rparen, "expected ')' in clones") ||
+        parseToken(lltok::comma, "expected ',' in callsite") ||
+        parseToken(lltok::kw_stackIds, "expected 'stackIds' in callsite") ||
+        parseToken(lltok::colon, "expected ':'") ||
+        parseToken(lltok::lparen, "expected '(' in stackIds"))
+      return true;
+
+    SmallVector<unsigned> StackIdIndices;
+    do {
+      uint64_t StackId = 0;
+      if (parseUInt64(StackId))
+        return true;
+      StackIdIndices.push_back(Index->addOrGetStackIdIndex(StackId));
+    } while (EatIfPresent(lltok::comma));
+
+    if (parseToken(lltok::rparen, "expected ')' in stackIds"))
+      return true;
+
+    // Remember which Callsites element needs a forward reference. A pointer to
+    // its Callee is saved later, once the std::vector has stopped growing, as
+    // growth may move the elements.
+    if (VI.getRef() == FwdVIRef)
+      IdToIndexMap[GVId].push_back(std::make_pair(Callsites.size(), Loc));
+    Callsites.push_back({VI, Clones, StackIdIndices});
+
+    if (parseToken(lltok::rparen, "expected ')' in callsite"))
+      return true;
+  } while (EatIfPresent(lltok::comma));
+
+  // Now that the Callsites vector is finalized, it is safe to save the
+  // locations of any forward GV references that need updating later.
+  for (auto I : IdToIndexMap) {
+    auto &Infos = ForwardRefValueInfos[I.first];
+    for (auto P : I.second) {
+      assert(Callsites[P.first].Callee.getRef() == FwdVIRef &&
+             "Forward referenced ValueInfo expected to be empty");
+      Infos.emplace_back(&Callsites[P.first].Callee, P.second);
+    }
+  }
+
+  if (parseToken(lltok::rparen, "expected ')' in callsites"))
+    return true;
+
+  return false;
+}

diff  --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp
index 95b9079f0f9c..4458ae757c16 100644
--- a/llvm/lib/AsmParser/Parser.cpp
+++ b/llvm/lib/AsmParser/Parser.cpp
@@ -177,6 +177,12 @@ llvm::parseSummaryIndexAssemblyFile(StringRef Filename, SMDiagnostic &Err) {
   return parseSummaryIndexAssembly(FileOrErr.get()->getMemBufferRef(), Err);
 }
 
+std::unique_ptr<ModuleSummaryIndex>
+llvm::parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err) {
+  MemoryBufferRef F(AsmString, "<string>");
+  return parseSummaryIndexAssembly(F, Err); // Shared with the file variant.
+}
+
 Constant *llvm::parseConstantValue(StringRef Asm, SMDiagnostic &Err,
                                    const Module &M, const SlotMapping *Slots) {
   SourceMgr SM;

diff  --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index dd3cac8b8a6f..97bc828066a5 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -315,6 +315,11 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, TYPE_ID_METADATA)
       STRINGIFY_CODE(FS, BLOCK_COUNT)
       STRINGIFY_CODE(FS, PARAM_ACCESS)
+      STRINGIFY_CODE(FS, PERMODULE_CALLSITE_INFO)
+      STRINGIFY_CODE(FS, PERMODULE_ALLOC_INFO)
+      STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO)
+      STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
+      STRINGIFY_CODE(FS, STACK_IDS)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {

diff  --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 0a2415cbfdef..c4805602d1ad 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -883,8 +883,10 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   // they are recorded in the summary index being built.
   // We save a GUID which refers to the same global as the ValueInfo, but
   // ignoring the linkage, i.e. for values other than local linkage they are
-  // identical.
-  DenseMap<unsigned, std::tuple<ValueInfo, GlobalValue::GUID>>
+  // identical (this is the second tuple member).
+  // The third tuple member is the real GUID of the ValueInfo.
+  DenseMap<unsigned,
+           std::tuple<ValueInfo, GlobalValue::GUID, GlobalValue::GUID>>
       ValueIdToValueInfoMap;
 
   /// Map populated during module path string table parsing, from the
@@ -904,10 +906,19 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   /// this module by the client.
   unsigned ModuleId;
 
+  /// Callback to ask whether a symbol is the prevailing copy when invoked
+  /// during combined index building.
+  std::function<bool(GlobalValue::GUID)> IsPrevailing;
+
+  /// Saves the stack ids from the STACK_IDS record to consult when adding stack
+  /// ids from the lists in the callsite and alloc entries to the index.
+  std::vector<uint64_t> StackIds;
+
 public:
-  ModuleSummaryIndexBitcodeReader(BitstreamCursor Stream, StringRef Strtab,
-                                  ModuleSummaryIndex &TheIndex,
-                                  StringRef ModulePath, unsigned ModuleId);
+  ModuleSummaryIndexBitcodeReader(
+      BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex,
+      StringRef ModulePath, unsigned ModuleId,
+      std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr);
 
   Error parseModule();
 
@@ -931,7 +942,8 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   std::vector<FunctionSummary::ParamAccess>
   parseParamAccesses(ArrayRef<uint64_t> Record);
 
-  std::tuple<ValueInfo, GlobalValue::GUID>
+  template <bool AllowNullValueInfo = false>
+  std::tuple<ValueInfo, GlobalValue::GUID, GlobalValue::GUID>
   getValueInfoFromValueId(unsigned ValueId);
 
   void addThisModule();
@@ -6643,9 +6655,10 @@ std::vector<StructType *> BitcodeReader::getIdentifiedStructTypes() const {
 
 ModuleSummaryIndexBitcodeReader::ModuleSummaryIndexBitcodeReader(
     BitstreamCursor Cursor, StringRef Strtab, ModuleSummaryIndex &TheIndex,
-    StringRef ModulePath, unsigned ModuleId)
+    StringRef ModulePath, unsigned ModuleId,
+    std::function<bool(GlobalValue::GUID)> IsPrevailing)
     : BitcodeReaderBase(std::move(Cursor), Strtab), TheIndex(TheIndex),
-      ModulePath(ModulePath), ModuleId(ModuleId) {}
+      ModulePath(ModulePath), ModuleId(ModuleId), IsPrevailing(IsPrevailing) {}
 
 void ModuleSummaryIndexBitcodeReader::addThisModule() {
   TheIndex.addModule(ModulePath, ModuleId);
@@ -6656,10 +6669,15 @@ ModuleSummaryIndexBitcodeReader::getThisModule() {
   return TheIndex.getModule(ModulePath);
 }
 
-std::tuple<ValueInfo, GlobalValue::GUID>
+template <bool AllowNullValueInfo>
+std::tuple<ValueInfo, GlobalValue::GUID, GlobalValue::GUID>
 ModuleSummaryIndexBitcodeReader::getValueInfoFromValueId(unsigned ValueId) {
   auto VGI = ValueIdToValueInfoMap[ValueId];
-  assert(std::get<0>(VGI));
+  // We can have a null value info for memprof callsite info records in
+  // distributed ThinLTO index files when the callee function summary is not
+  // included in the index. The bitcode writer records 0 in that case,
+  // and the caller of this helper will set AllowNullValueInfo to true.
+  assert(AllowNullValueInfo || std::get<0>(VGI));
   return VGI;
 }
 
@@ -6682,7 +6700,7 @@ void ModuleSummaryIndexBitcodeReader::setValueGUID(
   ValueIdToValueInfoMap[ValueID] = std::make_tuple(
       TheIndex.getOrInsertValueInfo(
           ValueGUID, UseStrtab ? ValueName : TheIndex.saveString(ValueName)),
-      OriginalNameID);
+      OriginalNameID, ValueGUID);
 }
 
 // Specialized value symbol table parser used when reading module index
@@ -6770,8 +6788,8 @@ Error ModuleSummaryIndexBitcodeReader::parseValueSymbolTable(
       GlobalValue::GUID RefGUID = Record[1];
       // The "original name", which is the second value of the pair will be
       // overriden later by a FS_COMBINED_ORIGINAL_NAME in the combined index.
-      ValueIdToValueInfoMap[ValueID] =
-          std::make_tuple(TheIndex.getOrInsertValueInfo(RefGUID), RefGUID);
+      ValueIdToValueInfoMap[ValueID] = std::make_tuple(
+          TheIndex.getOrInsertValueInfo(RefGUID), RefGUID, RefGUID);
       break;
     }
     }
@@ -7116,6 +7134,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       PendingTypeCheckedLoadConstVCalls;
   std::vector<FunctionSummary::ParamAccess> PendingParamAccesses;
 
+  std::vector<CallsiteInfo> PendingCallsites;
+  std::vector<AllocInfo> PendingAllocs;
+
   while (true) {
     Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
     if (!MaybeEntry)
@@ -7154,8 +7175,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
     case bitc::FS_VALUE_GUID: { // [valueid, refguid]
       uint64_t ValueID = Record[0];
       GlobalValue::GUID RefGUID = Record[1];
-      ValueIdToValueInfoMap[ValueID] =
-          std::make_tuple(TheIndex.getOrInsertValueInfo(RefGUID), RefGUID);
+      ValueIdToValueInfoMap[ValueID] = std::make_tuple(
+          TheIndex.getOrInsertValueInfo(RefGUID), RefGUID, RefGUID);
       break;
     }
     // FS_PERMODULE: [valueid, flags, instcount, fflags, numrefs,
@@ -7207,6 +7228,13 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
           ArrayRef<uint64_t>(Record).slice(CallGraphEdgeStartIndex),
           IsOldProfileFormat, HasProfile, HasRelBF);
       setSpecialRefs(Refs, NumRORefs, NumWORefs);
+      auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID);
+      // In order to save memory, only record the memprof summaries if this is
+      // the prevailing copy of a symbol.
+      if (IsPrevailing && !IsPrevailing(std::get<2>(VIAndOriginalGUID))) {
+        PendingCallsites.clear();
+        PendingAllocs.clear();
+      }
       auto FS = std::make_unique<FunctionSummary>(
           Flags, InstCount, getDecodedFFlags(RawFunFlags), /*EntryCount=*/0,
           std::move(Refs), std::move(Calls), std::move(PendingTypeTests),
@@ -7214,8 +7242,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
           std::move(PendingTypeCheckedLoadVCalls),
           std::move(PendingTypeTestAssumeConstVCalls),
           std::move(PendingTypeCheckedLoadConstVCalls),
-          std::move(PendingParamAccesses));
-      auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID);
+          std::move(PendingParamAccesses), std::move(PendingCallsites),
+          std::move(PendingAllocs));
       FS->setModulePath(getThisModule()->first());
       FS->setOriginalName(std::get<1>(VIAndOriginalGUID));
       TheIndex.addGlobalValueSummary(std::get<0>(VIAndOriginalGUID),
@@ -7358,7 +7386,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
           std::move(PendingTypeCheckedLoadVCalls),
           std::move(PendingTypeTestAssumeConstVCalls),
           std::move(PendingTypeCheckedLoadConstVCalls),
-          std::move(PendingParamAccesses));
+          std::move(PendingParamAccesses), std::move(PendingCallsites),
+          std::move(PendingAllocs));
       LastSeenSummary = FS.get();
       LastSeenGUID = VI.getGUID();
       FS->setModulePath(ModuleIdMap[ModuleId]);
@@ -7484,6 +7513,95 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       PendingParamAccesses = parseParamAccesses(Record);
       break;
     }
+
+    case bitc::FS_STACK_IDS: { // [n x stackid]
+      // Save stack ids in the reader to consult when adding stack ids from the
+      // lists in the stack node and alloc node entries.
+      StackIds = ArrayRef<uint64_t>(Record);
+      break;
+    }
+
+    case bitc::FS_PERMODULE_CALLSITE_INFO: {
+      unsigned ValueID = Record[0];
+      SmallVector<unsigned> StackIdList;
+      for (auto R = Record.begin() + 1; R != Record.end(); R++) {
+        assert(*R < StackIds.size());
+        StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[*R]));
+      }
+      ValueInfo VI = std::get<0>(getValueInfoFromValueId(ValueID));
+      PendingCallsites.push_back(CallsiteInfo({VI, std::move(StackIdList)}));
+      break;
+    }
+
+    case bitc::FS_COMBINED_CALLSITE_INFO: {
+      auto RecordIter = Record.begin();
+      unsigned ValueID = *RecordIter++;
+      unsigned NumStackIds = *RecordIter++;
+      unsigned NumVersions = *RecordIter++;
+      assert(Record.size() == 3 + NumStackIds + NumVersions);
+      SmallVector<unsigned> StackIdList;
+      for (unsigned J = 0; J < NumStackIds; J++) {
+        assert(*RecordIter < StackIds.size());
+        StackIdList.push_back(
+            TheIndex.addOrGetStackIdIndex(StackIds[*RecordIter++]));
+      }
+      SmallVector<unsigned> Versions;
+      for (unsigned J = 0; J < NumVersions; J++)
+        Versions.push_back(*RecordIter++);
+      ValueInfo VI = std::get<0>(
+          getValueInfoFromValueId</*AllowNullValueInfo*/ true>(ValueID));
+      PendingCallsites.push_back(
+          CallsiteInfo({VI, std::move(Versions), std::move(StackIdList)}));
+      break;
+    }
+
+    case bitc::FS_PERMODULE_ALLOC_INFO: {
+      unsigned I = 0;
+      std::vector<MIBInfo> MIBs;
+      while (I < Record.size()) {
+        assert(Record.size() - I >= 2);
+        AllocationType AllocType = (AllocationType)Record[I++];
+        unsigned NumStackEntries = Record[I++];
+        assert(Record.size() - I >= NumStackEntries);
+        SmallVector<unsigned> StackIdList;
+        for (unsigned J = 0; J < NumStackEntries; J++) {
+          assert(Record[I] < StackIds.size());
+          StackIdList.push_back(
+              TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
+        }
+        MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
+      }
+      PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
+      break;
+    }
+
+    case bitc::FS_COMBINED_ALLOC_INFO: {
+      unsigned I = 0;
+      std::vector<MIBInfo> MIBs;
+      unsigned NumMIBs = Record[I++];
+      unsigned NumVersions = Record[I++];
+      unsigned MIBsRead = 0;
+      while (MIBsRead++ < NumMIBs) {
+        assert(Record.size() - I >= 2);
+        AllocationType AllocType = (AllocationType)Record[I++];
+        unsigned NumStackEntries = Record[I++];
+        assert(Record.size() - I >= NumStackEntries);
+        SmallVector<unsigned> StackIdList;
+        for (unsigned J = 0; J < NumStackEntries; J++) {
+          assert(Record[I] < StackIds.size());
+          StackIdList.push_back(
+              TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
+        }
+        MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
+      }
+      assert(Record.size() - I >= NumVersions);
+      SmallVector<uint8_t> Versions;
+      for (unsigned J = 0; J < NumVersions; J++)
+        Versions.push_back(Record[I++]);
+      PendingAllocs.push_back(
+          AllocInfo(std::move(Versions), std::move(MIBs)));
+      break;
+    }
     }
   }
   llvm_unreachable("Exit infinite loop");
@@ -7803,14 +7921,15 @@ BitcodeModule::getLazyModule(LLVMContext &Context, bool ShouldLazyLoadMetadata,
 // We don't use ModuleIdentifier here because the client may need to control the
 // module path used in the combined summary (e.g. when reading summaries for
 // regular LTO modules).
-Error BitcodeModule::readSummary(ModuleSummaryIndex &CombinedIndex,
-                                 StringRef ModulePath, uint64_t ModuleId) {
+Error BitcodeModule::readSummary(
+    ModuleSummaryIndex &CombinedIndex, StringRef ModulePath, uint64_t ModuleId,
+    std::function<bool(GlobalValue::GUID)> IsPrevailing) {
   BitstreamCursor Stream(Buffer);
   if (Error JumpFailed = Stream.JumpToBit(ModuleBit))
     return JumpFailed;
 
   ModuleSummaryIndexBitcodeReader R(std::move(Stream), Strtab, CombinedIndex,
-                                    ModulePath, ModuleId);
+                                    ModulePath, ModuleId, IsPrevailing);
   return R.parseModule();
 }
 

diff  --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 4bf881a47917..bc81afbc4064 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -211,12 +211,10 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase {
   void writePerModuleGlobalValueSummary();
 
 private:
-  void writePerModuleFunctionSummaryRecord(SmallVector<uint64_t, 64> &NameVals,
-                                           GlobalValueSummary *Summary,
-                                           unsigned ValueID,
-                                           unsigned FSCallsAbbrev,
-                                           unsigned FSCallsProfileAbbrev,
-                                           const Function &F);
+  void writePerModuleFunctionSummaryRecord(
+      SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
+      unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
+      unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F);
   void writeModuleLevelReferences(const GlobalVariable &V,
                                   SmallVector<uint64_t, 64> &NameVals,
                                   unsigned FSModRefsAbbrev,
@@ -424,6 +422,11 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
   /// index and a value id generated by this class to use in references.
   std::map<GlobalValue::GUID, unsigned> GUIDToValueIdMap;
 
+  // The sorted stack id indices actually used in the summary entries being
+  // written, which will be a subset of those in the full index in the case of
+  // distributed indexes.
+  std::vector<unsigned> StackIdIndices;
+
   /// Tracks the last value id recorded in the GUIDToValueMap.
   unsigned GlobalValueId = 0;
 
@@ -441,9 +444,28 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
     // in writing out the call graph edges. Save the mapping from GUID
     // to the new global value id to use when writing those edges, which
     // are currently saved in the index in terms of GUID.
-    forEachSummary([&](GVInfo I, bool) {
+    forEachSummary([&](GVInfo I, bool IsAliasee) {
       GUIDToValueIdMap[I.first] = ++GlobalValueId;
+      if (IsAliasee)
+        return;
+      auto *FS = dyn_cast<FunctionSummary>(I.second);
+      if (!FS)
+        return;
+      // Record all stack id indices actually used in the summary entries being
+      // written, so that we can compact them in the case of distributed ThinLTO
+      // indexes.
+      for (auto &CI : FS->callsites())
+        for (auto Idx : CI.StackIdIndices)
+          StackIdIndices.push_back(Idx);
+      for (auto &AI : FS->allocs())
+        for (auto &MIB : AI.MIBs)
+          for (auto Idx : MIB.StackIdIndices)
+            StackIdIndices.push_back(Idx);
     });
+    llvm::sort(StackIdIndices);
+    StackIdIndices.erase(
+        std::unique(StackIdIndices.begin(), StackIdIndices.end()),
+        StackIdIndices.end());
   }
 
   /// The below iterator returns the GUID and associated summary.
@@ -3888,11 +3910,64 @@ static void writeTypeIdCompatibleVtableSummaryRecord(
   }
 }
 
+static void writeFunctionHeapProfileRecords(
+    BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev,
+    unsigned AllocAbbrev, bool PerModule,
+    std::function<unsigned(const ValueInfo &VI)> GetValueID,
+    std::function<unsigned(unsigned)> GetStackIndex) {
+  SmallVector<uint64_t> Record;
+
+  for (auto &CI : FS->callsites()) {
+    Record.clear();
+    // Per module callsite clones should always have a single entry of
+    // value 0.
+    assert(!PerModule || (CI.Clones.size() == 1 && CI.Clones[0] == 0));
+    Record.push_back(GetValueID(CI.Callee));
+    if (!PerModule) { // Combined records carry explicit element counts.
+      Record.push_back(CI.StackIdIndices.size());
+      Record.push_back(CI.Clones.size());
+    }
+    for (auto Id : CI.StackIdIndices)
+      Record.push_back(GetStackIndex(Id));
+    if (!PerModule) { // Clone versions are written for combined records only.
+      for (auto V : CI.Clones)
+        Record.push_back(V);
+    }
+    Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_CALLSITE_INFO
+                                : bitc::FS_COMBINED_CALLSITE_INFO,
+                      Record, CallsiteAbbrev);
+  }
+
+  for (auto &AI : FS->allocs()) {
+    Record.clear();
+    // Per module alloc versions should always have a single entry of
+    // value 0.
+    assert(!PerModule || (AI.Versions.size() == 1 && AI.Versions[0] == 0));
+    if (!PerModule) { // Combined records carry explicit element counts.
+      Record.push_back(AI.MIBs.size());
+      Record.push_back(AI.Versions.size());
+    }
+    for (auto &MIB : AI.MIBs) {
+      Record.push_back((uint8_t)MIB.AllocType);
+      Record.push_back(MIB.StackIdIndices.size());
+      for (auto Id : MIB.StackIdIndices)
+        Record.push_back(GetStackIndex(Id));
+    }
+    if (!PerModule) { // Versions are written for combined records only.
+      for (auto V : AI.Versions)
+        Record.push_back(V);
+    }
+    Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
+                                : bitc::FS_COMBINED_ALLOC_INFO,
+                      Record, AllocAbbrev);
+  }
+}
+
 // Helper to emit a single function summary record.
 void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
     SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
     unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
-    const Function &F) {
+    unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F) {
   NameVals.push_back(ValueID);
 
   FunctionSummary *FS = cast<FunctionSummary>(Summary);
@@ -3902,6 +3977,12 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
         return {VE.getValueID(VI.getValue())};
       });
 
+  writeFunctionHeapProfileRecords(
+      Stream, FS, CallsiteAbbrev, AllocAbbrev,
+      /*PerModule*/ true,
+      /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); },
+      /*GetStackIndex*/ [&](unsigned I) { return I; });
+
   auto SpecialRefCnts = FS->specialRefCounts();
   NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
   NameVals.push_back(FS->instCount());
@@ -4013,6 +4094,16 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
                       ArrayRef<uint64_t>{GVI.second, GVI.first});
   }
 
+  if (!Index->stackIds().empty()) {
+    auto StackIdAbbv = std::make_shared<BitCodeAbbrev>();
+    StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS));
+    // numids x stackid
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv));
+    Stream.EmitRecord(bitc::FS_STACK_IDS, Index->stackIds(), StackIdAbbvId);
+  }
+
   // Abbrev for FS_PERMODULE_PROFILE.
   auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
@@ -4084,6 +4175,21 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned TypeIdCompatibleVtableAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_CALLSITE_INFO));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  // n x stackidindex
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned CallsiteAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO));
+  // n x (alloc type, numstackids, numstackids x stackidindex)
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
   SmallVector<uint64_t, 64> NameVals;
   // Iterate over the list of functions instead of the Index to
   // ensure the ordering is stable.
@@ -4102,7 +4208,8 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     }
     auto *Summary = VI.getSummaryList()[0].get();
     writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F),
-                                        FSCallsAbbrev, FSCallsProfileAbbrev, F);
+                                        FSCallsAbbrev, FSCallsProfileAbbrev,
+                                        CallsiteAbbrev, AllocAbbrev, F);
   }
 
   // Capture references from GlobalVariable initializers, which are outside
@@ -4144,7 +4251,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
 
 /// Emit the combined summary section into the combined index file.
 void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
-  Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 3);
+  Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 4);
   Stream.EmitRecord(
       bitc::FS_VERSION,
       ArrayRef<uint64_t>{ModuleSummaryIndex::BitcodeSummaryVersion});
@@ -4157,6 +4264,21 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
                       ArrayRef<uint64_t>{GVI.second, GVI.first});
   }
 
+  if (!StackIdIndices.empty()) {
+    auto StackIdAbbv = std::make_shared<BitCodeAbbrev>();
+    StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS));
+    // numids x stackid
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv));
+    // Write the stack ids used by this index, which will be a subset of those in
+    // the full index in the case of distributed indexes.
+    std::vector<uint64_t> StackIds;
+    for (auto &I : StackIdIndices)
+      StackIds.push_back(Index.getStackIdAtIndex(I));
+    Stream.EmitRecord(bitc::FS_STACK_IDS, StackIds, StackIdAbbvId);
+  }
+
   // Abbrev for FS_COMBINED.
   auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED));
@@ -4210,6 +4332,26 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
   unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_CALLSITE_INFO));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numstackindices
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
+  // numstackindices x stackidindex, numver x version
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned CallsiteAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
+  // nummib x (alloc type, numstackids, numstackids x stackidindex),
+  // numver x version
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
   // The aliases are emitted as a post-pass, and will point to the value
   // id of the aliasee. Save them in a vector for post-processing.
   SmallVector<AliasSummary *, 64> Aliases;
@@ -4286,6 +4428,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     }
 
     auto GetValueId = [&](const ValueInfo &VI) -> Optional<unsigned> {
+      if (!VI)
+        return None;
       return getValueId(VI.getGUID());
     };
 
@@ -4293,6 +4437,27 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     writeFunctionTypeMetadataRecords(Stream, FS, GetValueId);
     getReferencedTypeIds(FS, ReferencedTypeIds);
 
+    writeFunctionHeapProfileRecords(
+        Stream, FS, CallsiteAbbrev, AllocAbbrev,
+        /*PerModule*/ false,
+        /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned {
+          Optional<unsigned> ValueID = GetValueId(VI);
+          // This can happen in shared index files for distributed ThinLTO if
+          // the callee function summary is not included. Record 0 which we
+          // will have to deal with conservatively when doing any kind of
+          // validation in the ThinLTO backends.
+          if (!ValueID)
+            return 0;
+          return *ValueID;
+        },
+        /*GetStackIndex*/ [&](unsigned I) {
+          // Get the corresponding index into the list of StackIdIndices
+          // actually being written for this combined index (which may be a
+          // subset in the case of distributed indexes).
+          auto Lower = llvm::lower_bound(StackIdIndices, I);
+          return std::distance(StackIdIndices.begin(), Lower);
+        });
+
     NameVals.push_back(*ValueId);
     NameVals.push_back(Index.getModuleId(FS->modulePath()));
     NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));

diff  --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 22e12b29843e..2f4ebcf26e6e 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -3192,6 +3192,80 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) {
   if (const auto *TIdInfo = FS->getTypeIdInfo())
     printTypeIdInfo(*TIdInfo);
 
+  // The AllocationType identifiers capture the profiled context behavior
+  // reaching a specific static allocation site (possibly cloned). Thus
+  // "notcoldandcold" implies there are multiple contexts which reach this site,
+  // some of which are cold and some of which are not, and that we need to
+  // disambiguate via cloning or other context identification.
+  auto AllocTypeName = [](uint8_t Type) -> const char * {
+    switch (Type) {
+    case (uint8_t)AllocationType::None:
+      return "none";
+    case (uint8_t)AllocationType::NotCold:
+      return "notcold";
+    case (uint8_t)AllocationType::Cold:
+      return "cold";
+    case (uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold:
+      return "notcoldandcold";
+    }
+    llvm_unreachable("Unexpected alloc type");
+  };
+
+  if (!FS->allocs().empty()) {
+    Out << ", allocs: (";
+    FieldSeparator AFS;
+    for (auto &AI : FS->allocs()) {
+      Out << AFS;
+      Out << "(versions: (";
+      FieldSeparator VFS;
+      for (auto V : AI.Versions) {
+        Out << VFS;
+        Out << AllocTypeName(V);
+      }
+      Out << "), memProf: (";
+      FieldSeparator MIBFS;
+      for (auto &MIB : AI.MIBs) {
+        Out << MIBFS;
+        Out << "(type: " << AllocTypeName((uint8_t)MIB.AllocType);
+        Out << ", stackIds: (";
+        FieldSeparator SIDFS;
+        for (auto Id : MIB.StackIdIndices) {
+          Out << SIDFS;
+          Out << TheIndex->getStackIdAtIndex(Id);
+        }
+        Out << "))";
+      }
+      Out << "))";
+    }
+    Out << ")";
+  }
+
+  if (!FS->callsites().empty()) {
+    Out << ", callsites: (";
+    FieldSeparator SNFS;
+    for (auto &CI : FS->callsites()) {
+      Out << SNFS;
+      if (CI.Callee)
+        Out << "(callee: ^" << Machine.getGUIDSlot(CI.Callee.getGUID());
+      else
+        Out << "(callee: null";
+      Out << ", clones: (";
+      FieldSeparator VFS;
+      for (auto V : CI.Clones) {
+        Out << VFS;
+        Out << V;
+      }
+      Out << "), stackIds: (";
+      FieldSeparator SIDFS;
+      for (auto Id : CI.StackIdIndices) {
+        Out << SIDFS;
+        Out << TheIndex->getStackIdAtIndex(Id);
+      }
+      Out << "))";
+    }
+    Out << ")";
+  }
+
   auto PrintRange = [&](const ConstantRange &Range) {
     Out << "[" << Range.getSignedMin() << ", " << Range.getSignedMax() << "]";
   };

diff  --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index dc28b681a151..9bfbabc17a08 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -911,9 +911,25 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
 Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
                       const SymbolResolution *&ResI,
                       const SymbolResolution *ResE) {
+  const SymbolResolution *ResITmp = ResI;
+  for (const InputFile::Symbol &Sym : Syms) {
+    assert(ResITmp != ResE);
+    SymbolResolution Res = *ResITmp++;
+
+    if (!Sym.getIRName().empty()) {
+      auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+          Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+      if (Res.Prevailing)
+        ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+    }
+  }
+
   if (Error Err =
           BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
-                         ThinLTO.ModuleMap.size()))
+                         ThinLTO.ModuleMap.size(), [&](GlobalValue::GUID GUID) {
+                           return ThinLTO.PrevailingModuleForGUID[GUID] ==
+                                  BM.getModuleIdentifier();
+                         }))
     return Err;
 
   for (const InputFile::Symbol &Sym : Syms) {
@@ -924,7 +940,8 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
       auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
           Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
       if (Res.Prevailing) {
-        ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+        assert(ThinLTO.PrevailingModuleForGUID[GUID] ==
+               BM.getModuleIdentifier());
 
         // For linker redefined symbols (via --wrap or --defsym) we want to
         // switch the linkage to `weak` to prevent IPOs from happening.
@@ -1454,6 +1471,7 @@ ThinBackend lto::createWriteIndexesThinBackend(
 
 Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
                       const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+  ThinLTO.CombinedIndex.releaseTemporaryMemory();
   timeTraceProfilerBegin("ThinLink", StringRef(""));
   auto TimeTraceScopeExit = llvm::make_scope_exit([]() {
     if (llvm::timeTraceProfilerEnabled())

diff  --git a/llvm/test/Assembler/thinlto-memprof-summary.ll b/llvm/test/Assembler/thinlto-memprof-summary.ll
new file mode 100644
index 000000000000..92e085e5473b
--- /dev/null
+++ b/llvm/test/Assembler/thinlto-memprof-summary.ll
@@ -0,0 +1,24 @@
+;; Test memprof summary parsing (tests all types/fields in various combinations).
+; RUN: llvm-as %s -o - | llvm-dis -o - | FileCheck %s
+
+; ModuleID = 'thinlto-memprof-summary.thinlto.bc'
+
+^0 = module: (path: "thinlto-memprof-summary.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049))
+;; Function with single alloc, multiple memprof MIBs, no versioning
+^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (8632435727821051414)), (type: cold, stackIds: (15025054523792398438, 12345678)), (type: notcoldandcold, stackIds: (23456789))))))))
+;; Function with callsite stack ids calling above function, no versioning
+^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (8632435727821051414)), (callee: ^1, clones: (0), stackIds: (15025054523792398438, 12345678)), (callee: ^1, clones: (0), stackIds: (23456789))))))
+;; Function with multiple allocs, multiple memprof MIBs, multiple versions
+^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: (3456789)), (type: cold, stackIds: (456789)))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: (3456789)), (type: notcold, stackIds: (456789))))))))
+;; Function with callsite stack ids calling above function, multiple versions
+^4 = gv: (guid: 27, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^3)), callsites: ((callee: ^3, clones: (0, 1), stackIds: (3456789)), (callee: ^3, clones: (1, 1), stackIds: (456789))))))
+;; Function with null callsite callee (can happen in distributed indexes if the callee summary is not included)
+^5 = gv: (guid: 28, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), callsites: ((callee: null, clones: (0), stackIds: (8632435727821051414))))))
+
+; Make sure we get back from llvm-dis what we put in via llvm-as.
+; CHECK: ^0 = module: (path: "thinlto-memprof-summary.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049))
+; CHECK: ^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (8632435727821051414)), (type: cold, stackIds: (15025054523792398438, 12345678)), (type: notcoldandcold, stackIds: (23456789))))))))
+; CHECK: ^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (8632435727821051414)), (callee: ^1, clones: (0), stackIds: (15025054523792398438, 12345678)), (callee: ^1, clones: (0), stackIds: (23456789))))))
+; CHECK: ^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: (3456789)), (type: cold, stackIds: (456789)))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: (3456789)), (type: notcold, stackIds: (456789))))))))
+; CHECK: ^4 = gv: (guid: 27, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^3)), callsites: ((callee: ^3, clones: (0, 1), stackIds: (3456789)), (callee: ^3, clones: (1, 1), stackIds: (456789))))))
+; CHECK: ^5 = gv: (guid: 28, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), callsites: ((callee: null, clones: (0), stackIds: (8632435727821051414))))))

diff  --git a/llvm/test/ThinLTO/X86/memprof-summary.ll b/llvm/test/ThinLTO/X86/memprof-summary.ll
new file mode 100644
index 000000000000..ca3b668484b9
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-summary.ll
@@ -0,0 +1,185 @@
+;; Check memprof summaries (per module, combined index, and distributed indexes)
+
+; RUN: split-file %s %t
+; RUN: opt -module-summary %t/a.ll -o %ta.bc
+; RUN: opt -module-summary %t/b.ll -o %tb.bc
+
+; RUN: llvm-dis -o - %ta.bc | FileCheck %s --check-prefix=PRELINKDISA
+; PRELINKDISA: gv: (name: "main", {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438)))))) ; guid = 15822663052811949562
+
+; RUN: llvm-dis -o - %tb.bc | FileCheck %s --check-prefix=PRELINKDISB
+; PRELINKDISB: gv: (name: "_Z3foov", {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (2732490490862098848)))))) ; guid = 9191153033785521275
+; PRELINKDISB: gv: (name: "_Z3bazv", {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (12481870273128938184)))))) ; guid = 15176620447596392000
+; PRELINKDISB: gv: (name: "_Z3barv", {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) ; guid = 17377440600225628772
+
+; RUN: llvm-bcanalyzer -dump %ta.bc | FileCheck %s --check-prefix=PRELINKBCANA
+; PRELINKBCANA: <STACK_IDS abbrevid=4 op0=8632435727821051414 op1=-3421689549917153178/>
+
+; RUN: llvm-bcanalyzer -dump %tb.bc | FileCheck %s --check-prefix=PRELINKBCANB
+; PRELINKBCANB: <STACK_IDS abbrevid=4 op0=-5964873800580613432 op1=2732490490862098848 op2=8632435727821051414 op3=-3421689549917153178/>
+
+; RUN: llvm-lto2 run %ta.bc %tb.bc -o %t -save-temps \
+; RUN:     -thinlto-distributed-indexes \
+; RUN:     -r=%ta.bc,main,plx \
+; RUN:     -r=%ta.bc,_Z3foov, \
+; RUN:     -r=%ta.bc,free, \
+; RUN:     -r=%ta.bc,sleep, \
+; RUN:     -r=%tb.bc,_Z3foov,pl \
+; RUN:     -r=%tb.bc,_Znam, \
+; RUN:     -r=%tb.bc,_Z3barv,pl \
+; RUN:     -r=%tb.bc,_Z3bazv,pl
+
+; RUN: llvm-dis -o - %t.index.bc | FileCheck %s --check-prefix=COMBINEDDIS
+; COMBINEDDIS: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (2732490490862098848))))))
+; COMBINEDDIS: gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^5, clones: (0), stackIds: (12481870273128938184))))))
+; COMBINEDDIS: gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438))))))
+; COMBINEDDIS: gv: (guid: 17377440600225628772, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438))))))))
+
+; RUN: llvm-bcanalyzer -dump %t.index.bc | FileCheck %s --check-prefix=COMBINEDBCAN
+; COMBINEDBCAN: <STACK_IDS abbrevid=4 op0=8632435727821051414 op1=-3421689549917153178 op2=-5964873800580613432 op3=2732490490862098848/>
+
+; RUN: llvm-dis -o - %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISA
+; DISTRIBUTEDDISA: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: null, clones: (0), stackIds: (2732490490862098848))))))
+; DISTRIBUTEDDISA: gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438))))))
+
+; RUN: llvm-dis -o - %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISB
+; DISTRIBUTEDDISB: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (2732490490862098848))))))
+; DISTRIBUTEDDISB: gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (12481870273128938184))))))
+; DISTRIBUTEDDISB: gv: (guid: 17377440600225628772, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438))))))))
+
+; RUN: llvm-bcanalyzer -dump %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANA
+; DISTRIBUTEDBCANA: <STACK_IDS abbrevid=4 op0=8632435727821051414 op1=-3421689549917153178 op2=2732490490862098848/>
+
+; RUN: llvm-bcanalyzer -dump %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANB
+; DISTRIBUTEDBCANB: <STACK_IDS abbrevid=4 op0=8632435727821051414 op1=-3421689549917153178 op2=-5964873800580613432 op3=2732490490862098848/>
+
+;--- a.ll
+; ModuleID = 'a.cc'
+source_filename = "a.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress norecurse uwtable
+define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #0 !dbg !39 {
+entry:
+  %call = call noundef ptr @_Z3foov(), !dbg !42, !callsite !43
+  %call1 = call noundef ptr @_Z3foov(), !dbg !44, !callsite !45
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call, i8 0, i64 10, i1 false), !dbg !46
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call1, i8 0, i64 10, i1 false), !dbg !47
+  call void @free(ptr noundef %call) #4, !dbg !48
+  %call2 = call i32 @sleep(i32 noundef 10), !dbg !49
+  call void @free(ptr noundef %call1) #4, !dbg !50
+  ret i32 0, !dbg !51
+}
+
+declare !dbg !52 noundef ptr @_Z3foov() local_unnamed_addr #1
+
+; Function Attrs: argmemonly mustprogress nocallback nofree nounwind willreturn writeonly
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+; Function Attrs: inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free")
+declare void @free(ptr allocptr nocapture noundef) local_unnamed_addr #3
+
+declare !dbg !53 i32 @sleep(i32 noundef) local_unnamed_addr #1
+
+attributes #0 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { argmemonly mustprogress nocallback nofree nounwind willreturn writeonly }
+attributes #3 = { inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free") "alloc-family"="malloc" "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git at github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "a.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ebabd56909271a1d4a7cac81c10624d5")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!39 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!40 = !DISubroutineType(types: !41)
+!41 = !{}
+!42 = !DILocation(line: 6, column: 13, scope: !39)
+!43 = !{i64 8632435727821051414}
+!44 = !DILocation(line: 7, column: 13, scope: !39)
+!45 = !{i64 -3421689549917153178}
+!46 = !DILocation(line: 8, column: 3, scope: !39)
+!47 = !DILocation(line: 9, column: 3, scope: !39)
+!48 = !DILocation(line: 10, column: 3, scope: !39)
+!49 = !DILocation(line: 11, column: 3, scope: !39)
+!50 = !DILocation(line: 12, column: 3, scope: !39)
+!51 = !DILocation(line: 13, column: 3, scope: !39)
+!52 = !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41)
+!53 = !DISubprogram(name: "sleep", scope: !54, file: !54, line: 453, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41)
+!54 = !DIFile(filename: "include/unistd.h", directory: "/usr", checksumkind: CSK_MD5, checksum: "ee8f41a17f563f029d0e930ad871815a")
+
+;--- b.ll
+; ModuleID = 'b.cc'
+source_filename = "b.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress noinline uwtable
+define dso_local noalias noundef nonnull ptr @_Z3barv() local_unnamed_addr #0 !dbg !39 {
+entry:
+  %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #2, !dbg !42, !memprof !43, !callsite !48
+  ret ptr %call, !dbg !49
+}
+
+; Function Attrs: nobuiltin allocsize(0)
+declare noundef nonnull ptr @_Znam(i64 noundef) local_unnamed_addr #1
+
+; Function Attrs: mustprogress noinline uwtable
+define dso_local noalias noundef nonnull ptr @_Z3bazv() local_unnamed_addr #0 !dbg !50 {
+entry:
+  %call = call noundef ptr @_Z3barv(), !dbg !51, !callsite !52
+  ret ptr %call, !dbg !53
+}
+
+; Function Attrs: mustprogress uwtable
+define dso_local noalias noundef nonnull ptr @_Z3foov() local_unnamed_addr #3 !dbg !54 {
+entry:
+  %call = call noundef ptr @_Z3bazv(), !dbg !55, !callsite !56
+  ret ptr %call, !dbg !57
+}
+
+attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { builtin allocsize(0) }
+attributes #3 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git at github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "b.cc", directory: ".", checksumkind: CSK_MD5, checksum: "335f81d275af57725cfc9ffc7be49bc2")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!39 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 1, type: !40, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!40 = !DISubroutineType(types: !41)
+!41 = !{}
+!42 = !DILocation(line: 2, column: 10, scope: !39)
+!43 = !{!44, !46}
+!44 = !{!45, !"notcold"}
+!45 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!46 = !{!47, !"cold"}
+!47 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+!48 = !{i64 9086428284934609951}
+!49 = !DILocation(line: 2, column: 3, scope: !39)
+!50 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!51 = !DILocation(line: 6, column: 10, scope: !50)
+!52 = !{i64 -5964873800580613432}
+!53 = !DILocation(line: 6, column: 3, scope: !50)
+!54 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 9, type: !40, scopeLine: 9, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!55 = !DILocation(line: 10, column: 10, scope: !54)
+!56 = !{i64 2732490490862098848}
+!57 = !DILocation(line: 10, column: 3, scope: !54)

diff  --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
index a3549aca81e5..be3b04175d5a 100644
--- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
+++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
@@ -34,6 +35,15 @@ class MemoryProfileInfoTest : public testing::Test {
     return Mod;
   }
 
+  std::unique_ptr<ModuleSummaryIndex> makeLLVMIndex(const char *Summary) {
+    SMDiagnostic Err;
+    std::unique_ptr<ModuleSummaryIndex> Index =
+        parseSummaryIndexAssemblyString(Summary, Err);
+    if (!Index)
+      Err.print("MemoryProfileInfoTest", errs());
+    return Index;
+  }
+
   // This looks for a call that has the given value name, which
   // is the name of the value being assigned the call return value.
   CallBase *findCall(Function &F, const char *Name = nullptr) {
@@ -359,4 +369,99 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
   }
 }
 
+TEST_F(MemoryProfileInfoTest, CallStackTestIR) { // Exercises CallStack over !memprof / !callsite metadata nodes.
+  LLVMContext C;
+  std::unique_ptr<Module> M = makeLLVMModule(C,
+                                             R"IR(
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+define ptr @test() {
+entry:
+  %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10), !memprof !1, !callsite !6
+  ret ptr %call
+}
+declare noundef nonnull ptr @_Znam(i64 noundef)
+!1 = !{!2, !4}
+!2 = !{!3, !"notcold"}
+!3 = !{i64 1, i64 2, i64 3, i64 4}
+!4 = !{!5, !"cold"}
+!5 = !{i64 1, i64 2, i64 3, i64 5}
+!6 = !{i64 1}
+)IR");
+
+  Function *Func = M->getFunction("test");
+  CallBase *Call = findCall(*Func, "call"); // The %call allocation carrying both metadata kinds.
+
+  CallStack<MDNode, MDNode::op_iterator> InstCallsite(
+      Call->getMetadata(LLVMContext::MD_callsite)); // Wraps !6, i.e. the single id 1.
+
+  MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); // !1: the list of MIB nodes (!2, !4).
+  bool First = true; // True only while visiting the first MIB; picks the expected ids.
+  for (auto &MIBOp : MemProfMD->operands()) {
+    auto *MIBMD = cast<const MDNode>(MIBOp);
+    MDNode *StackNode = getMIBStackNode(MIBMD); // Stack-id node of this MIB (!3 or !5 in the IR above).
+    CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
+    std::vector<uint64_t> StackIds;
+    for (auto ContextIter = StackContext.beginAfterSharedPrefix(InstCallsite); // Start past the ids shared with the callsite.
+         ContextIter != StackContext.end(); ++ContextIter)
+      StackIds.push_back(*ContextIter);
+    if (First) {
+      std::vector<uint64_t> Expected = {2, 3, 4}; // !3 = {1, 2, 3, 4} without the leading 1 from !6.
+      EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef(Expected));
+    } else {
+      std::vector<uint64_t> Expected = {2, 3, 5}; // !5 = {1, 2, 3, 5} without the leading 1 from !6.
+      EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef(Expected));
+    }
+    First = false;
+  }
+}
+
+TEST_F(MemoryProfileInfoTest, CallStackTestSummary) { // Exercises CallStack over summary callsite and alloc records.
+  std::unique_ptr<ModuleSummaryIndex> Index = makeLLVMIndex(R"Summary(
+^0 = module: (path: "test.o", hash: (0, 0, 0, 0, 0))
+^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (1, 2, 3, 4)), (type: cold, stackIds: (1, 2, 3, 5))))))))
+^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (3, 4)), (callee: ^1, clones: (0), stackIds: (3, 5))))))
+)Summary");
+
+  ASSERT_NE(Index, nullptr); // Stop here if no index was produced; everything below dereferences it.
+  auto *CallsiteSummary =
+      cast<FunctionSummary>(Index->getGlobalValueSummary(/*guid=*/25)); // ^2 above: the entry with callsite records.
+  bool First = true; // True only while visiting the first callsite record.
+  for (auto &CI : CallsiteSummary->callsites()) {
+    CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> InstCallsite(
+        &CI);
+    std::vector<uint64_t> StackIds;
+    for (auto StackIdIndex : InstCallsite) // Iteration yields indices, not the ids themselves.
+      StackIds.push_back(Index->getStackIdAtIndex(StackIdIndex)); // Map each index back to its stack id.
+    if (First) {
+      std::vector<uint64_t> Expected = {3, 4}; // stackIds of the first callsite record in ^2.
+      EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef(Expected));
+    } else {
+      std::vector<uint64_t> Expected = {3, 5}; // stackIds of the second callsite record in ^2.
+      EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef(Expected));
+    }
+    First = false;
+  }
+
+  auto *AllocSummary =
+      cast<FunctionSummary>(Index->getGlobalValueSummary(/*guid=*/23)); // ^1 above: the entry with alloc records.
+  for (auto &AI : AllocSummary->allocs()) {
+    bool First = true; // Reset per alloc; true only for its first MIB.
+    for (auto &MIB : AI.MIBs) {
+      CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> StackContext(
+          &MIB);
+      std::vector<uint64_t> StackIds;
+      for (auto StackIdIndex : StackContext) // Same index-to-id mapping as for callsites.
+        StackIds.push_back(Index->getStackIdAtIndex(StackIdIndex));
+      if (First) {
+        std::vector<uint64_t> Expected = {1, 2, 3, 4}; // Full stackIds of the notcold MIB in ^1.
+        EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef(Expected));
+      } else {
+        std::vector<uint64_t> Expected = {1, 2, 3, 5}; // Full stackIds of the cold MIB in ^1.
+        EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef(Expected));
+      }
+      First = false;
+    }
+  }
+}
 } // end anonymous namespace


        


More information about the llvm-commits mailing list