[llvm] 4745945 - [MemProf] ThinLTO summary support

Teresa Johnson via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 15 06:45:30 PST 2022


Author: Teresa Johnson
Date: 2022-11-15T06:45:12-08:00
New Revision: 47459455009db4790ffc3765a2ec0f8b4934c2a4

URL: https://github.com/llvm/llvm-project/commit/47459455009db4790ffc3765a2ec0f8b4934c2a4
DIFF: https://github.com/llvm/llvm-project/commit/47459455009db4790ffc3765a2ec0f8b4934c2a4.diff

LOG: [MemProf] ThinLTO summary support

Implements the ThinLTO summary support for memprof related metadata.

This includes support for the assembly format, and for building the
summary from IR during ModuleSummaryAnalysis.

To reduce space in both the bitcode format and the in memory index,
we do 2 things:
1. We keep a single vector of all uniq stack id hashes, and record the
   index into this vector in the callsite and allocation memprof
   summaries.
2. When building the combined index during the LTO link, the callsite
   and allocation memprof summaries are only kept on the FunctionSummary
   of the prevailing copy.

Differential Revision: https://reviews.llvm.org/D135714

Added: 
    llvm/test/Assembler/thinlto-memprof-summary.ll
    llvm/test/ThinLTO/X86/memprof-summary.ll

Modified: 
    llvm/include/llvm/Analysis/MemoryProfileInfo.h
    llvm/include/llvm/AsmParser/LLParser.h
    llvm/include/llvm/AsmParser/LLToken.h
    llvm/include/llvm/AsmParser/Parser.h
    llvm/include/llvm/Bitcode/BitcodeReader.h
    llvm/include/llvm/Bitcode/LLVMBitCodes.h
    llvm/include/llvm/IR/ModuleSummaryIndex.h
    llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
    llvm/lib/Analysis/MemoryProfileInfo.cpp
    llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
    llvm/lib/AsmParser/LLLexer.cpp
    llvm/lib/AsmParser/LLParser.cpp
    llvm/lib/AsmParser/Parser.cpp
    llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
    llvm/lib/Bitcode/Reader/BitcodeReader.cpp
    llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
    llvm/lib/IR/AsmWriter.cpp
    llvm/lib/LTO/LTO.cpp
    llvm/unittests/Analysis/MemoryProfileInfoTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index 1b12e78eaeba..4758cbc23fe3 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -17,18 +17,12 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include <map>
 
 namespace llvm {
 namespace memprof {
 
-// Allocation type assigned to an allocation reached by a given context.
-// More can be added but initially this is just noncold and cold.
-// Values should be powers of two so that they can be ORed, in particular to
-// track allocations that have 
diff erent behavior with 
diff erent calling
-// contexts.
-enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };
-
 /// Return the allocation type for a given set of memory profile values.
 AllocationType getAllocType(uint64_t MaxAccessCount, uint64_t MinSize,
                             uint64_t MinLifetime);
@@ -106,6 +100,62 @@ class CallStackTrie {
   bool buildAndAttachMIBMetadata(CallBase *CI);
 };
 
+/// Helper class to iterate through stack ids in both metadata (memprof MIB and
+/// callsite) and the corresponding ThinLTO summary data structures
+/// (CallsiteInfo and MIBInfo). This simplifies implementation of client code
+/// which doesn't need to worry about whether we are operating with IR (Regular
+/// LTO), or summary (ThinLTO).
+template <class NodeT, class IteratorT> class CallStack {
+public:
+  CallStack(const NodeT *N = nullptr) : N(N) {}
+
+  // Implement minimum required methods for range-based for loop.
+  // The default implementation assumes we are operating on ThinLTO data
+  // structures, which have a vector of StackIdIndices. There are specialized
+  // versions provided to iterate through metadata.
+  struct CallStackIterator {
+    const NodeT *N = nullptr;
+    IteratorT Iter;
+    CallStackIterator(const NodeT *N, bool End) : N(N) {
+      if (!N)
+        return;
+      Iter = End ? N->StackIdIndices.end() : N->StackIdIndices.begin();
+    }
+    uint64_t operator*() {
+      assert(Iter != N->StackIdIndices.end());
+      return *Iter;
+    }
+    bool operator==(const CallStackIterator &rhs) { return Iter == rhs.Iter; }
+    bool operator!=(const CallStackIterator &rhs) { return !(*this == rhs); }
+    void operator++() { ++Iter; }
+  };
+
+  bool empty() const { return N == nullptr; }
+
+  CallStackIterator begin() const {
+    return CallStackIterator(N, /*End*/ false);
+  }
+  CallStackIterator end() const { return CallStackIterator(N, /*End*/ true); }
+
+  CallStackIterator beginAfterSharedPrefix(CallStack &Other) {
+    CallStackIterator Cur = begin();
+    for (CallStackIterator OtherCur = Other.begin();
+         Cur != end() && OtherCur != Other.end(); ++Cur, ++OtherCur)
+      assert(*Cur == *OtherCur);
+    return Cur;
+  }
+
+private:
+  const NodeT *N = nullptr;
+};
+
+/// Specializations for iterating through IR metadata stack contexts.
+template <>
+CallStack<MDNode, MDNode::op_iterator>::CallStackIterator::CallStackIterator(
+    const MDNode *N, bool End);
+template <>
+uint64_t CallStack<MDNode, MDNode::op_iterator>::CallStackIterator::operator*();
+
 } // end namespace memprof
 } // end namespace llvm
 

diff  --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index e9813c34ce37..d936c6f36b6f 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -406,6 +406,10 @@ namespace llvm {
     void addGlobalValueToIndex(std::string Name, GlobalValue::GUID,
                                GlobalValue::LinkageTypes Linkage, unsigned ID,
                                std::unique_ptr<GlobalValueSummary> Summary);
+    bool parseOptionalAllocs(std::vector<AllocInfo> &Allocs);
+    bool parseMemProfs(std::vector<MIBInfo> &MIBs);
+    bool parseAllocType(uint8_t &AllocType);
+    bool parseOptionalCallsites(std::vector<CallsiteInfo> &Callsites);
 
     // Type Parsing.
     bool parseType(Type *&Result, const Twine &Msg, bool AllowVoid = false);

diff  --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h
index 5fc192f145aa..8a679007a504 100644
--- a/llvm/include/llvm/AsmParser/LLToken.h
+++ b/llvm/include/llvm/AsmParser/LLToken.h
@@ -406,6 +406,15 @@ enum Kind {
   kw_byte,
   kw_bit,
   kw_varFlags,
+  // The following are used by MemProf summary info.
+  kw_callsites,
+  kw_clones,
+  kw_stackIds,
+  kw_allocs,
+  kw_versions,
+  kw_memProf,
+  kw_notcold,
+  kw_notcoldandcold,
 
   // GV's with __attribute__((no_sanitize("address"))), or things in
   // -fsanitize-ignorelist when built with ASan.

diff  --git a/llvm/include/llvm/AsmParser/Parser.h b/llvm/include/llvm/AsmParser/Parser.h
index 6710ae6e358d..336e95c2399f 100644
--- a/llvm/include/llvm/AsmParser/Parser.h
+++ b/llvm/include/llvm/AsmParser/Parser.h
@@ -105,6 +105,17 @@ ParsedModuleAndIndex parseAssemblyFileWithIndexNoUpgradeDebugInfo(
 std::unique_ptr<ModuleSummaryIndex>
 parseSummaryIndexAssemblyFile(StringRef Filename, SMDiagnostic &Err);
 
+/// The function is a secondary interface to the LLVM Assembly Parser. It parses
+/// an ASCII string that (presumably) contains LLVM Assembly code for a module
+/// summary. It returns a a ModuleSummaryIndex with the corresponding features.
+/// Note that this does not verify that the generated Index is valid, so you
+/// should run the verifier after parsing the file to check that it is okay.
+/// Parse LLVM Assembly from a string
+/// \param AsmString The string containing assembly
+/// \param Err Error result info.
+std::unique_ptr<ModuleSummaryIndex>
+parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err);
+
 /// parseAssemblyFile and parseAssemblyString are wrappers around this function.
 /// Parse LLVM Assembly from a MemoryBuffer.
 /// \param F The MemoryBuffer containing assembly

diff  --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h
index 39ea48c33fc3..bc1f88f73229 100644
--- a/llvm/include/llvm/Bitcode/BitcodeReader.h
+++ b/llvm/include/llvm/Bitcode/BitcodeReader.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Bitstream/BitCodeEnums.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorOr.h"
@@ -117,8 +118,10 @@ typedef llvm::function_ref<Optional<std::string>(StringRef)>
 
     /// Parse the specified bitcode buffer and merge its module summary index
     /// into CombinedIndex.
-    Error readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath,
-                      uint64_t ModuleId);
+    Error
+    readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath,
+                uint64_t ModuleId,
+                std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr);
   };
 
   struct BitcodeFileContents {

diff  --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 74a51d5ce690..2b474b67425c 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -301,6 +301,22 @@ enum GlobalValueSummarySymtabCodes {
   // Range information for accessed offsets for every argument.
   // [n x (paramno, range, numcalls, numcalls x (callee_guid, paramno, range))]
   FS_PARAM_ACCESS = 25,
+  // Summary of per-module memprof callsite metadata.
+  // [valueid, n x stackidindex]
+  FS_PERMODULE_CALLSITE_INFO = 26,
+  // Summary of per-module allocation memprof metadata.
+  // [n x (alloc type, nummib, nummib x stackidindex)]
+  FS_PERMODULE_ALLOC_INFO = 27,
+  // Summary of combined index memprof callsite metadata.
+  // [valueid, numstackindices, numver,
+  //  numstackindices x stackidindex, numver x version]
+  FS_COMBINED_CALLSITE_INFO = 28,
+  // Summary of combined index allocation memprof metadata.
+  // [nummib, numver,
+  //  nummib x (alloc type, numstackids, numstackids x stackidindex),
+  //  numver x version]
+  FS_COMBINED_ALLOC_INFO = 29,
+  FS_STACK_IDS = 30,
 };
 
 enum MetadataCodes {

diff  --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 9f25b5a1e550..4bcf21edfecb 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
@@ -284,6 +285,79 @@ template <> struct DenseMapInfo<ValueInfo> {
   static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.getRef(); }
 };
 
+/// Summary of memprof callsite metadata.
+struct CallsiteInfo {
+  // Actual callee function.
+  ValueInfo Callee;
+
+  // Used to record whole program analysis cloning decisions.
+  // The ThinLTO backend will need to create as many clones as there are entries
+  // in the vector (it is expected and should be confirmed that all such
+  // summaries in the same FunctionSummary have the same number of entries).
+  // Each index records version info for the corresponding clone of this
+  // function. The value is the callee clone it calls (becomes the appended
+  // suffix id). Index 0 is the original version, and a value of 0 calls the
+  // original callee.
+  SmallVector<unsigned> Clones{0};
+
+  // Represents stack ids in this context, recorded as indices into the
+  // StackIds vector in the summary index, which in turn holds the full 64-bit
+  // stack ids. This reduces memory as there are in practice far fewer unique
+  // stack ids than stack id references.
+  SmallVector<unsigned> StackIdIndices;
+
+  CallsiteInfo(ValueInfo Callee, SmallVector<unsigned> StackIdIndices)
+      : Callee(Callee), StackIdIndices(std::move(StackIdIndices)) {}
+  CallsiteInfo(ValueInfo Callee, SmallVector<unsigned> Clones,
+               SmallVector<unsigned> StackIdIndices)
+      : Callee(Callee), Clones(std::move(Clones)),
+        StackIdIndices(std::move(StackIdIndices)) {}
+};
+
+// Allocation type assigned to an allocation reached by a given context.
+// More can be added but initially this is just noncold and cold.
+// Values should be powers of two so that they can be ORed, in particular to
+// track allocations that have 
diff erent behavior with 
diff erent calling
+// contexts.
+enum class AllocationType : uint8_t { None = 0, NotCold = 1, Cold = 2 };
+
+/// Summary of a single MIB in a memprof metadata on allocations.
+struct MIBInfo {
+  // The allocation type for this profiled context.
+  AllocationType AllocType;
+
+  // Represents stack ids in this context, recorded as indices into the
+  // StackIds vector in the summary index, which in turn holds the full 64-bit
+  // stack ids. This reduces memory as there are in practice far fewer unique
+  // stack ids than stack id references.
+  SmallVector<unsigned> StackIdIndices;
+
+  MIBInfo(AllocationType AllocType, SmallVector<unsigned> StackIdIndices)
+      : AllocType(AllocType), StackIdIndices(std::move(StackIdIndices)) {}
+};
+
+/// Summary of memprof metadata on allocations.
+struct AllocInfo {
+  // Used to record whole program analysis cloning decisions.
+  // The ThinLTO backend will need to create as many clones as there are entries
+  // in the vector (it is expected and should be confirmed that all such
+  // summaries in the same FunctionSummary have the same number of entries).
+  // Each index records version info for the corresponding clone of this
+  // function. The value is the allocation type of the corresponding allocation.
+  // Index 0 is the original version. Before cloning, index 0 may have more than
+  // one allocation type.
+  SmallVector<uint8_t> Versions;
+
+  // Vector of MIBs in this memprof metadata.
+  std::vector<MIBInfo> MIBs;
+
+  AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
+    Versions.push_back(0);
+  }
+  AllocInfo(SmallVector<uint8_t> Versions, std::vector<MIBInfo> MIBs)
+      : Versions(std::move(Versions)), MIBs(std::move(MIBs)) {}
+};
+
 /// Function and variable summary information to aid decisions and
 /// implementation of importing.
 class GlobalValueSummary {
@@ -678,7 +752,8 @@ class FunctionSummary : public GlobalValueSummary {
         std::vector<FunctionSummary::VFuncId>(),
         std::vector<FunctionSummary::ConstVCall>(),
         std::vector<FunctionSummary::ConstVCall>(),
-        std::vector<FunctionSummary::ParamAccess>());
+        std::vector<FunctionSummary::ParamAccess>(),
+        std::vector<CallsiteInfo>(), std::vector<AllocInfo>());
   }
 
   /// A dummy node to reference external functions that aren't in the index
@@ -706,6 +781,25 @@ class FunctionSummary : public GlobalValueSummary {
   using ParamAccessesTy = std::vector<ParamAccess>;
   std::unique_ptr<ParamAccessesTy> ParamAccesses;
 
+  /// Optional list of memprof callsite metadata summaries. The correspondence
+  /// between the callsite summary and the callsites in the function is implied
+  /// by the order in the vector (and can be validated by comparing the stack
+  /// ids in the CallsiteInfo to those in the instruction callsite metadata).
+  /// As a memory savings optimization, we only create these for the prevailing
+  /// copy of a symbol when creating the combined index during LTO.
+  using CallsitesTy = std::vector<CallsiteInfo>;
+  std::unique_ptr<CallsitesTy> Callsites;
+
+  /// Optional list of allocation memprof metadata summaries. The correspondence
+  /// between the alloc memprof summary and the allocation callsites in the
+  /// function is implied by the order in the vector (and can be validated by
+  /// comparing the stack ids in the AllocInfo to those in the instruction
+  /// memprof metadata).
+  /// As a memory savings optimization, we only create these for the prevailing
+  /// copy of a symbol when creating the combined index during LTO.
+  using AllocsTy = std::vector<AllocInfo>;
+  std::unique_ptr<AllocsTy> Allocs;
+
 public:
   FunctionSummary(GVFlags Flags, unsigned NumInsts, FFlags FunFlags,
                   uint64_t EntryCount, std::vector<ValueInfo> Refs,
@@ -715,7 +809,8 @@ class FunctionSummary : public GlobalValueSummary {
                   std::vector<VFuncId> TypeCheckedLoadVCalls,
                   std::vector<ConstVCall> TypeTestAssumeConstVCalls,
                   std::vector<ConstVCall> TypeCheckedLoadConstVCalls,
-                  std::vector<ParamAccess> Params)
+                  std::vector<ParamAccess> Params, CallsitesTy CallsiteList,
+                  AllocsTy AllocList)
       : GlobalValueSummary(FunctionKind, Flags, std::move(Refs)),
         InstCount(NumInsts), FunFlags(FunFlags), EntryCount(EntryCount),
         CallGraphEdgeList(std::move(CGEdges)) {
@@ -729,6 +824,10 @@ class FunctionSummary : public GlobalValueSummary {
                      std::move(TypeCheckedLoadConstVCalls)});
     if (!Params.empty())
       ParamAccesses = std::make_unique<ParamAccessesTy>(std::move(Params));
+    if (!CallsiteList.empty())
+      Callsites = std::make_unique<CallsitesTy>(std::move(CallsiteList));
+    if (!AllocList.empty())
+      Allocs = std::make_unique<AllocsTy>(std::move(AllocList));
   }
   // Gets the number of readonly and writeonly refs in RefEdgeList
   std::pair<unsigned, unsigned> specialRefCounts() const;
@@ -832,6 +931,18 @@ class FunctionSummary : public GlobalValueSummary {
 
   const TypeIdInfo *getTypeIdInfo() const { return TIdInfo.get(); };
 
+  ArrayRef<CallsiteInfo> callsites() const {
+    if (Callsites)
+      return *Callsites;
+    return {};
+  }
+
+  ArrayRef<AllocInfo> allocs() const {
+    if (Allocs)
+      return *Allocs;
+    return {};
+  }
+
   friend struct GraphTraits<ValueInfo>;
 };
 
@@ -1163,6 +1274,16 @@ class ModuleSummaryIndex {
   // the total number of basic blocks in the LTO unit in the combined index.
   uint64_t BlockCount;
 
+  // List of unique stack ids (hashes). We use a 4B index of the id in the
+  // stack id lists on the alloc and callsite summaries for memory savings,
+  // since the number of unique ids is in practice much smaller than the
+  // number of stack id references in the summaries.
+  std::vector<uint64_t> StackIds;
+
+  // Temporary map while building StackIds list. Clear when index is completely
+  // built via releaseTemporaryMemory.
+  std::map<uint64_t, unsigned> StackIdToIndex;
+
   // YAML I/O support.
   friend yaml::MappingTraits<ModuleSummaryIndex>;
 
@@ -1205,6 +1326,31 @@ class ModuleSummaryIndex {
   const_gvsummary_iterator end() const { return GlobalValueMap.end(); }
   size_t size() const { return GlobalValueMap.size(); }
 
+  const std::vector<uint64_t> &stackIds() const { return StackIds; }
+
+  unsigned addOrGetStackIdIndex(uint64_t StackId) {
+    auto Inserted = StackIdToIndex.insert({StackId, StackIds.size()});
+    if (Inserted.second)
+      StackIds.push_back(StackId);
+    return Inserted.first->second;
+  }
+
+  uint64_t getStackIdAtIndex(unsigned Index) const {
+    assert(StackIds.size() > Index);
+    return StackIds[Index];
+  }
+
+  // Facility to release memory from data structures only needed during index
+  // construction (including while building combined index). Currently this only
+  // releases the temporary map used while constructing a correspondence between
+  // stack ids and their index in the StackIds vector. Mostly impactful when
+  // building a large combined index.
+  void releaseTemporaryMemory() {
+    assert(StackIdToIndex.size() == StackIds.size());
+    StackIdToIndex.clear();
+    StackIds.shrink_to_fit();
+  }
+
   /// Convenience function for doing a DFS on a ValueInfo. Marks the function in
   /// the FunctionHasParent map.
   static void discoverNodes(ValueInfo V,

diff  --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
index 74e92797f15c..33e57e5f2102 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -234,7 +234,8 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
           std::move(FSum.TypeCheckedLoadVCalls),
           std::move(FSum.TypeTestAssumeConstVCalls),
           std::move(FSum.TypeCheckedLoadConstVCalls),
-          ArrayRef<FunctionSummary::ParamAccess>{}));
+          ArrayRef<FunctionSummary::ParamAccess>{}, ArrayRef<CallsiteInfo>{},
+          ArrayRef<AllocInfo>{}));
     }
   }
   static void output(IO &io, GlobalValueSummaryMapTy &V) {

diff  --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index f28ddbbfc849..f42fff5f195f 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -224,3 +224,21 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
   CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));
   return true;
 }
+
+template <>
+CallStack<MDNode, MDNode::op_iterator>::CallStackIterator::CallStackIterator(
+    const MDNode *N, bool End)
+    : N(N) {
+  if (!N)
+    return;
+  Iter = End ? N->op_end() : N->op_begin();
+}
+
+template <>
+uint64_t
+CallStack<MDNode, MDNode::op_iterator>::CallStackIterator::operator*() {
+  assert(Iter != N->op_end());
+  ConstantInt *StackIdCInt = mdconst::dyn_extract<ConstantInt>(*Iter);
+  assert(StackIdCInt);
+  return StackIdCInt->getZExtValue();
+}

diff  --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 52827c210b5a..e8309f51a498 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/StackSafetyAnalysis.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
@@ -56,6 +57,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace llvm::memprof;
 
 #define DEBUG_TYPE "module-summary-analysis"
 
@@ -275,6 +277,9 @@ static void computeFunctionSummary(
   std::vector<const Instruction *> NonVolatileLoads;
   std::vector<const Instruction *> NonVolatileStores;
 
+  std::vector<CallsiteInfo> Callsites;
+  std::vector<AllocInfo> Allocs;
+
   bool HasInlineAsmMaybeReferencingInternal = false;
   bool HasIndirBranchToBlockAddress = false;
   bool HasUnknownCall = false;
@@ -417,6 +422,57 @@ static void computeFunctionSummary(
           CallGraphEdges[Index.getOrInsertValueInfo(Candidate.Value)]
               .updateHotness(getHotness(Candidate.Count, PSI));
       }
+
+      // TODO: Skip indirect calls for now. Need to handle these better, likely
+      // by creating multiple Callsites, one per target, then speculatively
+      // devirtualize while applying clone info in the ThinLTO backends. This
+      // will also be important because we will have a 
diff erent set of clone
+      // versions per target. This handling needs to match that in the ThinLTO
+      // backend so we handle things consistently for matching of callsite
+      // summaries to instructions.
+      if (!CalledFunction)
+        continue;
+
+      // Compute the list of stack ids first (so we can trim them from the stack
+      // ids on any MIBs).
+      CallStack<MDNode, MDNode::op_iterator> InstCallsite(
+          I.getMetadata(LLVMContext::MD_callsite));
+      auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
+      if (MemProfMD) {
+        std::vector<MIBInfo> MIBs;
+        for (auto &MDOp : MemProfMD->operands()) {
+          auto *MIBMD = cast<const MDNode>(MDOp);
+          MDNode *StackNode = getMIBStackNode(MIBMD);
+          assert(StackNode);
+          SmallVector<unsigned> StackIdIndices;
+          CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
+          // Collapse out any on the allocation call (inlining).
+          for (auto ContextIter =
+                   StackContext.beginAfterSharedPrefix(InstCallsite);
+               ContextIter != StackContext.end(); ++ContextIter) {
+            unsigned StackIdIdx = Index.addOrGetStackIdIndex(*ContextIter);
+            // If this is a direct recursion, simply skip the duplicate
+            // entries. If this is mutual recursion, handling is left to
+            // the LTO link analysis client.
+            if (StackIdIndices.empty() || StackIdIndices.back() != StackIdIdx)
+              StackIdIndices.push_back(StackIdIdx);
+          }
+          MIBs.push_back(
+              MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices)));
+        }
+        Allocs.push_back(AllocInfo(std::move(MIBs)));
+      } else if (!InstCallsite.empty()) {
+        SmallVector<unsigned> StackIdIndices;
+        for (auto StackId : InstCallsite)
+          StackIdIndices.push_back(Index.addOrGetStackIdIndex(StackId));
+        // Use the original CalledValue, in case it was an alias. We want
+        // to record the call edge to the alias in that case. Eventually
+        // an alias summary will be created to associate the alias and
+        // aliasee.
+        auto CalleeValueInfo =
+            Index.getOrInsertValueInfo(cast<GlobalValue>(CalledValue));
+        Callsites.push_back({CalleeValueInfo, StackIdIndices});
+      }
     }
   }
   Index.addBlockCount(F.size());
@@ -508,7 +564,8 @@ static void computeFunctionSummary(
       CallGraphEdges.takeVector(), TypeTests.takeVector(),
       TypeTestAssumeVCalls.takeVector(), TypeCheckedLoadVCalls.takeVector(),
       TypeTestAssumeConstVCalls.takeVector(),
-      TypeCheckedLoadConstVCalls.takeVector(), std::move(ParamAccesses));
+      TypeCheckedLoadConstVCalls.takeVector(), std::move(ParamAccesses),
+      std::move(Callsites), std::move(Allocs));
   if (NonRenamableLocal)
     CantBePromoted.insert(F.getGUID());
   Index.addGlobalValueSummary(F, std::move(FuncSummary));
@@ -757,7 +814,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
                     ArrayRef<FunctionSummary::VFuncId>{},
                     ArrayRef<FunctionSummary::ConstVCall>{},
                     ArrayRef<FunctionSummary::ConstVCall>{},
-                    ArrayRef<FunctionSummary::ParamAccess>{});
+                    ArrayRef<FunctionSummary::ParamAccess>{},
+                    ArrayRef<CallsiteInfo>{}, ArrayRef<AllocInfo>{});
             Index.addGlobalValueSummary(*GV, std::move(Summary));
           } else {
             std::unique_ptr<GlobalVarSummary> Summary =

diff  --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 2a171df168fc..c33dc9710f35 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -772,6 +772,14 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(byte);
   KEYWORD(bit);
   KEYWORD(varFlags);
+  KEYWORD(callsites);
+  KEYWORD(clones);
+  KEYWORD(stackIds);
+  KEYWORD(allocs);
+  KEYWORD(versions);
+  KEYWORD(memProf);
+  KEYWORD(notcold);
+  KEYWORD(notcoldandcold);
 
 #undef KEYWORD
 

diff  --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 7fc51682533f..dde0672da8e4 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -8682,6 +8682,8 @@ bool LLParser::parseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
   FunctionSummary::TypeIdInfo TypeIdInfo;
   std::vector<FunctionSummary::ParamAccess> ParamAccesses;
   std::vector<ValueInfo> Refs;
+  std::vector<CallsiteInfo> Callsites;
+  std::vector<AllocInfo> Allocs;
   // Default is all-zeros (conservative values).
   FunctionSummary::FFlags FFlags = {};
   if (parseToken(lltok::colon, "expected ':' here") ||
@@ -8716,6 +8718,14 @@ bool LLParser::parseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
       if (parseOptionalParamAccesses(ParamAccesses))
         return true;
       break;
+    case lltok::kw_allocs:
+      if (parseOptionalAllocs(Allocs))
+        return true;
+      break;
+    case lltok::kw_callsites:
+      if (parseOptionalCallsites(Callsites))
+        return true;
+      break;
     default:
       return error(Lex.getLoc(), "expected optional function summary field");
     }
@@ -8731,7 +8741,7 @@ bool LLParser::parseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
       std::move(TypeIdInfo.TypeCheckedLoadVCalls),
       std::move(TypeIdInfo.TypeTestAssumeConstVCalls),
       std::move(TypeIdInfo.TypeCheckedLoadConstVCalls),
-      std::move(ParamAccesses));
+      std::move(ParamAccesses), std::move(Callsites), std::move(Allocs));
 
   FS->setModulePath(ModulePath);
 
@@ -9683,3 +9693,220 @@ bool LLParser::parseGVReference(ValueInfo &VI, unsigned &GVId) {
     VI.setWriteOnly();
   return false;
 }
+
+/// OptionalAllocs
+///   := 'allocs' ':' '(' Alloc [',' Alloc]* ')'
+/// Alloc ::= '(' 'versions' ':' '(' Version [',' Version]* ')'
+///              ',' MemProfs ')'
+/// Version ::= UInt32
+bool LLParser::parseOptionalAllocs(std::vector<AllocInfo> &Allocs) {
+  assert(Lex.getKind() == lltok::kw_allocs);
+  Lex.Lex();
+
+  if (parseToken(lltok::colon, "expected ':' in allocs") ||
+      parseToken(lltok::lparen, "expected '(' in allocs"))
+    return true;
+
+  // parse each alloc
+  do {
+    if (parseToken(lltok::lparen, "expected '(' in alloc") ||
+        parseToken(lltok::kw_versions, "expected 'versions' in alloc") ||
+        parseToken(lltok::colon, "expected ':'") ||
+        parseToken(lltok::lparen, "expected '(' in versions"))
+      return true;
+
+    SmallVector<uint8_t> Versions;
+    do {
+      uint8_t V = 0;
+      if (parseAllocType(V))
+        return true;
+      Versions.push_back(V);
+    } while (EatIfPresent(lltok::comma));
+
+    if (parseToken(lltok::rparen, "expected ')' in versions") ||
+        parseToken(lltok::comma, "expected ',' in alloc"))
+      return true;
+
+    std::vector<MIBInfo> MIBs;
+    if (parseMemProfs(MIBs))
+      return true;
+
+    Allocs.push_back({Versions, MIBs});
+
+    if (parseToken(lltok::rparen, "expected ')' in alloc"))
+      return true;
+  } while (EatIfPresent(lltok::comma));
+
+  if (parseToken(lltok::rparen, "expected ')' in allocs"))
+    return true;
+
+  return false;
+}
+
+/// MemProfs
+///   := 'memProf' ':' '(' MemProf [',' MemProf]* ')'
+/// MemProf ::= '(' 'type' ':' AllocType
+///              ',' 'stackIds' ':' '(' StackId [',' StackId]* ')' ')'
+/// StackId ::= UInt64
+bool LLParser::parseMemProfs(std::vector<MIBInfo> &MIBs) {
+  assert(Lex.getKind() == lltok::kw_memProf);
+  Lex.Lex();
+
+  if (parseToken(lltok::colon, "expected ':' in memprof") ||
+      parseToken(lltok::lparen, "expected '(' in memprof"))
+    return true;
+
+  // parse each MIB
+  do {
+    if (parseToken(lltok::lparen, "expected '(' in memprof") ||
+        parseToken(lltok::kw_type, "expected 'type' in memprof") ||
+        parseToken(lltok::colon, "expected ':'"))
+      return true;
+
+    uint8_t AllocType;
+    if (parseAllocType(AllocType))
+      return true;
+
+    if (parseToken(lltok::comma, "expected ',' in memprof") ||
+        parseToken(lltok::kw_stackIds, "expected 'stackIds' in memprof") ||
+        parseToken(lltok::colon, "expected ':'") ||
+        parseToken(lltok::lparen, "expected '(' in stackIds"))
+      return true;
+
+    SmallVector<unsigned> StackIdIndices;
+    do {
+      uint64_t StackId = 0;
+      if (parseUInt64(StackId))
+        return true;
+      StackIdIndices.push_back(Index->addOrGetStackIdIndex(StackId));
+    } while (EatIfPresent(lltok::comma));
+
+    if (parseToken(lltok::rparen, "expected ')' in stackIds"))
+      return true;
+
+    MIBs.push_back({(AllocationType)AllocType, StackIdIndices});
+
+    if (parseToken(lltok::rparen, "expected ')' in memprof"))
+      return true;
+  } while (EatIfPresent(lltok::comma));
+
+  if (parseToken(lltok::rparen, "expected ')' in memprof"))
+    return true;
+
+  return false;
+}
+
+/// AllocType
+///   := ('none'|'notcold'|'cold'|'notcoldandcold')
+bool LLParser::parseAllocType(uint8_t &AllocType) {
+  switch (Lex.getKind()) {
+  case lltok::kw_none:
+    AllocType = (uint8_t)AllocationType::None;
+    break;
+  case lltok::kw_notcold:
+    AllocType = (uint8_t)AllocationType::NotCold;
+    break;
+  case lltok::kw_cold:
+    AllocType = (uint8_t)AllocationType::Cold;
+    break;
+  case lltok::kw_notcoldandcold:
+    AllocType =
+        (uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold;
+    break;
+  default:
+    return error(Lex.getLoc(), "invalid alloc type");
+  }
+  Lex.Lex();
+  return false;
+}
+
+/// OptionalCallsites
+///   := 'callsites' ':' '(' Callsite [',' Callsite]* ')'
+/// Callsite ::= '(' 'callee' ':' GVReference
+///              ',' 'clones' ':' '(' Version [',' Version]* ')'
+///              ',' 'stackIds' ':' '(' StackId [',' StackId]* ')' ')'
+/// Version ::= UInt32
+/// StackId ::= UInt64
+bool LLParser::parseOptionalCallsites(std::vector<CallsiteInfo> &Callsites) {
+  assert(Lex.getKind() == lltok::kw_callsites);
+  Lex.Lex();
+
+  if (parseToken(lltok::colon, "expected ':' in callsites") ||
+      parseToken(lltok::lparen, "expected '(' in callsites"))
+    return true;
+
+  IdToIndexMapType IdToIndexMap;
+  // parse each callsite
+  do {
+    if (parseToken(lltok::lparen, "expected '(' in callsite") ||
+        parseToken(lltok::kw_callee, "expected 'callee' in callsite") ||
+        parseToken(lltok::colon, "expected ':'"))
+      return true;
+
+    ValueInfo VI;
+    unsigned GVId = 0;
+    LocTy Loc = Lex.getLoc();
+    if (!EatIfPresent(lltok::kw_null)) {
+      if (parseGVReference(VI, GVId))
+        return true;
+    }
+
+    if (parseToken(lltok::comma, "expected ',' in callsite") ||
+        parseToken(lltok::kw_clones, "expected 'clones' in callsite") ||
+        parseToken(lltok::colon, "expected ':'") ||
+        parseToken(lltok::lparen, "expected '(' in clones"))
+      return true;
+
+    SmallVector<unsigned> Clones;
+    do {
+      unsigned V = 0;
+      if (parseUInt32(V))
+        return true;
+      Clones.push_back(V);
+    } while (EatIfPresent(lltok::comma));
+
+    if (parseToken(lltok::rparen, "expected ')' in clones") ||
+        parseToken(lltok::comma, "expected ',' in callsite") ||
+        parseToken(lltok::kw_stackIds, "expected 'stackIds' in callsite") ||
+        parseToken(lltok::colon, "expected ':'") ||
+        parseToken(lltok::lparen, "expected '(' in stackIds"))
+      return true;
+
+    SmallVector<unsigned> StackIdIndices;
+    do {
+      uint64_t StackId = 0;
+      if (parseUInt64(StackId))
+        return true;
+      StackIdIndices.push_back(Index->addOrGetStackIdIndex(StackId));
+    } while (EatIfPresent(lltok::comma));
+
+    if (parseToken(lltok::rparen, "expected ')' in stackIds"))
+      return true;
+
+    // Keep track of the Callsites array index needing a forward reference.
+    // We will save the location of the ValueInfo needing an update, but
+    // can only do so once the SmallVector is finalized.
+    if (VI.getRef() == FwdVIRef)
+      IdToIndexMap[GVId].push_back(std::make_pair(Callsites.size(), Loc));
+    Callsites.push_back({VI, Clones, StackIdIndices});
+
+    if (parseToken(lltok::rparen, "expected ')' in callsite"))
+      return true;
+  } while (EatIfPresent(lltok::comma));
+
+  // Now that the Callsites vector is finalized, it is safe to save the
+  // locations of any forward GV references that need updating later.
+  for (auto I : IdToIndexMap) {
+    auto &Infos = ForwardRefValueInfos[I.first];
+    for (auto P : I.second) {
+      assert(Callsites[P.first].Callee.getRef() == FwdVIRef &&
+             "Forward referenced ValueInfo expected to be empty");
+      Infos.emplace_back(&Callsites[P.first].Callee, P.second);
+    }
+  }
+
+  if (parseToken(lltok::rparen, "expected ')' in callsites"))
+    return true;
+
+  return false;
+}

diff  --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp
index 95b9079f0f9c..4458ae757c16 100644
--- a/llvm/lib/AsmParser/Parser.cpp
+++ b/llvm/lib/AsmParser/Parser.cpp
@@ -177,6 +177,12 @@ llvm::parseSummaryIndexAssemblyFile(StringRef Filename, SMDiagnostic &Err) {
   return parseSummaryIndexAssembly(FileOrErr.get()->getMemBufferRef(), Err);
 }
 
+std::unique_ptr<ModuleSummaryIndex>
+llvm::parseSummaryIndexAssemblyString(StringRef AsmString, SMDiagnostic &Err) {
+  MemoryBufferRef F(AsmString, "<string>");
+  return parseSummaryIndexAssembly(F, Err);
+}
+
 Constant *llvm::parseConstantValue(StringRef Asm, SMDiagnostic &Err,
                                    const Module &M, const SlotMapping *Slots) {
   SourceMgr SM;

diff  --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index dd3cac8b8a6f..97bc828066a5 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -315,6 +315,11 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, TYPE_ID_METADATA)
       STRINGIFY_CODE(FS, BLOCK_COUNT)
       STRINGIFY_CODE(FS, PARAM_ACCESS)
+      STRINGIFY_CODE(FS, PERMODULE_CALLSITE_INFO)
+      STRINGIFY_CODE(FS, PERMODULE_ALLOC_INFO)
+      STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO)
+      STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
+      STRINGIFY_CODE(FS, STACK_IDS)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {

diff  --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 0a2415cbfdef..c4805602d1ad 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -883,8 +883,10 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   // they are recorded in the summary index being built.
   // We save a GUID which refers to the same global as the ValueInfo, but
   // ignoring the linkage, i.e. for values other than local linkage they are
-  // identical.
-  DenseMap<unsigned, std::tuple<ValueInfo, GlobalValue::GUID>>
+  // identical (this is the second tuple member).
+  // The third tuple member is the real GUID of the ValueInfo.
+  DenseMap<unsigned,
+           std::tuple<ValueInfo, GlobalValue::GUID, GlobalValue::GUID>>
       ValueIdToValueInfoMap;
 
   /// Map populated during module path string table parsing, from the
@@ -904,10 +906,19 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   /// this module by the client.
   unsigned ModuleId;
 
+  /// Callback to ask whether a symbol is the prevailing copy when invoked
+  /// during combined index building.
+  std::function<bool(GlobalValue::GUID)> IsPrevailing;
+
+  /// Saves the stack ids from the STACK_IDS record to consult when adding stack
+  /// ids from the lists in the callsite and alloc entries to the index.
+  std::vector<uint64_t> StackIds;
+
 public:
-  ModuleSummaryIndexBitcodeReader(BitstreamCursor Stream, StringRef Strtab,
-                                  ModuleSummaryIndex &TheIndex,
-                                  StringRef ModulePath, unsigned ModuleId);
+  ModuleSummaryIndexBitcodeReader(
+      BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex,
+      StringRef ModulePath, unsigned ModuleId,
+      std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr);
 
   Error parseModule();
 
@@ -931,7 +942,8 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   std::vector<FunctionSummary::ParamAccess>
   parseParamAccesses(ArrayRef<uint64_t> Record);
 
-  std::tuple<ValueInfo, GlobalValue::GUID>
+  template <bool AllowNullValueInfo = false>
+  std::tuple<ValueInfo, GlobalValue::GUID, GlobalValue::GUID>
   getValueInfoFromValueId(unsigned ValueId);
 
   void addThisModule();
@@ -6643,9 +6655,10 @@ std::vector<StructType *> BitcodeReader::getIdentifiedStructTypes() const {
 
 ModuleSummaryIndexBitcodeReader::ModuleSummaryIndexBitcodeReader(
     BitstreamCursor Cursor, StringRef Strtab, ModuleSummaryIndex &TheIndex,
-    StringRef ModulePath, unsigned ModuleId)
+    StringRef ModulePath, unsigned ModuleId,
+    std::function<bool(GlobalValue::GUID)> IsPrevailing)
     : BitcodeReaderBase(std::move(Cursor), Strtab), TheIndex(TheIndex),
-      ModulePath(ModulePath), ModuleId(ModuleId) {}
+      ModulePath(ModulePath), ModuleId(ModuleId), IsPrevailing(IsPrevailing) {}
 
 void ModuleSummaryIndexBitcodeReader::addThisModule() {
   TheIndex.addModule(ModulePath, ModuleId);
@@ -6656,10 +6669,15 @@ ModuleSummaryIndexBitcodeReader::getThisModule() {
   return TheIndex.getModule(ModulePath);
 }
 
-std::tuple<ValueInfo, GlobalValue::GUID>
+template <bool AllowNullValueInfo>
+std::tuple<ValueInfo, GlobalValue::GUID, GlobalValue::GUID>
 ModuleSummaryIndexBitcodeReader::getValueInfoFromValueId(unsigned ValueId) {
   auto VGI = ValueIdToValueInfoMap[ValueId];
-  assert(std::get<0>(VGI));
+  // We can have a null value info for memprof callsite info records in
+  // distributed ThinLTO index files when the callee function summary is not
+  // included in the index. The bitcode writer records 0 in that case,
+  // and the caller of this helper will set AllowNullValueInfo to true.
+  assert(AllowNullValueInfo || std::get<0>(VGI));
   return VGI;
 }
 
@@ -6682,7 +6700,7 @@ void ModuleSummaryIndexBitcodeReader::setValueGUID(
   ValueIdToValueInfoMap[ValueID] = std::make_tuple(
       TheIndex.getOrInsertValueInfo(
           ValueGUID, UseStrtab ? ValueName : TheIndex.saveString(ValueName)),
-      OriginalNameID);
+      OriginalNameID, ValueGUID);
 }
 
 // Specialized value symbol table parser used when reading module index
@@ -6770,8 +6788,8 @@ Error ModuleSummaryIndexBitcodeReader::parseValueSymbolTable(
       GlobalValue::GUID RefGUID = Record[1];
       // The "original name", which is the second value of the pair will be
       // overriden later by a FS_COMBINED_ORIGINAL_NAME in the combined index.
-      ValueIdToValueInfoMap[ValueID] =
-          std::make_tuple(TheIndex.getOrInsertValueInfo(RefGUID), RefGUID);
+      ValueIdToValueInfoMap[ValueID] = std::make_tuple(
+          TheIndex.getOrInsertValueInfo(RefGUID), RefGUID, RefGUID);
       break;
     }
     }
@@ -7116,6 +7134,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       PendingTypeCheckedLoadConstVCalls;
   std::vector<FunctionSummary::ParamAccess> PendingParamAccesses;
 
+  std::vector<CallsiteInfo> PendingCallsites;
+  std::vector<AllocInfo> PendingAllocs;
+
   while (true) {
     Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
     if (!MaybeEntry)
@@ -7154,8 +7175,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
     case bitc::FS_VALUE_GUID: { // [valueid, refguid]
       uint64_t ValueID = Record[0];
       GlobalValue::GUID RefGUID = Record[1];
-      ValueIdToValueInfoMap[ValueID] =
-          std::make_tuple(TheIndex.getOrInsertValueInfo(RefGUID), RefGUID);
+      ValueIdToValueInfoMap[ValueID] = std::make_tuple(
+          TheIndex.getOrInsertValueInfo(RefGUID), RefGUID, RefGUID);
       break;
     }
     // FS_PERMODULE: [valueid, flags, instcount, fflags, numrefs,
@@ -7207,6 +7228,13 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
           ArrayRef<uint64_t>(Record).slice(CallGraphEdgeStartIndex),
           IsOldProfileFormat, HasProfile, HasRelBF);
       setSpecialRefs(Refs, NumRORefs, NumWORefs);
+      auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID);
+      // In order to save memory, only record the memprof summaries if this is
+      // the prevailing copy of a symbol.
+      if (IsPrevailing && !IsPrevailing(std::get<2>(VIAndOriginalGUID))) {
+        PendingCallsites.clear();
+        PendingAllocs.clear();
+      }
       auto FS = std::make_unique<FunctionSummary>(
           Flags, InstCount, getDecodedFFlags(RawFunFlags), /*EntryCount=*/0,
           std::move(Refs), std::move(Calls), std::move(PendingTypeTests),
@@ -7214,8 +7242,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
           std::move(PendingTypeCheckedLoadVCalls),
           std::move(PendingTypeTestAssumeConstVCalls),
           std::move(PendingTypeCheckedLoadConstVCalls),
-          std::move(PendingParamAccesses));
-      auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID);
+          std::move(PendingParamAccesses), std::move(PendingCallsites),
+          std::move(PendingAllocs));
       FS->setModulePath(getThisModule()->first());
       FS->setOriginalName(std::get<1>(VIAndOriginalGUID));
       TheIndex.addGlobalValueSummary(std::get<0>(VIAndOriginalGUID),
@@ -7358,7 +7386,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
           std::move(PendingTypeCheckedLoadVCalls),
           std::move(PendingTypeTestAssumeConstVCalls),
           std::move(PendingTypeCheckedLoadConstVCalls),
-          std::move(PendingParamAccesses));
+          std::move(PendingParamAccesses), std::move(PendingCallsites),
+          std::move(PendingAllocs));
       LastSeenSummary = FS.get();
       LastSeenGUID = VI.getGUID();
       FS->setModulePath(ModuleIdMap[ModuleId]);
@@ -7484,6 +7513,95 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       PendingParamAccesses = parseParamAccesses(Record);
       break;
     }
+
+    case bitc::FS_STACK_IDS: { // [n x stackid]
+      // Save stack ids in the reader to consult when adding stack ids from the
+      // lists in the stack node and alloc node entries.
+      StackIds = ArrayRef<uint64_t>(Record);
+      break;
+    }
+
+    case bitc::FS_PERMODULE_CALLSITE_INFO: {
+      unsigned ValueID = Record[0];
+      SmallVector<unsigned> StackIdList;
+      for (auto R = Record.begin() + 1; R != Record.end(); R++) {
+        assert(*R < StackIds.size());
+        StackIdList.push_back(TheIndex.addOrGetStackIdIndex(StackIds[*R]));
+      }
+      ValueInfo VI = std::get<0>(getValueInfoFromValueId(ValueID));
+      PendingCallsites.push_back(CallsiteInfo({VI, std::move(StackIdList)}));
+      break;
+    }
+
+    case bitc::FS_COMBINED_CALLSITE_INFO: {
+      auto RecordIter = Record.begin();
+      unsigned ValueID = *RecordIter++;
+      unsigned NumStackIds = *RecordIter++;
+      unsigned NumVersions = *RecordIter++;
+      assert(Record.size() == 3 + NumStackIds + NumVersions);
+      SmallVector<unsigned> StackIdList;
+      for (unsigned J = 0; J < NumStackIds; J++) {
+        assert(*RecordIter < StackIds.size());
+        StackIdList.push_back(
+            TheIndex.addOrGetStackIdIndex(StackIds[*RecordIter++]));
+      }
+      SmallVector<unsigned> Versions;
+      for (unsigned J = 0; J < NumVersions; J++)
+        Versions.push_back(*RecordIter++);
+      ValueInfo VI = std::get<0>(
+          getValueInfoFromValueId</*AllowNullValueInfo*/ true>(ValueID));
+      PendingCallsites.push_back(
+          CallsiteInfo({VI, std::move(Versions), std::move(StackIdList)}));
+      break;
+    }
+
+    case bitc::FS_PERMODULE_ALLOC_INFO: {
+      unsigned I = 0;
+      std::vector<MIBInfo> MIBs;
+      while (I < Record.size()) {
+        assert(Record.size() - I >= 2);
+        AllocationType AllocType = (AllocationType)Record[I++];
+        unsigned NumStackEntries = Record[I++];
+        assert(Record.size() - I >= NumStackEntries);
+        SmallVector<unsigned> StackIdList;
+        for (unsigned J = 0; J < NumStackEntries; J++) {
+          assert(Record[I] < StackIds.size());
+          StackIdList.push_back(
+              TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
+        }
+        MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
+      }
+      PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
+      break;
+    }
+
+    case bitc::FS_COMBINED_ALLOC_INFO: {
+      unsigned I = 0;
+      std::vector<MIBInfo> MIBs;
+      unsigned NumMIBs = Record[I++];
+      unsigned NumVersions = Record[I++];
+      unsigned MIBsRead = 0;
+      while (MIBsRead++ < NumMIBs) {
+        assert(Record.size() - I >= 2);
+        AllocationType AllocType = (AllocationType)Record[I++];
+        unsigned NumStackEntries = Record[I++];
+        assert(Record.size() - I >= NumStackEntries);
+        SmallVector<unsigned> StackIdList;
+        for (unsigned J = 0; J < NumStackEntries; J++) {
+          assert(Record[I] < StackIds.size());
+          StackIdList.push_back(
+              TheIndex.addOrGetStackIdIndex(StackIds[Record[I++]]));
+        }
+        MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
+      }
+      assert(Record.size() - I >= NumVersions);
+      SmallVector<uint8_t> Versions;
+      for (unsigned J = 0; J < NumVersions; J++)
+        Versions.push_back(Record[I++]);
+      PendingAllocs.push_back(
+          AllocInfo(std::move(Versions), std::move(MIBs)));
+      break;
+    }
     }
   }
   llvm_unreachable("Exit infinite loop");
@@ -7803,14 +7921,15 @@ BitcodeModule::getLazyModule(LLVMContext &Context, bool ShouldLazyLoadMetadata,
 // We don't use ModuleIdentifier here because the client may need to control the
 // module path used in the combined summary (e.g. when reading summaries for
 // regular LTO modules).
-Error BitcodeModule::readSummary(ModuleSummaryIndex &CombinedIndex,
-                                 StringRef ModulePath, uint64_t ModuleId) {
+Error BitcodeModule::readSummary(
+    ModuleSummaryIndex &CombinedIndex, StringRef ModulePath, uint64_t ModuleId,
+    std::function<bool(GlobalValue::GUID)> IsPrevailing) {
   BitstreamCursor Stream(Buffer);
   if (Error JumpFailed = Stream.JumpToBit(ModuleBit))
     return JumpFailed;
 
   ModuleSummaryIndexBitcodeReader R(std::move(Stream), Strtab, CombinedIndex,
-                                    ModulePath, ModuleId);
+                                    ModulePath, ModuleId, IsPrevailing);
   return R.parseModule();
 }
 

diff  --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 4bf881a47917..bc81afbc4064 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -211,12 +211,10 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase {
   void writePerModuleGlobalValueSummary();
 
 private:
-  void writePerModuleFunctionSummaryRecord(SmallVector<uint64_t, 64> &NameVals,
-                                           GlobalValueSummary *Summary,
-                                           unsigned ValueID,
-                                           unsigned FSCallsAbbrev,
-                                           unsigned FSCallsProfileAbbrev,
-                                           const Function &F);
+  void writePerModuleFunctionSummaryRecord(
+      SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
+      unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
+      unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F);
   void writeModuleLevelReferences(const GlobalVariable &V,
                                   SmallVector<uint64_t, 64> &NameVals,
                                   unsigned FSModRefsAbbrev,
@@ -424,6 +422,11 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
   /// index and a value id generated by this class to use in references.
   std::map<GlobalValue::GUID, unsigned> GUIDToValueIdMap;
 
+  // The sorted stack id indices actually used in the summary entries being
+  // written, which will be a subset of those in the full index in the case of
+  // distributed indexes.
+  std::vector<unsigned> StackIdIndices;
+
   /// Tracks the last value id recorded in the GUIDToValueMap.
   unsigned GlobalValueId = 0;
 
@@ -441,9 +444,28 @@ class IndexBitcodeWriter : public BitcodeWriterBase {
     // in writing out the call graph edges. Save the mapping from GUID
     // to the new global value id to use when writing those edges, which
     // are currently saved in the index in terms of GUID.
-    forEachSummary([&](GVInfo I, bool) {
+    forEachSummary([&](GVInfo I, bool IsAliasee) {
       GUIDToValueIdMap[I.first] = ++GlobalValueId;
+      if (IsAliasee)
+        return;
+      auto *FS = dyn_cast<FunctionSummary>(I.second);
+      if (!FS)
+        return;
+      // Record all stack id indices actually used in the summary entries being
+      // written, so that we can compact them in the case of distributed ThinLTO
+      // indexes.
+      for (auto &CI : FS->callsites())
+        for (auto Idx : CI.StackIdIndices)
+          StackIdIndices.push_back(Idx);
+      for (auto &AI : FS->allocs())
+        for (auto &MIB : AI.MIBs)
+          for (auto Idx : MIB.StackIdIndices)
+            StackIdIndices.push_back(Idx);
     });
+    llvm::sort(StackIdIndices);
+    StackIdIndices.erase(
+        std::unique(StackIdIndices.begin(), StackIdIndices.end()),
+        StackIdIndices.end());
   }
 
   /// The below iterator returns the GUID and associated summary.
@@ -3888,11 +3910,64 @@ static void writeTypeIdCompatibleVtableSummaryRecord(
   }
 }
 
+static void writeFunctionHeapProfileRecords(
+    BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev,
+    unsigned AllocAbbrev, bool PerModule,
+    std::function<unsigned(const ValueInfo &VI)> GetValueID,
+    std::function<unsigned(unsigned)> GetStackIndex) {
+  SmallVector<uint64_t> Record;
+
+  for (auto &CI : FS->callsites()) {
+    Record.clear();
+    // Per module callsite clones should always have a single entry of
+    // value 0.
+    assert(!PerModule || (CI.Clones.size() == 1 && CI.Clones[0] == 0));
+    Record.push_back(GetValueID(CI.Callee));
+    if (!PerModule) {
+      Record.push_back(CI.StackIdIndices.size());
+      Record.push_back(CI.Clones.size());
+    }
+    for (auto Id : CI.StackIdIndices)
+      Record.push_back(GetStackIndex(Id));
+    if (!PerModule) {
+      for (auto V : CI.Clones)
+        Record.push_back(V);
+    }
+    Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_CALLSITE_INFO
+                                : bitc::FS_COMBINED_CALLSITE_INFO,
+                      Record, CallsiteAbbrev);
+  }
+
+  for (auto &AI : FS->allocs()) {
+    Record.clear();
+    // Per module alloc versions should always have a single entry of
+    // value 0.
+    assert(!PerModule || (AI.Versions.size() == 1 && AI.Versions[0] == 0));
+    if (!PerModule) {
+      Record.push_back(AI.MIBs.size());
+      Record.push_back(AI.Versions.size());
+    }
+    for (auto &MIB : AI.MIBs) {
+      Record.push_back((uint8_t)MIB.AllocType);
+      Record.push_back(MIB.StackIdIndices.size());
+      for (auto Id : MIB.StackIdIndices)
+        Record.push_back(GetStackIndex(Id));
+    }
+    if (!PerModule) {
+      for (auto V : AI.Versions)
+        Record.push_back(V);
+    }
+    Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
+                                : bitc::FS_COMBINED_ALLOC_INFO,
+                      Record, AllocAbbrev);
+  }
+}
+
 // Helper to emit a single function summary record.
 void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
     SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
     unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
-    const Function &F) {
+    unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F) {
   NameVals.push_back(ValueID);
 
   FunctionSummary *FS = cast<FunctionSummary>(Summary);
@@ -3902,6 +3977,12 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
         return {VE.getValueID(VI.getValue())};
       });
 
+  writeFunctionHeapProfileRecords(
+      Stream, FS, CallsiteAbbrev, AllocAbbrev,
+      /*PerModule*/ true,
+      /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); },
+      /*GetStackIndex*/ [&](unsigned I) { return I; });
+
   auto SpecialRefCnts = FS->specialRefCounts();
   NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
   NameVals.push_back(FS->instCount());
@@ -4013,6 +4094,16 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
                       ArrayRef<uint64_t>{GVI.second, GVI.first});
   }
 
+  if (!Index->stackIds().empty()) {
+    auto StackIdAbbv = std::make_shared<BitCodeAbbrev>();
+    StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS));
+    // numids x stackid
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv));
+    Stream.EmitRecord(bitc::FS_STACK_IDS, Index->stackIds(), StackIdAbbvId);
+  }
+
   // Abbrev for FS_PERMODULE_PROFILE.
   auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
@@ -4084,6 +4175,21 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned TypeIdCompatibleVtableAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_CALLSITE_INFO));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  // n x stackidindex
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned CallsiteAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO));
+  // n x (alloc type, numstackids, numstackids x stackidindex)
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
   SmallVector<uint64_t, 64> NameVals;
   // Iterate over the list of functions instead of the Index to
   // ensure the ordering is stable.
@@ -4102,7 +4208,8 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     }
     auto *Summary = VI.getSummaryList()[0].get();
     writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F),
-                                        FSCallsAbbrev, FSCallsProfileAbbrev, F);
+                                        FSCallsAbbrev, FSCallsProfileAbbrev,
+                                        CallsiteAbbrev, AllocAbbrev, F);
   }
 
   // Capture references from GlobalVariable initializers, which are outside
@@ -4144,7 +4251,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
 
 /// Emit the combined summary section into the combined index file.
 void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
-  Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 3);
+  Stream.EnterSubblock(bitc::GLOBALVAL_SUMMARY_BLOCK_ID, 4);
   Stream.EmitRecord(
       bitc::FS_VERSION,
       ArrayRef<uint64_t>{ModuleSummaryIndex::BitcodeSummaryVersion});
@@ -4157,6 +4264,21 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
                       ArrayRef<uint64_t>{GVI.second, GVI.first});
   }
 
+  if (!StackIdIndices.empty()) {
+    auto StackIdAbbv = std::make_shared<BitCodeAbbrev>();
+    StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS));
+    // numids x stackid
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv));
+    // Write the stack ids used by this index, which will be a subset of those in
+    // the full index in the case of distributed indexes.
+    std::vector<uint64_t> StackIds;
+    for (auto &I : StackIdIndices)
+      StackIds.push_back(Index.getStackIdAtIndex(I));
+    Stream.EmitRecord(bitc::FS_STACK_IDS, StackIds, StackIdAbbvId);
+  }
+
   // Abbrev for FS_COMBINED.
   auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED));
@@ -4210,6 +4332,26 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
   unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_CALLSITE_INFO));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numstackindices
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
+  // numstackindices x stackidindex, numver x version
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned CallsiteAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
+  // nummib x (alloc type, numstackids, numstackids x stackidindex),
+  // numver x version
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
   // The aliases are emitted as a post-pass, and will point to the value
   // id of the aliasee. Save them in a vector for post-processing.
   SmallVector<AliasSummary *, 64> Aliases;
@@ -4286,6 +4428,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     }
 
     auto GetValueId = [&](const ValueInfo &VI) -> Optional<unsigned> {
+      if (!VI)
+        return None;
       return getValueId(VI.getGUID());
     };
 
@@ -4293,6 +4437,27 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     writeFunctionTypeMetadataRecords(Stream, FS, GetValueId);
     getReferencedTypeIds(FS, ReferencedTypeIds);
 
+    writeFunctionHeapProfileRecords(
+        Stream, FS, CallsiteAbbrev, AllocAbbrev,
+        /*PerModule*/ false,
+        /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned {
+          Optional<unsigned> ValueID = GetValueId(VI);
+          // This can happen in shared index files for distributed ThinLTO if
+          // the callee function summary is not included. Record 0 which we
+          // will have to deal with conservatively when doing any kind of
+          // validation in the ThinLTO backends.
+          if (!ValueID)
+            return 0;
+          return *ValueID;
+        },
+        /*GetStackIndex*/ [&](unsigned I) {
+          // Get the corresponding index into the list of StackIdIndices
+          // actually being written for this combined index (which may be a
+          // subset in the case of distributed indexes).
+          auto Lower = llvm::lower_bound(StackIdIndices, I);
+          return std::distance(StackIdIndices.begin(), Lower);
+        });
+
     NameVals.push_back(*ValueId);
     NameVals.push_back(Index.getModuleId(FS->modulePath()));
     NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));

diff  --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 22e12b29843e..aa1723ecdffe 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -3192,6 +3192,85 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) {
   if (const auto *TIdInfo = FS->getTypeIdInfo())
     printTypeIdInfo(*TIdInfo);
 
+  // The AllocationType identifiers capture the profiled context behavior
+  // reaching a specific static allocation site (possibly cloned). Thus
+  // "notcoldandcold" implies there are multiple contexts which reach this site,
+  // some of which are cold and some of which are not, and that need to
+  // disambiguate via cloning or other context identification.
+  auto AllocTypeName = [](uint8_t Type) {
+    switch (Type) {
+    case (uint8_t)AllocationType::None:
+      return "none";
+      break;
+    case (uint8_t)AllocationType::NotCold:
+      return "notcold";
+      break;
+    case (uint8_t)AllocationType::Cold:
+      return "cold";
+      break;
+    case (uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold:
+      return "notcoldandcold";
+      break;
+    default:
+      assert(false && "Unexpected alloc type");
+    }
+  };
+
+  if (!FS->allocs().empty()) {
+    Out << ", allocs: (";
+    FieldSeparator AFS;
+    for (auto &AI : FS->allocs()) {
+      Out << AFS;
+      Out << "(versions: (";
+      FieldSeparator VFS;
+      for (auto V : AI.Versions) {
+        Out << VFS;
+        Out << AllocTypeName(V);
+      }
+      Out << "), memProf: (";
+      FieldSeparator MIBFS;
+      for (auto &MIB : AI.MIBs) {
+        Out << MIBFS;
+        Out << "(type: " << AllocTypeName((uint8_t)MIB.AllocType);
+        Out << ", stackIds: (";
+        FieldSeparator SIDFS;
+        for (auto Id : MIB.StackIdIndices) {
+          Out << SIDFS;
+          Out << TheIndex->getStackIdAtIndex(Id);
+        }
+        Out << "))";
+      }
+      Out << "))";
+    }
+    Out << ")";
+  }
+
+  if (!FS->callsites().empty()) {
+    Out << ", callsites: (";
+    FieldSeparator SNFS;
+    for (auto &CI : FS->callsites()) {
+      Out << SNFS;
+      if (CI.Callee)
+        Out << "(callee: ^" << Machine.getGUIDSlot(CI.Callee.getGUID());
+      else
+        Out << "(callee: null";
+      Out << ", clones: (";
+      FieldSeparator VFS;
+      for (auto V : CI.Clones) {
+        Out << VFS;
+        Out << V;
+      }
+      Out << "), stackIds: (";
+      FieldSeparator SIDFS;
+      for (auto Id : CI.StackIdIndices) {
+        Out << SIDFS;
+        Out << TheIndex->getStackIdAtIndex(Id);
+      }
+      Out << "))";
+    }
+    Out << ")";
+  }
+
   auto PrintRange = [&](const ConstantRange &Range) {
     Out << "[" << Range.getSignedMin() << ", " << Range.getSignedMax() << "]";
   };

diff  --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index dc28b681a151..9bfbabc17a08 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -911,9 +911,25 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
 Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
                       const SymbolResolution *&ResI,
                       const SymbolResolution *ResE) {
+  const SymbolResolution *ResITmp = ResI;
+  for (const InputFile::Symbol &Sym : Syms) {
+    assert(ResITmp != ResE);
+    SymbolResolution Res = *ResITmp++;
+
+    if (!Sym.getIRName().empty()) {
+      auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+          Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+      if (Res.Prevailing)
+        ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+    }
+  }
+
   if (Error Err =
           BM.readSummary(ThinLTO.CombinedIndex, BM.getModuleIdentifier(),
-                         ThinLTO.ModuleMap.size()))
+                         ThinLTO.ModuleMap.size(), [&](GlobalValue::GUID GUID) {
+                           return ThinLTO.PrevailingModuleForGUID[GUID] ==
+                                  BM.getModuleIdentifier();
+                         }))
     return Err;
 
   for (const InputFile::Symbol &Sym : Syms) {
@@ -924,7 +940,8 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
       auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
           Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
       if (Res.Prevailing) {
-        ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+        assert(ThinLTO.PrevailingModuleForGUID[GUID] ==
+               BM.getModuleIdentifier());
 
         // For linker redefined symbols (via --wrap or --defsym) we want to
         // switch the linkage to `weak` to prevent IPOs from happening.
@@ -1454,6 +1471,7 @@ ThinBackend lto::createWriteIndexesThinBackend(
 
 Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
                       const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) {
+  ThinLTO.CombinedIndex.releaseTemporaryMemory();
   timeTraceProfilerBegin("ThinLink", StringRef(""));
   auto TimeTraceScopeExit = llvm::make_scope_exit([]() {
     if (llvm::timeTraceProfilerEnabled())

diff  --git a/llvm/test/Assembler/thinlto-memprof-summary.ll b/llvm/test/Assembler/thinlto-memprof-summary.ll
new file mode 100644
index 000000000000..92e085e5473b
--- /dev/null
+++ b/llvm/test/Assembler/thinlto-memprof-summary.ll
@@ -0,0 +1,24 @@
+;; Test memprof summary parsing (tests all types/fields in various combinations).
+; RUN: llvm-as %s -o - | llvm-dis -o - | FileCheck %s
+
+; ModuleID = 'thinlto-memprof-summary.thinlto.bc'
+
+^0 = module: (path: "thinlto-memprof-summary.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049))
+;; Function with single alloc, multiple memprof MIBs, no versioning
+^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (8632435727821051414)), (type: cold, stackIds: (15025054523792398438, 12345678)), (type: notcoldandcold, stackIds: (23456789))))))))
+;; Function with callsite stack ids calling above function, no versioning
+^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (8632435727821051414)), (callee: ^1, clones: (0), stackIds: (15025054523792398438, 12345678)), (callee: ^1, clones: (0), stackIds: (23456789))))))
+;; Function with multiple allocs, multiple memprof MIBs, multiple versions
+^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: (3456789)), (type: cold, stackIds: (456789)))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: (3456789)), (type: notcold, stackIds: (456789))))))))
+;; Function with callsite stack ids calling above function, multiple versions
+^4 = gv: (guid: 27, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^3)), callsites: ((callee: ^3, clones: (0, 1), stackIds: (3456789)), (callee: ^3, clones: (1, 1), stackIds: (456789))))))
+;; Function with null callsite stack id (can happen in distributed indexes if callsite not imported)
+^5 = gv: (guid: 28, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), callsites: ((callee: null, clones: (0), stackIds: (8632435727821051414))))))
+
+; Make sure we get back from llvm-dis what we put in via llvm-as.
+; CHECK: ^0 = module: (path: "thinlto-memprof-summary.o", hash: (1369602428, 2747878711, 259090915, 2507395659, 1141468049))
+; CHECK: ^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (8632435727821051414)), (type: cold, stackIds: (15025054523792398438, 12345678)), (type: notcoldandcold, stackIds: (23456789))))))))
+; CHECK: ^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (8632435727821051414)), (callee: ^1, clones: (0), stackIds: (15025054523792398438, 12345678)), (callee: ^1, clones: (0), stackIds: (23456789))))))
+; CHECK: ^3 = gv: (guid: 26, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (cold, notcold), memProf: ((type: notcold, stackIds: (3456789)), (type: cold, stackIds: (456789)))), (versions: (notcold, cold), memProf: ((type: cold, stackIds: (3456789)), (type: notcold, stackIds: (456789))))))))
+; CHECK: ^4 = gv: (guid: 27, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^3)), callsites: ((callee: ^3, clones: (0, 1), stackIds: (3456789)), (callee: ^3, clones: (1, 1), stackIds: (456789))))))
+; CHECK: ^5 = gv: (guid: 28, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), callsites: ((callee: null, clones: (0), stackIds: (8632435727821051414))))))

diff  --git a/llvm/test/ThinLTO/X86/memprof-summary.ll b/llvm/test/ThinLTO/X86/memprof-summary.ll
new file mode 100644
index 000000000000..ca3b668484b9
--- /dev/null
+++ b/llvm/test/ThinLTO/X86/memprof-summary.ll
@@ -0,0 +1,185 @@
+;; Check memprof summaries (per module, combined index, and distributed indexes)
+
+; RUN: split-file %s %t
+; RUN: opt -module-summary %t/a.ll -o %ta.bc
+; RUN: opt -module-summary %t/b.ll -o %tb.bc
+
+; RUN: llvm-dis -o - %ta.bc | FileCheck %s --check-prefix=PRELINKDISA
+; PRELINKDISA: gv: (name: "main", {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438)))))) ; guid = 15822663052811949562
+
+; RUN: llvm-dis -o - %tb.bc | FileCheck %s --check-prefix=PRELINKDISB
+; PRELINKDISB: gv: (name: "_Z3foov", {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (2732490490862098848)))))) ; guid = 9191153033785521275
+; PRELINKDISB: gv: (name: "_Z3bazv", {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (12481870273128938184)))))) ; guid = 15176620447596392000
+; PRELINKDISB: gv: (name: "_Z3barv", {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438)))))))) ; guid = 17377440600225628772
+
+; RUN: llvm-bcanalyzer -dump %ta.bc | FileCheck %s --check-prefix=PRELINKBCANA
+; PRELINKBCANA: <STACK_IDS abbrevid=4 op0=8632435727821051414 op1=-3421689549917153178/>
+
+; RUN: llvm-bcanalyzer -dump %tb.bc | FileCheck %s --check-prefix=PRELINKBCANB
+; PRELINKBCANB: <STACK_IDS abbrevid=4 op0=-5964873800580613432 op1=2732490490862098848 op2=8632435727821051414 op3=-3421689549917153178/>
+
+; RUN: llvm-lto2 run %ta.bc %tb.bc -o %t -save-temps \
+; RUN:     -thinlto-distributed-indexes \
+; RUN:     -r=%ta.bc,main,plx \
+; RUN:     -r=%ta.bc,_Z3foov, \
+; RUN:     -r=%ta.bc,free, \
+; RUN:     -r=%ta.bc,sleep, \
+; RUN:     -r=%tb.bc,_Z3foov,pl \
+; RUN:     -r=%tb.bc,_Znam, \
+; RUN:     -r=%tb.bc,_Z3barv,pl \
+; RUN:     -r=%tb.bc,_Z3bazv,pl
+
+; RUN: llvm-dis -o - %t.index.bc | FileCheck %s --check-prefix=COMBINEDDIS
+; COMBINEDDIS: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (2732490490862098848))))))
+; COMBINEDDIS: gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^5, clones: (0), stackIds: (12481870273128938184))))))
+; COMBINEDDIS: gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438))))))
+; COMBINEDDIS: gv: (guid: 17377440600225628772, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438))))))))
+
+; RUN: llvm-bcanalyzer -dump %t.index.bc | FileCheck %s --check-prefix=COMBINEDBCAN
+; COMBINEDBCAN: <STACK_IDS abbrevid=4 op0=8632435727821051414 op1=-3421689549917153178 op2=-5964873800580613432 op3=2732490490862098848/>
+
+; RUN: llvm-dis -o - %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISA
+; DISTRIBUTEDDISA: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: null, clones: (0), stackIds: (2732490490862098848))))))
+; DISTRIBUTEDDISA: gv: (guid: 15822663052811949562, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (8632435727821051414)), (callee: ^2, clones: (0), stackIds: (15025054523792398438))))))
+
+; RUN: llvm-dis -o - %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDDISB
+; DISTRIBUTEDDISB: gv: (guid: 9191153033785521275, {{.*}} callsites: ((callee: ^2, clones: (0), stackIds: (2732490490862098848))))))
+; DISTRIBUTEDDISB: gv: (guid: 15176620447596392000, {{.*}} callsites: ((callee: ^3, clones: (0), stackIds: (12481870273128938184))))))
+; DISTRIBUTEDDISB: gv: (guid: 17377440600225628772, {{.*}} allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (12481870273128938184, 2732490490862098848, 8632435727821051414)), (type: cold, stackIds: (12481870273128938184, 2732490490862098848, 15025054523792398438))))))))
+
+; RUN: llvm-bcanalyzer -dump %ta.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANA
+; DISTRIBUTEDBCANA: <STACK_IDS abbrevid=4 op0=8632435727821051414 op1=-3421689549917153178 op2=2732490490862098848/>
+
+; RUN: llvm-bcanalyzer -dump %tb.bc.thinlto.bc | FileCheck %s --check-prefix=DISTRIBUTEDBCANB
+; DISTRIBUTEDBCANB: <STACK_IDS abbrevid=4 op0=8632435727821051414 op1=-3421689549917153178 op2=-5964873800580613432 op3=2732490490862098848/>
+
+;--- a.ll
+; ModuleID = 'a.cc'
+source_filename = "a.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress norecurse uwtable
+define dso_local noundef i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #0 !dbg !39 {
+entry:
+  %call = call noundef ptr @_Z3foov(), !dbg !42, !callsite !43
+  %call1 = call noundef ptr @_Z3foov(), !dbg !44, !callsite !45
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call, i8 0, i64 10, i1 false), !dbg !46
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 1 dereferenceable(10) %call1, i8 0, i64 10, i1 false), !dbg !47
+  call void @free(ptr noundef %call) #4, !dbg !48
+  %call2 = call i32 @sleep(i32 noundef 10), !dbg !49
+  call void @free(ptr noundef %call1) #4, !dbg !50
+  ret i32 0, !dbg !51
+}
+
+declare !dbg !52 noundef ptr @_Z3foov() local_unnamed_addr #1
+
+; Function Attrs: argmemonly mustprogress nocallback nofree nounwind willreturn writeonly
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+; Function Attrs: inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free")
+declare void @free(ptr allocptr nocapture noundef) local_unnamed_addr #3
+
+declare !dbg !53 i32 @sleep(i32 noundef) local_unnamed_addr #1
+
+attributes #0 = { mustprogress norecurse uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { argmemonly mustprogress nocallback nofree nounwind willreturn writeonly }
+attributes #3 = { inaccessiblemem_or_argmemonly mustprogress nounwind willreturn allockind("free") "alloc-family"="malloc" "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #4 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git at github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "a.cc", directory: ".", checksumkind: CSK_MD5, checksum: "ebabd56909271a1d4a7cac81c10624d5")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!39 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!40 = !DISubroutineType(types: !41)
+!41 = !{}
+!42 = !DILocation(line: 6, column: 13, scope: !39)
+!43 = !{i64 8632435727821051414}
+!44 = !DILocation(line: 7, column: 13, scope: !39)
+!45 = !{i64 -3421689549917153178}
+!46 = !DILocation(line: 8, column: 3, scope: !39)
+!47 = !DILocation(line: 9, column: 3, scope: !39)
+!48 = !DILocation(line: 10, column: 3, scope: !39)
+!49 = !DILocation(line: 11, column: 3, scope: !39)
+!50 = !DILocation(line: 12, column: 3, scope: !39)
+!51 = !DILocation(line: 13, column: 3, scope: !39)
+!52 = !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41)
+!53 = !DISubprogram(name: "sleep", scope: !54, file: !54, line: 453, type: !40, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !41)
+!54 = !DIFile(filename: "include/unistd.h", directory: "/usr", checksumkind: CSK_MD5, checksum: "ee8f41a17f563f029d0e930ad871815a")
+
+;--- b.ll
+; ModuleID = 'b.cc'
+source_filename = "b.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress noinline uwtable
+define dso_local noalias noundef nonnull ptr @_Z3barv() local_unnamed_addr #0 !dbg !39 {
+entry:
+  %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10) #2, !dbg !42, !memprof !43, !callsite !48
+  ret ptr %call, !dbg !49
+}
+
+; Function Attrs: nobuiltin allocsize(0)
+declare noundef nonnull ptr @_Znam(i64 noundef) local_unnamed_addr #1
+
+; Function Attrs: mustprogress noinline uwtable
+define dso_local noalias noundef nonnull ptr @_Z3bazv() local_unnamed_addr #0 !dbg !50 {
+entry:
+  %call = call noundef ptr @_Z3barv(), !dbg !51, !callsite !52
+  ret ptr %call, !dbg !53
+}
+
+; Function Attrs: mustprogress uwtable
+define dso_local noalias noundef nonnull ptr @_Z3foov() local_unnamed_addr #3 !dbg !54 {
+entry:
+  %call = call noundef ptr @_Z3bazv(), !dbg !55, !callsite !56
+  ret ptr %call, !dbg !57
+}
+
+attributes #0 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { builtin allocsize(0) }
+attributes #3 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 16.0.0 (git at github.com:llvm/llvm-project.git ffecb643ee2c49e55e0689339b6d5921b5e6ff8b)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "b.cc", directory: ".", checksumkind: CSK_MD5, checksum: "335f81d275af57725cfc9ffc7be49bc2")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!39 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 1, type: !40, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!40 = !DISubroutineType(types: !41)
+!41 = !{}
+!42 = !DILocation(line: 2, column: 10, scope: !39)
+!43 = !{!44, !46}
+!44 = !{!45, !"notcold"}
+!45 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
+!46 = !{!47, !"cold"}
+!47 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
+!48 = !{i64 9086428284934609951}
+!49 = !DILocation(line: 2, column: 3, scope: !39)
+!50 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 5, type: !40, scopeLine: 5, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!51 = !DILocation(line: 6, column: 10, scope: !50)
+!52 = !{i64 -5964873800580613432}
+!53 = !DILocation(line: 6, column: 3, scope: !50)
+!54 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 9, type: !40, scopeLine: 9, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !41)
+!55 = !DILocation(line: 10, column: 10, scope: !54)
+!56 = !{i64 2732490490862098848}
+!57 = !DILocation(line: 10, column: 3, scope: !54)

diff  --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
index a3549aca81e5..64004ceac818 100644
--- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
+++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
@@ -34,6 +35,15 @@ class MemoryProfileInfoTest : public testing::Test {
     return Mod;
   }
 
+  std::unique_ptr<ModuleSummaryIndex> makeLLVMIndex(const char *Summary) {
+    SMDiagnostic Err;
+    std::unique_ptr<ModuleSummaryIndex> Index =
+        parseSummaryIndexAssemblyString(Summary, Err);
+    if (!Index)
+      Err.print("MemoryProfileInfoTest", errs());
+    return Index;
+  }
+
   // This looks for a call that has the given value name, which
   // is the name of the value being assigned the call return value.
   CallBase *findCall(Function &F, const char *Name = nullptr) {
@@ -359,4 +369,90 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef)
   }
 }
 
+TEST_F(MemoryProfileInfoTest, CallStackTestIR) {
+  LLVMContext C;
+  std::unique_ptr<Module> M = makeLLVMModule(C,
+                                             R"IR(
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+define ptr @test() {
+entry:
+  %call = call noalias noundef nonnull dereferenceable(10) ptr @_Znam(i64 noundef 10), !memprof !1, !callsite !6
+  ret ptr %call
+}
+declare noundef nonnull ptr @_Znam(i64 noundef)
+!1 = !{!2, !4}
+!2 = !{!3, !"notcold"}
+!3 = !{i64 1, i64 2, i64 3, i64 4}
+!4 = !{!5, !"cold"}
+!5 = !{i64 1, i64 2, i64 3, i64 5}
+!6 = !{i64 1}
+)IR");
+
+  Function *Func = M->getFunction("test");
+  CallBase *Call = findCall(*Func, "call");
+
+  CallStack<MDNode, MDNode::op_iterator> InstCallsite(
+      Call->getMetadata(LLVMContext::MD_callsite));
+
+  MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof);
+  bool First = true;
+  for (auto &MIBOp : MemProfMD->operands()) {
+    auto *MIBMD = cast<const MDNode>(MIBOp);
+    MDNode *StackNode = getMIBStackNode(MIBMD);
+    CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
+    std::vector<uint64_t> StackIds;
+    for (auto ContextIter = StackContext.beginAfterSharedPrefix(InstCallsite);
+         ContextIter != StackContext.end(); ++ContextIter)
+      StackIds.push_back(*ContextIter);
+    if (First)
+      EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({2UL, 3UL, 4UL}));
+    else
+      EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({2UL, 3UL, 5UL}));
+    First = false;
+  }
+}
+
+TEST_F(MemoryProfileInfoTest, CallStackTestSummary) {
+  std::unique_ptr<ModuleSummaryIndex> Index = makeLLVMIndex(R"Summary(
+^0 = module: (path: "test.o", hash: (0, 0, 0, 0, 0))
+^1 = gv: (guid: 23, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 2, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 0, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), allocs: ((versions: (none), memProf: ((type: notcold, stackIds: (1, 2, 3, 4)), (type: cold, stackIds: (1, 2, 3, 5))))))))
+^2 = gv: (guid: 25, summaries: (function: (module: ^0, flags: (linkage: external, visibility: default, notEligibleToImport: 0, live: 0, dsoLocal: 1, canAutoHide: 0), insts: 22, funcFlags: (readNone: 0, readOnly: 0, noRecurse: 1, returnDoesNotAlias: 0, noInline: 1, alwaysInline: 0, noUnwind: 0, mayThrow: 0, hasUnknownCall: 0, mustBeUnreachable: 0), calls: ((callee: ^1)), callsites: ((callee: ^1, clones: (0), stackIds: (3, 4)), (callee: ^1, clones: (0), stackIds: (3, 5))))))
+)Summary");
+
+  ASSERT_NE(Index, nullptr);
+  auto *CallsiteSummary =
+      cast<FunctionSummary>(Index->getGlobalValueSummary(/*guid=*/25));
+  bool First = true;
+  for (auto &CI : CallsiteSummary->callsites()) {
+    CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> InstCallsite(
+        &CI);
+    std::vector<uint64_t> StackIds;
+    for (auto StackIdIndex : InstCallsite)
+      StackIds.push_back(Index->getStackIdAtIndex(StackIdIndex));
+    if (First)
+      EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({3UL, 4UL}));
+    else
+      EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({3UL, 5UL}));
+    First = false;
+  }
+
+  auto *AllocSummary =
+      cast<FunctionSummary>(Index->getGlobalValueSummary(/*guid=*/23));
+  for (auto &AI : AllocSummary->allocs()) {
+    bool First = true;
+    for (auto &MIB : AI.MIBs) {
+      CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> StackContext(
+          &MIB);
+      std::vector<uint64_t> StackIds;
+      for (auto StackIdIndex : StackContext)
+        StackIds.push_back(Index->getStackIdAtIndex(StackIdIndex));
+      if (First)
+        EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({1UL, 2UL, 3UL, 4UL}));
+      else
+        EXPECT_EQ(makeArrayRef(StackIds), makeArrayRef({1UL, 2UL, 3UL, 5UL}));
+      First = false;
+    }
+  }
+}
 } // end anonymous namespace


        


More information about the llvm-commits mailing list