[llvm] [MemProf] Print full context hash when reporting hinted bytes (PR #114465)

Teresa Johnson via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 13 13:35:04 PST 2024


https://github.com/teresajohnson updated https://github.com/llvm/llvm-project/pull/114465

>From 11d13e4dd4d6267b9f0e2d1d69784599d0e3704b Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 31 Oct 2024 12:51:27 -0700
Subject: [PATCH 1/4] [MemProf] Print full context hash when reporting hinted
 bytes

Improve the information printed when -memprof-report-hinted-sizes is
enabled. Now print the full context hash computed from the original
profile, similar to what we do when reporting matching statistics. This
will make it easier to correlate with the profile.

Note that the full context hash must be computed at profile match time
and saved in the metadata and summary, because we may trim the context
during matching when it isn't needed for distinguishing hotness.
Similarly, due to the context trimming, we may have more than one full
context id and total size pair per MIB; the metadata and summary have
therefore been changed to hold a list of these pairs.

Remove the old aggregate size from the metadata and summary support.
One other change from the prior support is that we no longer write the
size information into the combined index for the LTO backends, which
don't use this information, which reduces unnecessary bloat in
distributed index files.
---
 .../include/llvm/Analysis/MemoryProfileInfo.h | 27 ++++--
 llvm/include/llvm/Bitcode/LLVMBitCodes.h      | 10 ++-
 llvm/include/llvm/IR/ModuleSummaryIndex.h     | 76 ++++++++++++++--
 llvm/lib/Analysis/MemoryProfileInfo.cpp       | 90 +++++++++++++------
 llvm/lib/Analysis/ModuleSummaryAnalysis.cpp   | 33 +++++--
 llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp   |  1 +
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     | 55 +++++++-----
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     | 40 +++++++--
 llvm/lib/IR/Verifier.cpp                      | 37 ++++++--
 .../IPO/MemProfContextDisambiguation.cpp      | 66 +++++++++-----
 .../Instrumentation/MemProfiler.cpp           | 18 ++--
 .../ThinLTO/X86/memprof-aliased-location1.ll  |  4 +-
 .../ThinLTO/X86/memprof-aliased-location2.ll  |  4 +-
 llvm/test/ThinLTO/X86/memprof-basic.ll        | 12 ++-
 .../aliased-location1.ll                      |  4 +-
 .../aliased-location2.ll                      |  4 +-
 .../MemProfContextDisambiguation/basic.ll     | 12 ++-
 llvm/test/Transforms/PGOProfile/memprof.ll    | 29 +++---
 llvm/test/Verifier/memprof-metadata-bad.ll    |  2 +-
 19 files changed, 378 insertions(+), 146 deletions(-)

diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index edbce706953d18..55889c841b283e 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -28,16 +28,17 @@ AllocationType getAllocType(uint64_t TotalLifetimeAccessDensity,
 /// the resulting metadata node.
 MDNode *buildCallstackMetadata(ArrayRef<uint64_t> CallStack, LLVMContext &Ctx);
 
+/// Build metadata from the provided list of full stack id and profiled size, to
+/// use when reporting of hinted sizes is enabled.
+MDNode *buildContextSizeMetadata(ArrayRef<ContextTotalSize> ContextSizeInfo,
+                                 LLVMContext &Ctx);
+
 /// Returns the stack node from an MIB metadata node.
 MDNode *getMIBStackNode(const MDNode *MIB);
 
 /// Returns the allocation type from an MIB metadata node.
 AllocationType getMIBAllocType(const MDNode *MIB);
 
-/// Returns the total size from an MIB metadata node, or 0 if it was not
-/// recorded.
-uint64_t getMIBTotalSize(const MDNode *MIB);
-
 /// Returns the string to use in attributes with the given type.
 std::string getAllocTypeAttributeString(AllocationType Type);
 
@@ -55,11 +56,15 @@ class CallStackTrie {
     // Allocation types for call context sharing the context prefix at this
     // node.
     uint8_t AllocTypes;
-    uint64_t TotalSize;
+    // If the user has requested reporting of hinted sizes, keep track of the
+    // associated full stack id and profiled sizes. Can have more than one
+    // after trimming (e.g. when building from metadata). This is only placed on
+    // the last (root-most) trie node for each allocation context.
+    std::vector<ContextTotalSize> ContextSizeInfo;
     // Map of caller stack id to the corresponding child Trie node.
     std::map<uint64_t, CallStackTrieNode *> Callers;
-    CallStackTrieNode(AllocationType Type, uint64_t TotalSize)
-        : AllocTypes(static_cast<uint8_t>(Type)), TotalSize(TotalSize) {}
+    CallStackTrieNode(AllocationType Type)
+        : AllocTypes(static_cast<uint8_t>(Type)) {}
   };
 
   // The node for the allocation at the root.
@@ -75,6 +80,11 @@ class CallStackTrie {
     delete Node;
   }
 
+  // Recursively build up a complete list of context size information from the
+  // trie nodes reached from the given Node, for hinted size reporting.
+  void collectContextSizeInfo(CallStackTrieNode *Node,
+                              std::vector<ContextTotalSize> &ContextSizeInfo);
+
   // Recursive helper to trim contexts and create metadata nodes.
   bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
                      std::vector<uint64_t> &MIBCallStack,
@@ -93,7 +103,8 @@ class CallStackTrie {
   /// allocation call down to the bottom of the call stack (i.e. callee to
   /// caller order).
   void addCallStack(AllocationType AllocType, ArrayRef<uint64_t> StackIds,
-                    uint64_t TotalSize = 0);
+                    std::vector<ContextTotalSize> ContextSizeInfo =
+                        std::vector<ContextTotalSize>());
 
   /// Add the call stack context along with its allocation type from the MIB
   /// metadata to the Trie.
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 41a6447356c23b..130c92b28b3d5e 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -308,7 +308,7 @@ enum GlobalValueSummarySymtabCodes {
   FS_PERMODULE_CALLSITE_INFO = 26,
   // Summary of per-module allocation memprof metadata.
   // [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex),
-  // [nummib x total size]?]
+  // [nummib x (numcontext x contextsizeindex)]?]
   FS_PERMODULE_ALLOC_INFO = 27,
   // Summary of combined index memprof callsite metadata.
   // [valueid, numstackindices, numver,
@@ -317,9 +317,15 @@ enum GlobalValueSummarySymtabCodes {
   // Summary of combined index allocation memprof metadata.
   // [nummib, numver,
   //  nummib x (alloc type, numstackids, numstackids x stackidindex),
-  //  numver x version, [nummib x total size]?]
+  //  numver x version]
   FS_COMBINED_ALLOC_INFO = 29,
+  // List of all stack ids referenced by index in the callsite and alloc infos.
+  // [n x stack id]
   FS_STACK_IDS = 30,
+  // List of all (full stack id, total size) pairs optionally referenced by
+  // index from the alloc info records.
+  // [n x (full stack id, total size)]
+  FS_CONTEXT_SIZE_INFOS = 31,
 };
 
 enum MetadataCodes {
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 1cfe7c15f97dbc..ccb6c8473f23ee 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -302,6 +302,14 @@ template <> struct DenseMapInfo<ValueInfo> {
   static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.getRef(); }
 };
 
+// For optional hinted size reporting, holds a pair of the full stack id
+// (pre-trimming, from the full context in the profile), and the associated
+// total profiled size.
+struct ContextTotalSize {
+  uint64_t FullStackId;
+  uint64_t TotalSize;
+};
+
 /// Summary of memprof callsite metadata.
 struct CallsiteInfo {
   // Actual callee function.
@@ -408,9 +416,15 @@ struct AllocInfo {
   // Vector of MIBs in this memprof metadata.
   std::vector<MIBInfo> MIBs;
 
-  // If requested, keep track of total profiled sizes for each MIB. This will be
-  // a vector of the same length and order as the MIBs vector, if non-empty.
-  std::vector<uint64_t> TotalSizes;
+  // If requested, keep track of full stack contexts and total profiled sizes
+  // for each MIB. This will be a vector of the same length and order as the
+  // MIBs vector, if non-empty. Note that each MIB in the summary can have
+  // multiple of these as we trim the contexts when possible during matching.
+  // For hinted size reporting we, however, want the original pre-trimmed full
+  // stack context id for better correlation with the profile. Note that these
+  // are indexes into the ContextSizeInfos list in the index, to enable
+  // deduplication.
+  std::vector<std::vector<unsigned>> ContextSizeInfoIndices;
 
   AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
     Versions.push_back(0);
@@ -432,14 +446,21 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) {
   for (auto &M : AE.MIBs) {
     OS << "\t\t" << M << "\n";
   }
-  if (!AE.TotalSizes.empty()) {
-    OS << " TotalSizes per MIB:\n\t\t";
+  if (!AE.ContextSizeInfoIndices.empty()) {
+    OS << " ContextSizeInfo index per MIB:\n\t\t";
     First = true;
-    for (uint64_t TS : AE.TotalSizes) {
+    for (const auto &Indices : AE.ContextSizeInfoIndices) {
       if (!First)
         OS << ", ";
       First = false;
-      OS << TS << "\n";
+      bool FirstIndex = true;
+      for (uint64_t Index : Indices) {
+        if (!FirstIndex)
+          OS << ", ";
+        FirstIndex = false;
+        OS << Index;
+      }
+      OS << "\n";
     }
   }
   return OS;
@@ -1426,6 +1447,19 @@ class ModuleSummaryIndex {
   // built via releaseTemporaryMemory.
   DenseMap<uint64_t, unsigned> StackIdToIndex;
 
+  // List of unique ContextTotalSize structs (pair of the full stack id hash and
+  // its associated total profiled size). We use an index into this vector when
+  // referencing from the alloc summary to reduce the overall memory and size
+  // requirements, since often allocations may be duplicated due to inlining.
+  std::vector<ContextTotalSize> ContextSizeInfos;
+
+  // Temporary map while building the ContextSizeInfos list. Clear when index is
+  // completely built via releaseTemporaryMemory.
+  // Maps from full stack id to a map of total size to the assigned index.
+  // We need size in here too because due to stack truncation in the profile we
+  // can have the same full stack id and different sizes.
+  DenseMap<uint64_t, DenseMap<uint64_t, unsigned>> ContextToTotalSizeAndIndex;
+
   // YAML I/O support.
   friend yaml::MappingTraits<ModuleSummaryIndex>;
 
@@ -1470,6 +1504,9 @@ class ModuleSummaryIndex {
   size_t size() const { return GlobalValueMap.size(); }
 
   const std::vector<uint64_t> &stackIds() const { return StackIds; }
+  const std::vector<ContextTotalSize> &contextSizeInfos() const {
+    return ContextSizeInfos;
+  }
 
   unsigned addOrGetStackIdIndex(uint64_t StackId) {
     auto Inserted = StackIdToIndex.insert({StackId, StackIds.size()});
@@ -1483,15 +1520,36 @@ class ModuleSummaryIndex {
     return StackIds[Index];
   }
 
+  unsigned addOrGetContextSizeIndex(ContextTotalSize ContextSizeInfo) {
+    auto &Entry = ContextToTotalSizeAndIndex[ContextSizeInfo.FullStackId];
+    auto Inserted =
+        Entry.insert({ContextSizeInfo.TotalSize, ContextSizeInfos.size()});
+    if (Inserted.second)
+      ContextSizeInfos.push_back(
+          {ContextSizeInfo.FullStackId, ContextSizeInfo.TotalSize});
+    else
+      assert(Inserted.first->first == ContextSizeInfo.TotalSize);
+    return Inserted.first->second;
+  }
+
+  ContextTotalSize getContextSizeInfoAtIndex(unsigned Index) const {
+    assert(ContextSizeInfos.size() > Index);
+    return ContextSizeInfos[Index];
+  }
+
   // Facility to release memory from data structures only needed during index
-  // construction (including while building combined index). Currently this only
+  // construction (including while building combined index). Currently this
   // releases the temporary map used while constructing a correspondence between
-  // stack ids and their index in the StackIds vector. Mostly impactful when
+  // stack ids and their index in the StackIds vector, and a similar map used
+  // while constructing the ContextSizeInfos vector. Mostly impactful when
   // building a large combined index.
   void releaseTemporaryMemory() {
     assert(StackIdToIndex.size() == StackIds.size());
     StackIdToIndex.clear();
     StackIds.shrink_to_fit();
+    assert(ContextToTotalSizeAndIndex.size() == ContextSizeInfos.size());
+    ContextToTotalSizeAndIndex.clear();
+    ContextSizeInfos.shrink_to_fit();
   }
 
   /// Convenience function for doing a DFS on a ValueInfo. Marks the function in
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index 2b49dce17b7931..885f2e4d040143 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -99,12 +99,6 @@ AllocationType llvm::memprof::getMIBAllocType(const MDNode *MIB) {
   return AllocationType::NotCold;
 }
 
-uint64_t llvm::memprof::getMIBTotalSize(const MDNode *MIB) {
-  if (MIB->getNumOperands() < 3)
-    return 0;
-  return mdconst::dyn_extract<ConstantInt>(MIB->getOperand(2))->getZExtValue();
-}
-
 std::string llvm::memprof::getAllocTypeAttributeString(AllocationType Type) {
   switch (Type) {
   case AllocationType::NotCold:
@@ -135,22 +129,22 @@ bool llvm::memprof::hasSingleAllocType(uint8_t AllocTypes) {
   return NumAllocTypes == 1;
 }
 
-void CallStackTrie::addCallStack(AllocationType AllocType,
-                                 ArrayRef<uint64_t> StackIds,
-                                 uint64_t TotalSize) {
+void CallStackTrie::addCallStack(
+    AllocationType AllocType, ArrayRef<uint64_t> StackIds,
+    std::vector<ContextTotalSize> ContextSizeInfo) {
   bool First = true;
   CallStackTrieNode *Curr = nullptr;
   for (auto StackId : StackIds) {
-    // If this is the first stack frame, add or update alloc node.
+    // If this is the first stack frame, add or update the alloc node
+    // shared by all contexts for this allocation.
     if (First) {
       First = false;
       if (Alloc) {
         assert(AllocStackId == StackId);
         Alloc->AllocTypes |= static_cast<uint8_t>(AllocType);
-        Alloc->TotalSize += TotalSize;
       } else {
         AllocStackId = StackId;
-        Alloc = new CallStackTrieNode(AllocType, TotalSize);
+        Alloc = new CallStackTrieNode(AllocType);
       }
       Curr = Alloc;
       continue;
@@ -160,15 +154,18 @@ void CallStackTrie::addCallStack(AllocationType AllocType,
     if (Next != Curr->Callers.end()) {
       Curr = Next->second;
       Curr->AllocTypes |= static_cast<uint8_t>(AllocType);
-      Curr->TotalSize += TotalSize;
       continue;
     }
     // Otherwise add a new caller node.
-    auto *New = new CallStackTrieNode(AllocType, TotalSize);
+    auto *New = new CallStackTrieNode(AllocType);
     Curr->Callers[StackId] = New;
     Curr = New;
   }
   assert(Curr);
+  Curr->ContextSizeInfo.insert(Curr->ContextSizeInfo.end(),
+                               ContextSizeInfo.begin(), ContextSizeInfo.end());
+  // The accumulated size info for each context is gathered later via
+  // collectContextSizeInfo when building the MIB metadata.
 }
 
 void CallStackTrie::addCallStack(MDNode *MIB) {
@@ -181,21 +178,55 @@ void CallStackTrie::addCallStack(MDNode *MIB) {
     assert(StackId);
     CallStack.push_back(StackId->getZExtValue());
   }
-  addCallStack(getMIBAllocType(MIB), CallStack, getMIBTotalSize(MIB));
+  std::vector<ContextTotalSize> ContextSizeInfo;
+  // Collect the context size information if it exists.
+  if (MIB->getNumOperands() > 2) {
+    for (unsigned I = 2; I < MIB->getNumOperands(); I++) {
+      MDNode *ContextSizePair = cast<MDNode>(MIB->getOperand(I));
+      assert(ContextSizePair->getNumOperands() == 2);
+      uint64_t FullStackId =
+          mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(0))
+              ->getZExtValue();
+      uint64_t TotalSize =
+          mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(1))
+              ->getZExtValue();
+      ContextSizeInfo.push_back({FullStackId, TotalSize});
+    }
+  }
+  addCallStack(getMIBAllocType(MIB), CallStack, std::move(ContextSizeInfo));
 }
 
 static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
-                             AllocationType AllocType, uint64_t TotalSize) {
+                             AllocationType AllocType,
+                             ArrayRef<ContextTotalSize> ContextSizeInfo) {
   SmallVector<Metadata *> MIBPayload(
       {buildCallstackMetadata(MIBCallStack, Ctx)});
   MIBPayload.push_back(
       MDString::get(Ctx, getAllocTypeAttributeString(AllocType)));
-  if (TotalSize)
-    MIBPayload.push_back(ValueAsMetadata::get(
-        ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize)));
+  if (!ContextSizeInfo.empty()) {
+    for (auto Info : ContextSizeInfo) {
+      auto *FullStackIdMD = ValueAsMetadata::get(
+          ConstantInt::get(Type::getInt64Ty(Ctx), Info.FullStackId));
+      auto *TotalSizeMD = ValueAsMetadata::get(
+          ConstantInt::get(Type::getInt64Ty(Ctx), Info.TotalSize));
+      auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD});
+      MIBPayload.push_back(ContextSizeMD);
+    }
+  }
   return MDNode::get(Ctx, MIBPayload);
 }
 
+void CallStackTrie::collectContextSizeInfo(
+    CallStackTrieNode *Node, std::vector<ContextTotalSize> &ContextSizeInfo) {
+  ContextSizeInfo.insert(ContextSizeInfo.end(), Node->ContextSizeInfo.begin(),
+                         Node->ContextSizeInfo.end());
+  if (Node->Callers.empty())
+    return;
+  for (auto &Caller : Node->Callers) {
+    collectContextSizeInfo(Caller.second, ContextSizeInfo);
+  }
+}
+
 // Recursive helper to trim contexts and create metadata nodes.
 // Caller should have pushed Node's loc to MIBCallStack. Doing this in the
 // caller makes it simpler to handle the many early returns in this method.
@@ -206,8 +237,10 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
   // Trim context below the first node in a prefix with a single alloc type.
   // Add an MIB record for the current call stack prefix.
   if (hasSingleAllocType(Node->AllocTypes)) {
+    std::vector<ContextTotalSize> ContextSizeInfo;
+    collectContextSizeInfo(Node, ContextSizeInfo);
     MIBNodes.push_back(createMIBNode(
-        Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, Node->TotalSize));
+        Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo));
     return true;
   }
 
@@ -243,8 +276,10 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
   // non-cold allocation type.
   if (!CalleeHasAmbiguousCallerContext)
     return false;
+  std::vector<ContextTotalSize> ContextSizeInfo;
+  collectContextSizeInfo(Node, ContextSizeInfo);
   MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold,
-                                   Node->TotalSize));
+                                   ContextSizeInfo));
   return true;
 }
 
@@ -256,11 +291,14 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
   if (hasSingleAllocType(Alloc->AllocTypes)) {
     addAllocTypeAttribute(Ctx, CI, (AllocationType)Alloc->AllocTypes);
     if (MemProfReportHintedSizes) {
-      assert(Alloc->TotalSize);
-      errs() << "Total size for allocation with location hash " << AllocStackId
-             << " and single alloc type "
-             << getAllocTypeAttributeString((AllocationType)Alloc->AllocTypes)
-             << ": " << Alloc->TotalSize << "\n";
+      std::vector<ContextTotalSize> ContextSizeInfo;
+      collectContextSizeInfo(Alloc, ContextSizeInfo);
+      for (const auto &Info : ContextSizeInfo) {
+        errs() << "Total size for full allocation context hash "
+               << Info.FullStackId << " and single alloc type "
+               << getAllocTypeAttributeString((AllocationType)Alloc->AllocTypes)
+               << ": " << Info.TotalSize << "\n";
+      }
     }
     return false;
   }
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 004e8b76a3c851..3273de51a79d9f 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -523,6 +523,7 @@ static void computeFunctionSummary(
       if (MemProfMD) {
         std::vector<MIBInfo> MIBs;
         std::vector<uint64_t> TotalSizes;
+        std::vector<std::vector<unsigned>> ContextSizeInfoIndices;
         for (auto &MDOp : MemProfMD->operands()) {
           auto *MIBMD = cast<const MDNode>(MDOp);
           MDNode *StackNode = getMIBStackNode(MIBMD);
@@ -540,18 +541,34 @@ static void computeFunctionSummary(
             if (StackIdIndices.empty() || StackIdIndices.back() != StackIdIdx)
               StackIdIndices.push_back(StackIdIdx);
           }
+          // If we have context size information, collect it for inclusion in
+          // the summary.
+          assert(MIBMD->getNumOperands() > 2 || !MemProfReportHintedSizes);
+          if (MIBMD->getNumOperands() > 2) {
+            std::vector<unsigned> ContextSizeIndices;
+            for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
+              MDNode *ContextSizePair = cast<MDNode>(MIBMD->getOperand(I));
+              assert(ContextSizePair->getNumOperands() == 2);
+              uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
+                                         ContextSizePair->getOperand(0))
+                                         ->getZExtValue();
+              uint64_t TS = mdconst::dyn_extract<ConstantInt>(
+                                ContextSizePair->getOperand(1))
+                                ->getZExtValue();
+              ContextSizeIndices.push_back(
+                  Index.addOrGetContextSizeIndex({FullStackId, TS}));
+            }
+            ContextSizeInfoIndices.push_back(std::move(ContextSizeIndices));
+          }
           MIBs.push_back(
               MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices)));
-          if (MemProfReportHintedSizes) {
-            auto TotalSize = getMIBTotalSize(MIBMD);
-            assert(TotalSize);
-            TotalSizes.push_back(TotalSize);
-          }
         }
         Allocs.push_back(AllocInfo(std::move(MIBs)));
-        if (MemProfReportHintedSizes) {
-          assert(Allocs.back().MIBs.size() == TotalSizes.size());
-          Allocs.back().TotalSizes = std::move(TotalSizes);
+        assert(!ContextSizeInfoIndices.empty() || !MemProfReportHintedSizes);
+        if (!ContextSizeInfoIndices.empty()) {
+          assert(Allocs.back().MIBs.size() == ContextSizeInfoIndices.size());
+          Allocs.back().ContextSizeInfoIndices =
+              std::move(ContextSizeInfoIndices);
         }
       } else if (!InstCallsite.empty()) {
         SmallVector<unsigned> StackIdIndices;
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index b7ed9cdf631454..419df0d78e1796 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -328,6 +328,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO)
       STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
       STRINGIFY_CODE(FS, STACK_IDS)
+      STRINGIFY_CODE(FS, CONTEXT_SIZE_INFOS)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 446c98c8cecd88..6d95f68f328baf 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -987,6 +987,11 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   /// ids from the lists in the callsite and alloc entries to the index.
   std::vector<uint64_t> StackIds;
 
+  // Saves the context total size information from the CONTEXT_SIZE_INFOS record
+  // to consult when adding this from the lists in the alloc entries to the
+  // index.
+  std::vector<ContextTotalSize> ContextSizeInfos;
+
 public:
   ModuleSummaryIndexBitcodeReader(
       BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex,
@@ -7997,6 +8002,14 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       break;
     }
 
+    case bitc::FS_CONTEXT_SIZE_INFOS: { // [n x (fullstackid, totalsize)]
+      // Save context size infos in the reader to consult when adding them from
+      // the lists in the alloc node entries.
+      for (auto R = Record.begin(); R != Record.end(); R += 2)
+        ContextSizeInfos.push_back({*R, *(R + 1)});
+      break;
+    }
+
     case bitc::FS_PERMODULE_CALLSITE_INFO: {
       unsigned ValueID = Record[0];
       SmallVector<unsigned> StackIdList;
@@ -8052,18 +8065,30 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
         }
         MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
       }
-      std::vector<uint64_t> TotalSizes;
-      // We either have no sizes or NumMIBs of them.
-      assert(I == Record.size() || Record.size() - I == NumMIBs);
+      // We either have nothing left or at least NumMIBs context size info
+      // indices left.
+      assert(I == Record.size() || Record.size() - I >= NumMIBs);
+      std::vector<std::vector<unsigned>> AllContextSizeIndices;
       if (I < Record.size()) {
         MIBsRead = 0;
-        while (MIBsRead++ < NumMIBs)
-          TotalSizes.push_back(Record[I++]);
+        while (MIBsRead++ < NumMIBs) {
+          unsigned NumContextSizeInfoEntries = Record[I++];
+          assert(Record.size() - I >= NumContextSizeInfoEntries);
+          std::vector<unsigned> ContextSizeIndices;
+          for (unsigned J = 0; J < NumContextSizeInfoEntries; J++) {
+            assert(Record[I] < ContextSizeInfos.size());
+            ContextSizeIndices.push_back(TheIndex.addOrGetContextSizeIndex(
+                ContextSizeInfos[Record[I++]]));
+          }
+          AllContextSizeIndices.push_back(std::move(ContextSizeIndices));
+        }
       }
       PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
-      if (!TotalSizes.empty()) {
-        assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
-        PendingAllocs.back().TotalSizes = std::move(TotalSizes);
+      if (!AllContextSizeIndices.empty()) {
+        assert(PendingAllocs.back().MIBs.size() ==
+               AllContextSizeIndices.size());
+        PendingAllocs.back().ContextSizeInfoIndices =
+            std::move(AllContextSizeIndices);
       }
       break;
     }
@@ -8091,21 +8116,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       SmallVector<uint8_t> Versions;
       for (unsigned J = 0; J < NumVersions; J++)
         Versions.push_back(Record[I++]);
-      std::vector<uint64_t> TotalSizes;
-      // We either have no sizes or NumMIBs of them.
-      assert(I == Record.size() || Record.size() - I == NumMIBs);
-      if (I < Record.size()) {
-        MIBsRead = 0;
-        while (MIBsRead++ < NumMIBs) {
-          TotalSizes.push_back(Record[I++]);
-        }
-      }
+      assert(I == Record.size());
       PendingAllocs.push_back(
           AllocInfo(std::move(Versions), std::move(MIBs)));
-      if (!TotalSizes.empty()) {
-        assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
-        PendingAllocs.back().TotalSizes = std::move(TotalSizes);
-      }
       break;
     }
     }
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index ee9cc4b6e0c0eb..867470426962dc 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4195,7 +4195,8 @@ static void writeFunctionHeapProfileRecords(
     BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev,
     unsigned AllocAbbrev, bool PerModule,
     std::function<unsigned(const ValueInfo &VI)> GetValueID,
-    std::function<unsigned(unsigned)> GetStackIndex) {
+    std::function<unsigned(unsigned)> GetStackIndex,
+    bool WriteContextSizeInfoIndex) {
   SmallVector<uint64_t> Record;
 
   for (auto &CI : FS->callsites()) {
@@ -4237,10 +4238,14 @@ static void writeFunctionHeapProfileRecords(
       for (auto V : AI.Versions)
         Record.push_back(V);
     }
-    assert(AI.TotalSizes.empty() || AI.TotalSizes.size() == AI.MIBs.size());
-    if (!AI.TotalSizes.empty()) {
-      for (auto Size : AI.TotalSizes)
-        Record.push_back(Size);
+    assert(AI.ContextSizeInfoIndices.empty() ||
+           AI.ContextSizeInfoIndices.size() == AI.MIBs.size());
+    if (WriteContextSizeInfoIndex && !AI.ContextSizeInfoIndices.empty()) {
+      for (const auto &Indices : AI.ContextSizeInfoIndices) {
+        Record.push_back(Indices.size());
+        for (auto Id : Indices)
+          Record.push_back(Id);
+      }
     }
     Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
                                 : bitc::FS_COMBINED_ALLOC_INFO,
@@ -4267,7 +4272,8 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
       Stream, FS, CallsiteAbbrev, AllocAbbrev,
       /*PerModule*/ true,
       /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); },
-      /*GetStackIndex*/ [&](unsigned I) { return I; });
+      /*GetStackIndex*/ [&](unsigned I) { return I; },
+      /*WriteContextSizeInfoIndex*/ true);
 
   auto SpecialRefCnts = FS->specialRefCounts();
   NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
@@ -4404,6 +4410,24 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     Stream.EmitRecord(bitc::FS_STACK_IDS, Index->stackIds(), StackIdAbbvId);
   }
 
+  SmallVector<uint64_t, 64> NameVals;
+  if (!Index->contextSizeInfos().empty()) {
+    auto ContextSizeInfoAbbv = std::make_shared<BitCodeAbbrev>();
+    ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_SIZE_INFOS));
+    // numids x (fullStackid, totalsize)
+    ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+    unsigned ContextSizeInfoAbbvId =
+        Stream.EmitAbbrev(std::move(ContextSizeInfoAbbv));
+    for (const auto &Info : Index->contextSizeInfos()) {
+      NameVals.push_back(Info.FullStackId);
+      NameVals.push_back(Info.TotalSize);
+    }
+    Stream.EmitRecord(bitc::FS_CONTEXT_SIZE_INFOS, NameVals,
+                      ContextSizeInfoAbbvId);
+    NameVals.clear();
+  }
+
   // Abbrev for FS_PERMODULE_PROFILE.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
@@ -4489,7 +4513,6 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
-  SmallVector<uint64_t, 64> NameVals;
   // Iterate over the list of functions instead of the Index to
   // ensure the ordering is stable.
   for (const Function &F : M) {
@@ -4757,7 +4780,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
           // the case of distributed indexes).
           assert(StackIdIndicesToIndex.contains(I));
           return StackIdIndicesToIndex[I];
-        });
+        },
+        /*WriteContextSizeInfoIndex*/ false);
 
     NameVals.push_back(*ValueId);
     assert(ModuleIdMap.count(FS->modulePath()));
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index ee807ca13787d5..61f9c0cfe69f2b 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4998,14 +4998,35 @@ void Verifier::visitMemProfMetadata(Instruction &I, MDNode *MD) {
     MDNode *StackMD = dyn_cast<MDNode>(MIB->getOperand(0));
     visitCallStackMetadata(StackMD);
 
-    // Check that remaining operands, except possibly the last, are MDString.
-    Check(llvm::all_of(MIB->operands().drop_front().drop_back(),
-                       [](const MDOperand &Op) { return isa<MDString>(Op); }),
-          "Not all !memprof MemInfoBlock operands 1 to N-1 are MDString", MIB);
-    // The last operand might be the total profiled size so can be an integer.
-    auto &LastOperand = MIB->operands().back();
-    Check(isa<MDString>(LastOperand) || mdconst::hasa<ConstantInt>(LastOperand),
-          "Last !memprof MemInfoBlock operand not MDString or int", MIB);
+    // The next set of 1 or more operands should be MDString.
+    unsigned I = 1;
+    for (; I < MIB->getNumOperands(); ++I) {
+      if (!isa<MDString>(MIB->getOperand(I))) {
+        Check(I > 1,
+              "!memprof MemInfoBlock second operand should be an MDString",
+              MIB);
+        break;
+      }
+    }
+
+    // Any remaining operands should be MDNodes that are pairs of integers.
+    for (; I < MIB->getNumOperands(); ++I) {
+      MDNode *OpNode = dyn_cast<MDNode>(MIB->getOperand(I));
+      Check(OpNode, "Not all !memprof MemInfoBlock operands 2 to N are MDNode",
+            MIB);
+      Check(OpNode->getNumOperands() == 2,
+            "Not all !memprof MemInfoBlock operands 2 to N are MDNode with 2 "
+            "operands",
+            MIB);
+      // Check that all of Op's operands are ConstantInt.
+      Check(llvm::all_of(OpNode->operands(),
+                         [](const MDOperand &Op) {
+                           return mdconst::hasa<ConstantInt>(Op);
+                         }),
+            "Not all !memprof MemInfoBlock operands 2 to N are MDNode with "
+            "ConstantInt operands",
+            MIB);
+    }
   }
 }
 
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index da5ded23ecc045..6d5b790e5f6793 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -475,7 +475,8 @@ class CallsiteContextGraph {
   void addStackNodesForMIB(ContextNode *AllocNode,
                            CallStack<NodeT, IteratorT> &StackContext,
                            CallStack<NodeT, IteratorT> &CallsiteContext,
-                           AllocationType AllocType, uint64_t TotalSize);
+                           AllocationType AllocType,
+                           ArrayRef<ContextTotalSize> ContextSizeInfo);
 
   /// Matches all callsite metadata (or summary) to the nodes created for
   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
@@ -705,9 +706,10 @@ class CallsiteContextGraph {
   /// Map from each context ID to the AllocationType assigned to that context.
   DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
 
-  /// Map from each contextID to the profiled aggregate allocation size,
+  /// Map from each contextID to the profiled full contexts and their total
+  /// sizes (there may be more than one due to context trimming),
   /// optionally populated when requested (via MemProfReportHintedSizes).
-  DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
+  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
 
   /// Identifies the context node created for a stack id when adding the MIB
   /// contexts to the graph. This is used to locate the context nodes when
@@ -1203,8 +1205,7 @@ template <class NodeT, class IteratorT>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
-    uint64_t TotalSize) {
-  assert(!MemProfReportHintedSizes || TotalSize > 0);
+    ArrayRef<ContextTotalSize> ContextSizeInfo) {
   // Treating the hot alloc type as NotCold before the disambiguation for "hot"
   // is done.
   if (AllocType == AllocationType::Hot)
@@ -1213,8 +1214,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
   ContextIdToAllocationType[++LastContextId] = AllocType;
 
   if (MemProfReportHintedSizes) {
-    assert(TotalSize);
-    ContextIdToTotalSize[LastContextId] = TotalSize;
+    assert(!ContextSizeInfo.empty());
+    auto &Entry = ContextIdToContextSizeInfos[LastContextId];
+    Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
   }
 
   // Update alloc type and context ids for this MIB.
@@ -1259,10 +1261,6 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
     assert(ContextIdToAllocationType.count(OldId));
     // The new context has the same allocation type as original.
     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
-    // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
-    // up the size. Assume that if we are able to duplicate context ids that we
-    // will be able to disambiguate all copies.
-    ContextIdToTotalSize[LastContextId] = 0;
   }
   return NewContextIds;
 }
@@ -1961,12 +1959,28 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
           // Add all of the MIBs and their stack nodes.
           for (auto &MDOp : MemProfMD->operands()) {
             auto *MIBMD = cast<const MDNode>(MDOp);
+            std::vector<ContextTotalSize> ContextSizeInfo;
+            // Collect the context size information if it exists.
+            if (MIBMD->getNumOperands() > 2) {
+              for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
+                MDNode *ContextSizePair =
+                    dyn_cast<MDNode>(MIBMD->getOperand(I));
+                assert(ContextSizePair->getNumOperands() == 2);
+                uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
+                                           ContextSizePair->getOperand(0))
+                                           ->getZExtValue();
+                uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
+                                         ContextSizePair->getOperand(1))
+                                         ->getZExtValue();
+                ContextSizeInfo.push_back({FullStackId, TotalSize});
+              }
+            }
             MDNode *StackNode = getMIBStackNode(MIBMD);
             assert(StackNode);
             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
                 AllocNode, StackContext, CallsiteContext,
-                getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
+                getMIBAllocType(MIBMD), ContextSizeInfo);
           }
           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
           // Memprof and callsite metadata on memory allocations no longer
@@ -2042,17 +2056,21 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
               EmptyContext;
           unsigned I = 0;
           assert(!MemProfReportHintedSizes ||
-                 AN.TotalSizes.size() == AN.MIBs.size());
+                 AN.ContextSizeInfoIndices.size() == AN.MIBs.size());
           // Now add all of the MIBs and their stack nodes.
           for (auto &MIB : AN.MIBs) {
             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                 StackContext(&MIB);
-            uint64_t TotalSize = 0;
-            if (MemProfReportHintedSizes)
-              TotalSize = AN.TotalSizes[I];
+            std::vector<ContextTotalSize> ContextSizeInfo;
+            if (MemProfReportHintedSizes) {
+              for (auto Id : AN.ContextSizeInfoIndices[I]) {
+                auto Info = Index.getContextSizeInfoAtIndex(Id);
+                ContextSizeInfo.push_back({Info.FullStackId, Info.TotalSize});
+              }
+            }
             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                 AllocNode, StackContext, EmptyContext, MIB.AllocType,
-                TotalSize);
+                ContextSizeInfo);
             I++;
           }
           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
@@ -2824,13 +2842,17 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
     std::sort(SortedIds.begin(), SortedIds.end());
     for (auto Id : SortedIds) {
-      auto SizeI = ContextIdToTotalSize.find(Id);
-      assert(SizeI != ContextIdToTotalSize.end());
       auto TypeI = ContextIdToAllocationType.find(Id);
       assert(TypeI != ContextIdToAllocationType.end());
-      OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
-         << " with total size " << SizeI->second << " is "
-         << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+      auto CSI = ContextIdToContextSizeInfos.find(Id);
+      if (CSI != ContextIdToContextSizeInfos.end()) {
+        for (auto &Info : CSI->second) {
+          OS << getAllocTypeString((uint8_t)TypeI->second)
+             << " full allocation context " << Info.FullStackId
+             << " with total size " << Info.TotalSize << " is "
+             << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+        }
+      }
     }
   }
 }
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 4a43120c9a9e7f..42c01fe832572e 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -716,19 +716,22 @@ computeFullStackId(const std::vector<memprof::Frame> &CallStack) {
 }
 
 static AllocationType addCallStack(CallStackTrie &AllocTrie,
-                                   const AllocationInfo *AllocInfo) {
+                                   const AllocationInfo *AllocInfo,
+                                   uint64_t FullStackId) {
   SmallVector<uint64_t> StackIds;
   for (const auto &StackFrame : AllocInfo->CallStack)
     StackIds.push_back(computeStackId(StackFrame));
   auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
                                 AllocInfo->Info.getAllocCount(),
                                 AllocInfo->Info.getTotalLifetime());
-  uint64_t TotalSize = 0;
+  std::vector<ContextTotalSize> ContextSizeInfo;
   if (MemProfReportHintedSizes) {
-    TotalSize = AllocInfo->Info.getTotalSize();
+    auto TotalSize = AllocInfo->Info.getTotalSize();
     assert(TotalSize);
+    assert(FullStackId != 0);
+    ContextSizeInfo.push_back({FullStackId, TotalSize});
   }
-  AllocTrie.addCallStack(AllocType, StackIds, TotalSize);
+  AllocTrie.addCallStack(AllocType, StackIds, std::move(ContextSizeInfo));
   return AllocType;
 }
 
@@ -964,11 +967,14 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
           if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                                  InlinedCallStack)) {
             NumOfMemProfMatchedAllocContexts++;
-            auto AllocType = addCallStack(AllocTrie, AllocInfo);
+            uint64_t FullStackId = 0;
+            if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes)
+              FullStackId = computeFullStackId(AllocInfo->CallStack);
+            auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
             // Record information about the allocation if match info printing
             // was requested.
             if (ClPrintMemProfMatchInfo) {
-              auto FullStackId = computeFullStackId(AllocInfo->CallStack);
+              assert(FullStackId != 0);
               FullStackIdToAllocMatchInfo[FullStackId] = {
                   AllocInfo->Info.getTotalSize(), AllocType, /*Matched=*/true};
             }
diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll
index 42819d5421ca0f..3e8aa9766d6c5f 100644
--- a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll
+++ b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll
@@ -62,9 +62,9 @@ attributes #0 = { noinline optnone }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold"}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold"}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll
index 663f8525043c2f..9169cc03d08d6f 100644
--- a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll
+++ b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll
@@ -62,9 +62,9 @@ attributes #0 = { noinline optnone }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold"}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold"}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index 6922dbfd368467..96d5459c78793e 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -128,13 +128,16 @@ attributes #0 = { noinline optnone }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold", !10}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold", !11, !12}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
 !9 = !{i64 2732490490862098848}
+!10 = !{i64 123, i64 100}
+!11 = !{i64 456, i64 200}
+!12 = !{i64 789, i64 300}
 
 
 ; DUMP: CCG before cloning:
@@ -267,8 +270,9 @@ attributes #0 = { noinline optnone }
 ; DUMP: 		Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2
 ; DUMP:		Clone of [[BAR]]
 
-; SIZES: NotCold context 1 with total size 100 is NotCold after cloning
-; SIZES: Cold context 2 with total size 400 is Cold after cloning
+; SIZES: NotCold full allocation context 123 with total size 100 is NotCold after cloning
+; SIZES: Cold full allocation context 456 with total size 200 is Cold after cloning
+; SIZES: Cold full allocation context 789 with total size 300 is Cold after cloning
 
 ; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
 ; REMARKS: created clone _Z3barv.memprof.1
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll
index 8f9df20471e41c..c2810dfabffbd7 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll
@@ -66,9 +66,9 @@ attributes #6 = { builtin }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold"}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold"}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll
index c3c164d4928632..068e1f116519e8 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll
@@ -66,9 +66,9 @@ attributes #6 = { builtin }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold"}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold"}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
index a82f872d51c7d5..952e2519bbf0b3 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
@@ -106,13 +106,16 @@ attributes #6 = { builtin }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold", !10}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold", !11, !12}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
 !9 = !{i64 2732490490862098848}
+!10 = !{i64 123, i64 100}
+!11 = !{i64 456, i64 200}
+!12 = !{i64 789, i64 300}
 
 
 ; DUMP: CCG before cloning:
@@ -249,8 +252,9 @@ attributes #6 = { builtin }
 ; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv
 ; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
 
-; SIZES: NotCold context 1 with total size 100 is NotCold after cloning
-; SIZES: Cold context 2 with total size 400 is Cold after cloning
+; SIZES: NotCold full allocation context 123 with total size 100 is NotCold after cloning
+; SIZES: Cold full allocation context 456 with total size 200 is Cold after cloning
+; SIZES: Cold full allocation context 789 with total size 300 is Cold after cloning
 
 ; IR: define {{.*}} @main
 ;; The first call to foo does not allocate cold memory. It should call the
diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll
index e1457ca7251ed8..d6c86bb7ad5a8a 100644
--- a/llvm/test/Transforms/PGOProfile/memprof.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof.ll
@@ -335,17 +335,24 @@ for.end:                                          ; preds = %for.cond
 ; MEMPROF: ![[C11]] = !{i64 1544787832369987002}
 
 ;; For non-context sensitive allocations that get attributes we emit a message
-;; with the allocation hash, type, and size in bytes.
-; TOTALSIZES: Total size for allocation with location hash 6792096022461663180 and single alloc type notcold: 10
-; TOTALSIZES: Total size for allocation with location hash 15737101490731057601 and single alloc type cold: 10
-;; For context sensitive allocations the size in bytes is included on the MIB
-;; metadata.
-; TOTALSIZES: !"cold", i64 10}
-; TOTALSIZES: !"cold", i64 10}
-; TOTALSIZES: !"notcold", i64 10}
-; TOTALSIZES: !"cold", i64 20}
-; TOTALSIZES: !"notcold", i64 10}
-
+;; with the full allocation context hash, type, and size in bytes.
+; TOTALSIZES: Total size for full allocation context hash 6792096022461663180 and single alloc type notcold: 10
+; TOTALSIZES: Total size for full allocation context hash 15737101490731057601 and single alloc type cold: 10
+;; For context sensitive allocations the full context hash and size in bytes
+;; are in separate metadata nodes included on the MIB metadata.
+; TOTALSIZES: !"cold", ![[CONTEXT1:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT1]] = !{i64 8525406123785421946, i64 10}
+; TOTALSIZES: !"cold", ![[CONTEXT2:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT2]] = !{i64 -6732513409544482918, i64 10}
+; TOTALSIZES: !"notcold", ![[CONTEXT3:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT3]] = !{i64 5725971306423925017, i64 10}
+;; There can be more than one context id / size pair due to context trimming
+;; when we match.
+; TOTALSIZES: !"cold", ![[CONTEXT4:[0-9]+]], ![[CONTEXT5:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT4]] = !{i64 -2103941543456458045, i64 10}
+; TOTALSIZES: ![[CONTEXT5]] = !{i64 -191931298737547222, i64 10}
+; TOTALSIZES: !"notcold", ![[CONTEXT6:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT6]] = !{i64 1093248920606587996, i64 10}
 
 ; MEMPROFNOCOLINFO: #[[A1]] = { builtin allocsize(0) "memprof"="notcold" }
 ; MEMPROFNOCOLINFO: #[[A2]] = { builtin allocsize(0) "memprof"="cold" }
diff --git a/llvm/test/Verifier/memprof-metadata-bad.ll b/llvm/test/Verifier/memprof-metadata-bad.ll
index f4f1f6bb0a4635..b8c2c2d8a2c993 100644
--- a/llvm/test/Verifier/memprof-metadata-bad.ll
+++ b/llvm/test/Verifier/memprof-metadata-bad.ll
@@ -43,7 +43,7 @@ declare dso_local noalias noundef ptr @malloc(i64 noundef)
 !6 = !{i64 0}
 !7 = !{!8}
 ; CHECK: call stack metadata should have at least 1 operand
-; CHECK: Not all !memprof MemInfoBlock operands 1 to N-1 are MDString
+; CHECK: Not all !memprof MemInfoBlock operands 2 to N are MDNode
 !8 = !{!0, !"default", i64 0, i64 5}
 !9 = !{i64 123}
 ; CHECK: call stack metadata operand should be constant integer

>From cb1c09834221bd1e25ad6740a646f86b3fb150d9 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 31 Oct 2024 14:11:51 -0700
Subject: [PATCH 2/4] clang format

---
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 6d95f68f328baf..11506b8e246ac6 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -8117,8 +8117,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       for (unsigned J = 0; J < NumVersions; J++)
         Versions.push_back(Record[I++]);
       assert(I == Record.size());
-      PendingAllocs.push_back(
-          AllocInfo(std::move(Versions), std::move(MIBs)));
+      PendingAllocs.push_back(AllocInfo(std::move(Versions), std::move(MIBs)));
       break;
     }
     }

>From 8d960cf6ea6bad88599ff0d4d0d9438c2fc60474 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 7 Nov 2024 11:32:24 -0800
Subject: [PATCH 3/4] Address comments

---
 .../include/llvm/Analysis/MemoryProfileInfo.h |  3 +--
 llvm/include/llvm/Bitcode/LLVMBitCodes.h      |  2 +-
 llvm/include/llvm/IR/ModuleSummaryIndex.h     |  2 +-
 llvm/lib/Analysis/MemoryProfileInfo.cpp       | 23 ++++++++-----------
 llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp   |  2 +-
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     |  5 ++--
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     | 12 +++++-----
 .../IPO/MemProfContextDisambiguation.cpp      |  3 ++-
 8 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index 55889c841b283e..b46124a4ed0d56 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -103,8 +103,7 @@ class CallStackTrie {
   /// allocation call down to the bottom of the call stack (i.e. callee to
   /// caller order).
   void addCallStack(AllocationType AllocType, ArrayRef<uint64_t> StackIds,
-                    std::vector<ContextTotalSize> ContextSizeInfo =
-                        std::vector<ContextTotalSize>());
+                    std::vector<ContextTotalSize> ContextSizeInfo = {});
 
   /// Add the call stack context along with its allocation type from the MIB
   /// metadata to the Trie.
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 130c92b28b3d5e..1480e8e3745dd2 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -325,7 +325,7 @@ enum GlobalValueSummarySymtabCodes {
   // List of all (full stack id, total size) pairs optionally referenced by
   // index from the alloc info records.
   // [n x (full stack id, total size)]
-  FS_CONTEXT_SIZE_INFOS = 31,
+  FS_CONTEXT_SIZE_INFO = 31,
 };
 
 enum MetadataCodes {
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index ccb6c8473f23ee..f9b9cef652ad85 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -1541,7 +1541,7 @@ class ModuleSummaryIndex {
   // construction (including while building combined index). Currently this
   // releases the temporary map used while constructing a correspondence between
   // stack ids and their index in the StackIds vector, and a similar map used
-  // while constructing a the ContextSizeInfos vector. Mostly impactful when
+  // while constructing the ContextSizeInfos vector. Mostly impactful when
   // building a large combined index.
   void releaseTemporaryMemory() {
     assert(StackIdToIndex.size() == StackIds.size());
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index 885f2e4d040143..85aadefb96e056 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -135,7 +135,6 @@ void CallStackTrie::addCallStack(
   bool First = true;
   CallStackTrieNode *Curr = nullptr;
   for (auto StackId : StackIds) {
-    // errs() << StackId << " ";
     //  If this is the first stack frame, add or update alloc node.
     if (First) {
       First = false;
@@ -204,11 +203,11 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
   MIBPayload.push_back(
       MDString::get(Ctx, getAllocTypeAttributeString(AllocType)));
   if (!ContextSizeInfo.empty()) {
-    for (auto Info : ContextSizeInfo) {
+    for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
       auto *FullStackIdMD = ValueAsMetadata::get(
-          ConstantInt::get(Type::getInt64Ty(Ctx), Info.FullStackId));
+          ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId));
       auto *TotalSizeMD = ValueAsMetadata::get(
-          ConstantInt::get(Type::getInt64Ty(Ctx), Info.TotalSize));
+          ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize));
       auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD});
       MIBPayload.push_back(ContextSizeMD);
     }
@@ -220,11 +219,8 @@ void CallStackTrie::collectContextSizeInfo(
     CallStackTrieNode *Node, std::vector<ContextTotalSize> &ContextSizeInfo) {
   ContextSizeInfo.insert(ContextSizeInfo.end(), Node->ContextSizeInfo.begin(),
                          Node->ContextSizeInfo.end());
-  if (Node->Callers.empty())
-    return;
-  for (auto &Caller : Node->Callers) {
+  for (auto &Caller : Node->Callers)
     collectContextSizeInfo(Caller.second, ContextSizeInfo);
-  }
 }
 
 // Recursive helper to trim contexts and create metadata nodes.
@@ -293,11 +289,12 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
     if (MemProfReportHintedSizes) {
       std::vector<ContextTotalSize> ContextSizeInfo;
       collectContextSizeInfo(Alloc, ContextSizeInfo);
-      for (const auto &Info : ContextSizeInfo) {
-        errs() << "Total size for full allocation context hash "
-               << Info.FullStackId << " and single alloc type "
-               << getAllocTypeAttributeString((AllocationType)Alloc->AllocTypes)
-               << ": " << Info.TotalSize << "\n";
+      for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
+        errs()
+            << "MemProf hinting: Total size for full allocation context hash "
+            << FullStackId << " and single alloc type "
+            << getAllocTypeAttributeString((AllocationType)Alloc->AllocTypes)
+            << ": " << TotalSize << "\n";
       }
     }
     return false;
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 419df0d78e1796..e729099de175a8 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -328,7 +328,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO)
       STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
       STRINGIFY_CODE(FS, STACK_IDS)
-      STRINGIFY_CODE(FS, CONTEXT_SIZE_INFOS)
+      STRINGIFY_CODE(FS, CONTEXT_SIZE_INFO)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 11506b8e246ac6..131a0454a68ac4 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -987,7 +987,7 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   /// ids from the lists in the callsite and alloc entries to the index.
   std::vector<uint64_t> StackIds;
 
-  // Saves the context total size information from the CONTEXT_SIZE_INFOS record
+  // Saves the context total size information from the CONTEXT_SIZE_INFO record
   // to consult when adding this from the lists in the alloc entries to the
   // index.
   std::vector<ContextTotalSize> ContextSizeInfos;
@@ -8002,7 +8002,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       break;
     }
 
-    case bitc::FS_CONTEXT_SIZE_INFOS: { // [n x (fullstackid, totalsize)]
+    case bitc::FS_CONTEXT_SIZE_INFO: { // [n x (fullstackid, totalsize)]
       // Save context size infos in the reader to consult when adding them from
       // the lists in the alloc node entries.
       for (auto R = Record.begin(); R != Record.end(); R += 2)
@@ -8075,6 +8075,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
           unsigned NumContextSizeInfoEntries = Record[I++];
           assert(Record.size() - I >= NumContextSizeInfoEntries);
           std::vector<unsigned> ContextSizeIndices;
+          ContextSizeIndices.reserve(NumContextSizeInfoEntries);
           for (unsigned J = 0; J < NumContextSizeInfoEntries; J++) {
             assert(Record[I] < ContextSizeInfos.size());
             ContextSizeIndices.push_back(TheIndex.addOrGetContextSizeIndex(
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 867470426962dc..fd911a114427cc 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4241,7 +4241,7 @@ static void writeFunctionHeapProfileRecords(
     assert(AI.ContextSizeInfoIndices.empty() ||
            AI.ContextSizeInfoIndices.size() == AI.MIBs.size());
     if (WriteContextSizeInfoIndex && !AI.ContextSizeInfoIndices.empty()) {
-      for (auto Indices : AI.ContextSizeInfoIndices) {
+      for (auto &Indices : AI.ContextSizeInfoIndices) {
         Record.push_back(Indices.size());
         for (auto Id : Indices)
           Record.push_back(Id);
@@ -4413,17 +4413,17 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   SmallVector<uint64_t, 64> NameVals;
   if (!Index->contextSizeInfos().empty()) {
     auto ContextSizeInfoAbbv = std::make_shared<BitCodeAbbrev>();
-    ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_SIZE_INFOS));
+    ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_SIZE_INFO));
     // numids x (fullStackid, totalsize)
     ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
     ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
     unsigned ContextSizeInfoAbbvId =
         Stream.EmitAbbrev(std::move(ContextSizeInfoAbbv));
-    for (const auto &Info : Index->contextSizeInfos()) {
-      NameVals.push_back(Info.FullStackId);
-      NameVals.push_back(Info.TotalSize);
+    for (const auto &[FullStackId, TotalSize] : Index->contextSizeInfos()) {
+      NameVals.push_back(FullStackId);
+      NameVals.push_back(TotalSize);
     }
-    Stream.EmitRecord(bitc::FS_CONTEXT_SIZE_INFOS, NameVals,
+    Stream.EmitRecord(bitc::FS_CONTEXT_SIZE_INFO, NameVals,
                       ContextSizeInfoAbbvId);
     NameVals.clear();
   }
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 6d5b790e5f6793..677ad78ab52082 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -2847,7 +2847,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
       auto CSI = ContextIdToContextSizeInfos.find(Id);
       if (CSI != ContextIdToContextSizeInfos.end()) {
         for (auto &Info : CSI->second) {
-          OS << getAllocTypeString((uint8_t)TypeI->second)
+          OS << "MemProf hinting: "
+             << getAllocTypeString((uint8_t)TypeI->second)
              << " full allocation context " << Info.FullStackId
              << " with total size " << Info.TotalSize << " is "
              << getAllocTypeString(Node->AllocTypes) << " after cloning\n";

>From 0d2f55afb371011b9e6cb5d4733795fe8ab0c30d Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Wed, 13 Nov 2024 09:22:00 -0800
Subject: [PATCH 4/4] Update format to reduce bitcode size and thin link memory
 overhead.

---
 llvm/include/llvm/Bitcode/LLVMBitCodes.h      | 15 ++--
 llvm/include/llvm/IR/ModuleSummaryIndex.h     | 68 ++++------------
 llvm/lib/Analysis/ModuleSummaryAnalysis.cpp   | 18 ++---
 llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp   |  2 +-
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     | 60 +++++++-------
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     | 78 +++++++++++--------
 .../IPO/MemProfContextDisambiguation.cpp      |  8 +-
 llvm/test/ThinLTO/X86/memprof-basic.ll        |  5 +-
 8 files changed, 120 insertions(+), 134 deletions(-)

diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 1480e8e3745dd2..a0fb32f67e3858 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -308,7 +308,7 @@ enum GlobalValueSummarySymtabCodes {
   FS_PERMODULE_CALLSITE_INFO = 26,
   // Summary of per-module allocation memprof metadata.
   // [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex),
-  // [nummib x (numcontext x contextsizeindex)]?]
+  // [nummib x (numcontexts, numcontexts x total size)]?]
   FS_PERMODULE_ALLOC_INFO = 27,
   // Summary of combined index memprof callsite metadata.
   // [valueid, numstackindices, numver,
@@ -322,10 +322,15 @@ enum GlobalValueSummarySymtabCodes {
   // List of all stack ids referenced by index in the callsite and alloc infos.
   // [n x stack id]
   FS_STACK_IDS = 30,
-  // List of all (full stack id, total size) pairs optionally referenced by
-  // index from the alloc info records.
-  // [n x (full stack id, total size)]
-  FS_CONTEXT_SIZE_INFO = 31,
+  // List of all full stack ids corresponding to the total sizes recorded
+  // at the end of the alloc info when reporting of hinted bytes is enabled.
+  // We use a fixed-width array, which is more efficient as these ids typically
+  // are close to 64 bits in size. The max fixed width value supported is 32
+  // bits so each 64-bit context id hash is recorded as a pair (upper 32 bits
+  // first). This record must immediately precede the associated alloc info, and
+  // the entries must be in the exact same order as the corresponding sizes.
+  // [nummib x (numcontext x full stack id)]
+  FS_ALLOC_CONTEXT_IDS = 31,
 };
 
 enum MetadataCodes {
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index f9b9cef652ad85..62d8e07bd9acd4 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -421,10 +421,8 @@ struct AllocInfo {
   // MIBs vector, if non-empty. Note that each MIB in the summary can have
   // multiple of these as we trim the contexts when possible during matching.
   // For hinted size reporting we, however, want the original pre-trimmed full
-  // stack context id for better correlation with the profile. Note that these
-  // are indexes into the ContextSizeInfos list in the index, to enable
-  // deduplication.
-  std::vector<std::vector<unsigned>> ContextSizeInfoIndices;
+  // stack context id for better correlation with the profile.
+  std::vector<std::vector<ContextTotalSize>> ContextSizeInfos;
 
   AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
     Versions.push_back(0);
@@ -446,19 +444,16 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) {
   for (auto &M : AE.MIBs) {
     OS << "\t\t" << M << "\n";
   }
-  if (!AE.ContextSizeInfoIndices.empty()) {
-    OS << " ContextSizeInfo index per MIB:\n\t\t";
-    First = true;
-    for (auto Indices : AE.ContextSizeInfoIndices) {
-      if (!First)
-        OS << ", ";
-      First = false;
-      bool FirstIndex = true;
-      for (uint64_t Index : Indices) {
-        if (!FirstIndex)
+  if (!AE.ContextSizeInfos.empty()) {
+    OS << "\tContextSizeInfo per MIB:\n";
+    for (auto Infos : AE.ContextSizeInfos) {
+      OS << "\t\t";
+      bool FirstInfo = true;
+      for (auto [FullStackId, TotalSize] : Infos) {
+        if (!FirstInfo)
           OS << ", ";
-        FirstIndex = false;
-        OS << Index;
+        FirstInfo = false;
+        OS << "{ " << FullStackId << ", " << TotalSize << " }";
       }
       OS << "\n";
     }
@@ -1447,19 +1442,6 @@ class ModuleSummaryIndex {
   // built via releaseTemporaryMemory.
   DenseMap<uint64_t, unsigned> StackIdToIndex;
 
-  // List of unique ContextTotalSize structs (pair of the full stack id hash and
-  // its associated total profiled size). We use an index into this vector when
-  // referencing from the alloc summary to reduce the overall memory and size
-  // requirements, since often allocations may be duplicated due to inlining.
-  std::vector<ContextTotalSize> ContextSizeInfos;
-
-  // Temporary map while building the ContextSizeInfos list. Clear when index is
-  // completely built via releaseTemporaryMemory.
-  // Maps from full stack id to a map of total size to the assigned index.
-  // We need size in here too because due to stack truncation in the profile we
-  // can have the same full stack id and different sizes.
-  DenseMap<uint64_t, DenseMap<uint64_t, unsigned>> ContextToTotalSizeAndIndex;
-
   // YAML I/O support.
   friend yaml::MappingTraits<ModuleSummaryIndex>;
 
@@ -1504,9 +1486,6 @@ class ModuleSummaryIndex {
   size_t size() const { return GlobalValueMap.size(); }
 
   const std::vector<uint64_t> &stackIds() const { return StackIds; }
-  const std::vector<ContextTotalSize> &contextSizeInfos() const {
-    return ContextSizeInfos;
-  }
 
   unsigned addOrGetStackIdIndex(uint64_t StackId) {
     auto Inserted = StackIdToIndex.insert({StackId, StackIds.size()});
@@ -1520,36 +1499,15 @@ class ModuleSummaryIndex {
     return StackIds[Index];
   }
 
-  unsigned addOrGetContextSizeIndex(ContextTotalSize ContextSizeInfo) {
-    auto &Entry = ContextToTotalSizeAndIndex[ContextSizeInfo.FullStackId];
-    auto Inserted =
-        Entry.insert({ContextSizeInfo.TotalSize, ContextSizeInfos.size()});
-    if (Inserted.second)
-      ContextSizeInfos.push_back(
-          {ContextSizeInfo.FullStackId, ContextSizeInfo.TotalSize});
-    else
-      assert(Inserted.first->first == ContextSizeInfo.TotalSize);
-    return Inserted.first->second;
-  }
-
-  ContextTotalSize getContextSizeInfoAtIndex(unsigned Index) const {
-    assert(ContextSizeInfos.size() > Index);
-    return ContextSizeInfos[Index];
-  }
-
   // Facility to release memory from data structures only needed during index
-  // construction (including while building combined index). Currently this
+  // construction (including while building combined index). Currently this only
   // releases the temporary map used while constructing a correspondence between
-  // stack ids and their index in the StackIds vector, and a similar map used
-  // while constructing the ContextSizeInfos vector. Mostly impactful when
+  // stack ids and their index in the StackIds vector. Mostly impactful when
   // building a large combined index.
   void releaseTemporaryMemory() {
     assert(StackIdToIndex.size() == StackIds.size());
     StackIdToIndex.clear();
     StackIds.shrink_to_fit();
-    assert(ContextToTotalSizeAndIndex.size() == ContextSizeInfos.size());
-    ContextToTotalSizeAndIndex.clear();
-    ContextSizeInfos.shrink_to_fit();
   }
 
   /// Convenience function for doing a DFS on a ValueInfo. Marks the function in
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 3273de51a79d9f..1593d4d9014634 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -523,7 +523,7 @@ static void computeFunctionSummary(
       if (MemProfMD) {
         std::vector<MIBInfo> MIBs;
         std::vector<uint64_t> TotalSizes;
-        std::vector<std::vector<unsigned>> ContextSizeInfoIndices;
+        std::vector<std::vector<ContextTotalSize>> ContextSizeInfos;
         for (auto &MDOp : MemProfMD->operands()) {
           auto *MIBMD = cast<const MDNode>(MDOp);
           MDNode *StackNode = getMIBStackNode(MIBMD);
@@ -545,7 +545,7 @@ static void computeFunctionSummary(
           // the summary.
           assert(MIBMD->getNumOperands() > 2 || !MemProfReportHintedSizes);
           if (MIBMD->getNumOperands() > 2) {
-            std::vector<unsigned> ContextSizeIndices;
+            std::vector<ContextTotalSize> ContextSizes;
             for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
               MDNode *ContextSizePair = dyn_cast<MDNode>(MIBMD->getOperand(I));
               assert(ContextSizePair->getNumOperands() == 2);
@@ -555,20 +555,18 @@ static void computeFunctionSummary(
               uint64_t TS = mdconst::dyn_extract<ConstantInt>(
                                 ContextSizePair->getOperand(1))
                                 ->getZExtValue();
-              ContextSizeIndices.push_back(
-                  Index.addOrGetContextSizeIndex({FullStackId, TS}));
+              ContextSizes.push_back({FullStackId, TS});
             }
-            ContextSizeInfoIndices.push_back(std::move(ContextSizeIndices));
+            ContextSizeInfos.push_back(std::move(ContextSizes));
           }
           MIBs.push_back(
               MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices)));
         }
         Allocs.push_back(AllocInfo(std::move(MIBs)));
-        assert(!ContextSizeInfoIndices.empty() || !MemProfReportHintedSizes);
-        if (!ContextSizeInfoIndices.empty()) {
-          assert(Allocs.back().MIBs.size() == ContextSizeInfoIndices.size());
-          Allocs.back().ContextSizeInfoIndices =
-              std::move(ContextSizeInfoIndices);
+        assert(!ContextSizeInfos.empty() || !MemProfReportHintedSizes);
+        if (!ContextSizeInfos.empty()) {
+          assert(Allocs.back().MIBs.size() == ContextSizeInfos.size());
+          Allocs.back().ContextSizeInfos = std::move(ContextSizeInfos);
         }
       } else if (!InstCallsite.empty()) {
         SmallVector<unsigned> StackIdIndices;
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index e729099de175a8..8f79ccdb9ff75f 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -328,7 +328,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO)
       STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
       STRINGIFY_CODE(FS, STACK_IDS)
-      STRINGIFY_CODE(FS, CONTEXT_SIZE_INFO)
+      STRINGIFY_CODE(FS, ALLOC_CONTEXT_IDS)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 131a0454a68ac4..e3910cf4408e4f 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -987,11 +987,6 @@ class ModuleSummaryIndexBitcodeReader : public BitcodeReaderBase {
   /// ids from the lists in the callsite and alloc entries to the index.
   std::vector<uint64_t> StackIds;
 
-  // Saves the context total size information from the CONTEXT_SIZE_INFO record
-  // to consult when adding this from the lists in the alloc entries to the
-  // index.
-  std::vector<ContextTotalSize> ContextSizeInfos;
-
 public:
   ModuleSummaryIndexBitcodeReader(
       BitstreamCursor Stream, StringRef Strtab, ModuleSummaryIndex &TheIndex,
@@ -7608,6 +7603,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
 
   std::vector<CallsiteInfo> PendingCallsites;
   std::vector<AllocInfo> PendingAllocs;
+  std::vector<uint64_t> PendingContextIds;
 
   while (true) {
     Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
@@ -8002,14 +7998,6 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       break;
     }
 
-    case bitc::FS_CONTEXT_SIZE_INFO: { // [n x (fullstackid, totalsize)]
-      // Save context size infos in the reader to consult when adding them from
-      // the lists in the alloc node entries.
-      for (auto R = Record.begin(); R != Record.end(); R += 2)
-        ContextSizeInfos.push_back({*R, *(R + 1)});
-      break;
-    }
-
     case bitc::FS_PERMODULE_CALLSITE_INFO: {
       unsigned ValueID = Record[0];
       SmallVector<unsigned> StackIdList;
@@ -8044,6 +8032,16 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       break;
     }
 
+    case bitc::FS_ALLOC_CONTEXT_IDS: {
+      // This is an array of 32-bit fixed-width values, holding each 64-bit
+      // context id as a pair of adjacent (most significant first) 32-bit words.
+      assert(!(Record.size() % 2));
+      PendingContextIds.reserve(Record.size() / 2);
+      for (auto R = Record.begin(); R != Record.end(); R += 2)
+        PendingContextIds.push_back(*R << 32 | *(R + 1));
+      break;
+    }
+
     case bitc::FS_PERMODULE_ALLOC_INFO: {
       unsigned I = 0;
       std::vector<MIBInfo> MIBs;
@@ -8066,30 +8064,40 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
         MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
       }
       // We either have nothing left or at least NumMIBs context size info
-      // indices left.
+      // entries left (for the total sizes included when reporting of hinted
+      // bytes is enabled).
       assert(I == Record.size() || Record.size() - I >= NumMIBs);
-      std::vector<std::vector<unsigned>> AllContextSizeIndices;
+      std::vector<std::vector<ContextTotalSize>> AllContextSizes;
       if (I < Record.size()) {
+        assert(!PendingContextIds.empty() &&
+               "Missing context ids for alloc sizes");
+        unsigned ContextIdIndex = 0;
         MIBsRead = 0;
+        // The sizes are a linearized array of sizes, where for each MIB there
+        // is 1 or more sizes (due to context trimming, each MIB in the metadata
+        // and summarized here can correspond to more than one original context
+        // from the profile).
         while (MIBsRead++ < NumMIBs) {
+          // First read the number of contexts recorded for this MIB.
           unsigned NumContextSizeInfoEntries = Record[I++];
           assert(Record.size() - I >= NumContextSizeInfoEntries);
-          std::vector<unsigned> ContextSizeIndices;
-          ContextSizeIndices.reserve(NumContextSizeInfoEntries);
+          std::vector<ContextTotalSize> ContextSizes;
+          ContextSizes.reserve(NumContextSizeInfoEntries);
           for (unsigned J = 0; J < NumContextSizeInfoEntries; J++) {
-            assert(Record[I] < ContextSizeInfos.size());
-            ContextSizeIndices.push_back(TheIndex.addOrGetContextSizeIndex(
-                ContextSizeInfos[Record[I++]]));
+            assert(ContextIdIndex < PendingContextIds.size());
+            // PendingContextIds read from the preceding FS_ALLOC_CONTEXT_IDS
+            // should be in the same order as the total sizes.
+            ContextSizes.push_back(
+                {PendingContextIds[ContextIdIndex++], Record[I++]});
           }
-          AllContextSizeIndices.push_back(std::move(ContextSizeIndices));
+          AllContextSizes.push_back(std::move(ContextSizes));
         }
+        PendingContextIds.clear();
       }
       PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
-      if (!AllContextSizeIndices.empty()) {
-        assert(PendingAllocs.back().MIBs.size() ==
-               AllContextSizeIndices.size());
-        PendingAllocs.back().ContextSizeInfoIndices =
-            std::move(AllContextSizeIndices);
+      if (!AllContextSizes.empty()) {
+        assert(PendingAllocs.back().MIBs.size() == AllContextSizes.size());
+        PendingAllocs.back().ContextSizeInfos = std::move(AllContextSizes);
       }
       break;
     }
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index fd911a114427cc..a1bc573806235e 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -230,7 +230,8 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase {
   void writePerModuleFunctionSummaryRecord(
       SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
       unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
-      unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F);
+      unsigned CallsiteAbbrev, unsigned AllocAbbrev, unsigned ContextIdAbbvId,
+      const Function &F);
   void writeModuleLevelReferences(const GlobalVariable &V,
                                   SmallVector<uint64_t, 64> &NameVals,
                                   unsigned FSModRefsAbbrev,
@@ -4193,7 +4194,7 @@ static void writeTypeIdCompatibleVtableSummaryRecord(
 
 static void writeFunctionHeapProfileRecords(
     BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev,
-    unsigned AllocAbbrev, bool PerModule,
+    unsigned AllocAbbrev, unsigned ContextIdAbbvId, bool PerModule,
     std::function<unsigned(const ValueInfo &VI)> GetValueID,
     std::function<unsigned(unsigned)> GetStackIndex,
     bool WriteContextSizeInfoIndex) {
@@ -4238,14 +4239,34 @@ static void writeFunctionHeapProfileRecords(
       for (auto V : AI.Versions)
         Record.push_back(V);
     }
-    assert(AI.ContextSizeInfoIndices.empty() ||
-           AI.ContextSizeInfoIndices.size() == AI.MIBs.size());
-    if (WriteContextSizeInfoIndex && !AI.ContextSizeInfoIndices.empty()) {
-      for (auto &Indices : AI.ContextSizeInfoIndices) {
-        Record.push_back(Indices.size());
-        for (auto Id : Indices)
-          Record.push_back(Id);
+    assert(AI.ContextSizeInfos.empty() ||
+           AI.ContextSizeInfos.size() == AI.MIBs.size());
+    // Optionally emit the context size information if it exists.
+    if (WriteContextSizeInfoIndex && !AI.ContextSizeInfos.empty()) {
+      // The abbreviation id for the context ids record should have been created
+      // if we are emitting the per-module index, which is where we write this
+      // info.
+      assert(ContextIdAbbvId);
+      SmallVector<uint32_t> ContextIds;
+      // At least one context id per ContextSizeInfos entry (MIB), broken into 2
+      // halves.
+      ContextIds.reserve(AI.ContextSizeInfos.size() * 2);
+      for (auto &Infos : AI.ContextSizeInfos) {
+        Record.push_back(Infos.size());
+        for (auto [FullStackId, TotalSize] : Infos) {
+          // The context ids are emitted separately as a fixed width array,
+          // which is more efficient than a VBR given that these hashes are
+          // typically close to 64-bits. The max fixed width entry is 32 bits so
+          // it is split into 2.
+          ContextIds.push_back(static_cast<uint32_t>(FullStackId >> 32));
+          ContextIds.push_back(static_cast<uint32_t>(FullStackId));
+          Record.push_back(TotalSize);
+        }
       }
+      // The context ids are expected by the reader to immediately precede the
+      // associated alloc info record.
+      Stream.EmitRecord(bitc::FS_ALLOC_CONTEXT_IDS, ContextIds,
+                        ContextIdAbbvId);
     }
     Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
                                 : bitc::FS_COMBINED_ALLOC_INFO,
@@ -4258,7 +4279,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
     SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
     unsigned ValueID, unsigned FSCallsRelBFAbbrev,
     unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev,
-    unsigned AllocAbbrev, const Function &F) {
+    unsigned AllocAbbrev, unsigned ContextIdAbbvId, const Function &F) {
   NameVals.push_back(ValueID);
 
   FunctionSummary *FS = cast<FunctionSummary>(Summary);
@@ -4269,7 +4290,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
       });
 
   writeFunctionHeapProfileRecords(
-      Stream, FS, CallsiteAbbrev, AllocAbbrev,
+      Stream, FS, CallsiteAbbrev, AllocAbbrev, ContextIdAbbvId,
       /*PerModule*/ true,
       /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); },
       /*GetStackIndex*/ [&](unsigned I) { return I; },
@@ -4405,28 +4426,22 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS));
     // numids x stackid
     StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    // FIXME: The stack ids are hashes that are close to 64 bits in size, so
+    // emitting as a pair of 32-bit fixed-width values, as we do for context
+    // ids, would be more efficient.
     StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
     unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv));
     Stream.EmitRecord(bitc::FS_STACK_IDS, Index->stackIds(), StackIdAbbvId);
   }
 
-  SmallVector<uint64_t, 64> NameVals;
-  if (!Index->contextSizeInfos().empty()) {
-    auto ContextSizeInfoAbbv = std::make_shared<BitCodeAbbrev>();
-    ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(bitc::FS_CONTEXT_SIZE_INFO));
-    // numids x (fullStackid, totalsize)
-    ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
-    ContextSizeInfoAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
-    unsigned ContextSizeInfoAbbvId =
-        Stream.EmitAbbrev(std::move(ContextSizeInfoAbbv));
-    for (const auto &[FullStackId, TotalSize] : Index->contextSizeInfos()) {
-      NameVals.push_back(FullStackId);
-      NameVals.push_back(TotalSize);
-    }
-    Stream.EmitRecord(bitc::FS_CONTEXT_SIZE_INFO, NameVals,
-                      ContextSizeInfoAbbvId);
-    NameVals.clear();
-  }
+  // n x context id
+  auto ContextIdAbbv = std::make_shared<BitCodeAbbrev>();
+  ContextIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_ALLOC_CONTEXT_IDS));
+  ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  // The context ids are hashes that are close to 64 bits in size, so emitting
+  // as a pair of 32-bit fixed-width values is more efficient than a VBR.
+  ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+  unsigned ContextIdAbbvId = Stream.EmitAbbrev(std::move(ContextIdAbbv));
 
   // Abbrev for FS_PERMODULE_PROFILE.
   Abbv = std::make_shared<BitCodeAbbrev>();
@@ -4508,11 +4523,12 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
   // n x (alloc type, numstackids, numstackids x stackidindex)
-  // optional: nummib x total size
+  // optional: nummib x (numcontexts, numcontexts x total size)
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
+  SmallVector<uint64_t, 64> NameVals;
   // Iterate over the list of functions instead of the Index to
   // ensure the ordering is stable.
   for (const Function &F : M) {
@@ -4531,7 +4547,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     auto *Summary = VI.getSummaryList()[0].get();
     writePerModuleFunctionSummaryRecord(
         NameVals, Summary, VE.getValueID(&F), FSCallsRelBFAbbrev,
-        FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, F);
+        FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, ContextIdAbbvId, F);
   }
 
   // Capture references from GlobalVariable initializers, which are outside
@@ -4760,7 +4776,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     getReferencedTypeIds(FS, ReferencedTypeIds);
 
     writeFunctionHeapProfileRecords(
-        Stream, FS, CallsiteAbbrev, AllocAbbrev,
+        Stream, FS, CallsiteAbbrev, AllocAbbrev, /*ContextIdAbbvId*/ 0,
         /*PerModule*/ false,
         /*GetValueId*/
         [&](const ValueInfo &VI) -> unsigned {
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 677ad78ab52082..99b23c250c5386 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -2056,17 +2056,15 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
               EmptyContext;
           unsigned I = 0;
           assert(!MemProfReportHintedSizes ||
-                 AN.ContextSizeInfoIndices.size() == AN.MIBs.size());
+                 AN.ContextSizeInfos.size() == AN.MIBs.size());
           // Now add all of the MIBs and their stack nodes.
           for (auto &MIB : AN.MIBs) {
             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                 StackContext(&MIB);
             std::vector<ContextTotalSize> ContextSizeInfo;
             if (MemProfReportHintedSizes) {
-              for (auto Id : AN.ContextSizeInfoIndices[I]) {
-                auto Info = Index.getContextSizeInfoAtIndex(Id);
-                ContextSizeInfo.push_back({Info.FullStackId, Info.TotalSize});
-              }
+              for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
+                ContextSizeInfo.push_back({FullStackId, TotalSize});
             }
             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                 AllocNode, StackContext, EmptyContext, MIB.AllocType,
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index 96d5459c78793e..6de301340acf35 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -45,7 +45,7 @@
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:	-memprof-report-hinted-sizes \
 ; RUN:	-stats -pass-remarks=memprof-context-disambiguation -save-temps \
-; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=DUMP-SIZES \
 ; RUN:	--check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS \
 ; RUN:  --check-prefix=SIZES
 
@@ -146,6 +146,9 @@ attributes #0 = { noinline optnone }
 ; DUMP: 	Versions: 1 MIB:
 ; DUMP: 		AllocType 1 StackIds: 2, 3, 0
 ; DUMP: 		AllocType 2 StackIds: 2, 3, 1
+; DUMP-SIZES:	ContextSizeInfo per MIB:
+; DUMP-SIZES:		{ 123, 100 }
+; DUMP-SIZES:		{ 456, 200 }, { 789, 300 }
 ; DUMP: 	(clone 0)
 ; DUMP: 	AllocTypes: NotColdCold
 ; DUMP: 	ContextIds: 1 2



More information about the llvm-commits mailing list