[llvm] 9513f2f - [MemProf] Print full context hash when reporting hinted bytes (#114465)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 15 08:24:49 PST 2024


Author: Teresa Johnson
Date: 2024-11-15T08:24:44-08:00
New Revision: 9513f2fdf2ad50f55726154a6b6a4aa463bc457f

URL: https://github.com/llvm/llvm-project/commit/9513f2fdf2ad50f55726154a6b6a4aa463bc457f
DIFF: https://github.com/llvm/llvm-project/commit/9513f2fdf2ad50f55726154a6b6a4aa463bc457f.diff

LOG: [MemProf] Print full context hash when reporting hinted bytes (#114465)

Improve the information printed when -memprof-report-hinted-sizes is
enabled. Now print the full context hash computed from the original
profile, similar to what we do when reporting matching statistics. This
will make it easier to correlate with the profile.

Note that the full context hash must be computed at profile match time
and saved in the metadata and summary, because we may trim the context
during matching when it isn't needed for distinguishing hotness.
Similarly, due to the context trimming, we may have more than one full
context id and total size pair per MIB, so the metadata and summary now
carry a list of these pairs.

Remove the old aggregate size from the metadata and summary support.
One other change from the prior support is that we no longer write the
size information into the combined index for the LTO backends, which do
not use it; this reduces unnecessary bloat in distributed index files.
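
To illustrate the new shape at the API level, here is a minimal sketch (not
part of this patch; the stack ids and context hashes are hypothetical values
matching the updated basic.ll test) of how a caller of the revised
CallStackTrie::addCallStack records full stack id / size pairs, and how
context trimming leaves multiple pairs on a single MIB:

  #include "llvm/Analysis/MemoryProfileInfo.h"

  using namespace llvm;
  using namespace llvm::memprof;

  // Populate an existing trie with one not-cold and two cold contexts; the
  // two cold contexts share the (hypothetical) stack id prefix 1, 2, 4.
  void addExampleContexts(CallStackTrie &Trie) {
    Trie.addCallStack(AllocationType::NotCold, {1, 2, 3},
                      {{/*FullStackId=*/123, /*TotalSize=*/100}});
    Trie.addCallStack(AllocationType::Cold, {1, 2, 4, 5},
                      {{/*FullStackId=*/456, /*TotalSize=*/200}});
    Trie.addCallStack(AllocationType::Cold, {1, 2, 4, 6},
                      {{/*FullStackId=*/789, /*TotalSize=*/300}});
    // buildAndAttachMIBMetadata then trims the cold contexts below their
    // shared prefix, so the single "cold" MIB carries both {456, 200} and
    // {789, 300} context size pairs.
  }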

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/MemoryProfileInfo.h
    llvm/include/llvm/Bitcode/LLVMBitCodes.h
    llvm/include/llvm/IR/ModuleSummaryIndex.h
    llvm/lib/Analysis/MemoryProfileInfo.cpp
    llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
    llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
    llvm/lib/Bitcode/Reader/BitcodeReader.cpp
    llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
    llvm/lib/IR/Verifier.cpp
    llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
    llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
    llvm/test/ThinLTO/X86/memprof-aliased-location1.ll
    llvm/test/ThinLTO/X86/memprof-aliased-location2.ll
    llvm/test/ThinLTO/X86/memprof-basic.ll
    llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll
    llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll
    llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
    llvm/test/Transforms/PGOProfile/memprof.ll
    llvm/test/Verifier/memprof-metadata-bad.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
index edbce706953d18..b46124a4ed0d56 100644
--- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h
+++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h
@@ -28,16 +28,17 @@ AllocationType getAllocType(uint64_t TotalLifetimeAccessDensity,
 /// the resulting metadata node.
 MDNode *buildCallstackMetadata(ArrayRef<uint64_t> CallStack, LLVMContext &Ctx);
 
+/// Build metadata from the provided list of full stack id and profiled size
+/// pairs, to use when reporting of hinted sizes is enabled.
+MDNode *buildContextSizeMetadata(ArrayRef<ContextTotalSize> ContextSizeInfo,
+                                 LLVMContext &Ctx);
+
 /// Returns the stack node from an MIB metadata node.
 MDNode *getMIBStackNode(const MDNode *MIB);
 
 /// Returns the allocation type from an MIB metadata node.
 AllocationType getMIBAllocType(const MDNode *MIB);
 
-/// Returns the total size from an MIB metadata node, or 0 if it was not
-/// recorded.
-uint64_t getMIBTotalSize(const MDNode *MIB);
-
 /// Returns the string to use in attributes with the given type.
 std::string getAllocTypeAttributeString(AllocationType Type);
 
@@ -55,11 +56,15 @@ class CallStackTrie {
     // Allocation types for call context sharing the context prefix at this
     // node.
     uint8_t AllocTypes;
-    uint64_t TotalSize;
+    // If the user has requested reporting of hinted sizes, keep track of the
+    // associated full stack id and profiled sizes. Can have more than one
+    // after trimming (e.g. when building from metadata). This is only placed on
+    // the last (root-most) trie node for each allocation context.
+    std::vector<ContextTotalSize> ContextSizeInfo;
     // Map of caller stack id to the corresponding child Trie node.
     std::map<uint64_t, CallStackTrieNode *> Callers;
-    CallStackTrieNode(AllocationType Type, uint64_t TotalSize)
-        : AllocTypes(static_cast<uint8_t>(Type)), TotalSize(TotalSize) {}
+    CallStackTrieNode(AllocationType Type)
+        : AllocTypes(static_cast<uint8_t>(Type)) {}
   };
 
   // The node for the allocation at the root.
@@ -75,6 +80,11 @@ class CallStackTrie {
     delete Node;
   }
 
+  // Recursively build up a complete list of context size information from the
+  // trie nodes reached from the given Node, for hinted size reporting.
+  void collectContextSizeInfo(CallStackTrieNode *Node,
+                              std::vector<ContextTotalSize> &ContextSizeInfo);
+
   // Recursive helper to trim contexts and create metadata nodes.
   bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
                      std::vector<uint64_t> &MIBCallStack,
@@ -93,7 +103,7 @@ class CallStackTrie {
   /// allocation call down to the bottom of the call stack (i.e. callee to
   /// caller order).
   void addCallStack(AllocationType AllocType, ArrayRef<uint64_t> StackIds,
-                    uint64_t TotalSize = 0);
+                    std::vector<ContextTotalSize> ContextSizeInfo = {});
 
   /// Add the call stack context along with its allocation type from the MIB
   /// metadata to the Trie.

diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 41a6447356c23b..a0fb32f67e3858 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -308,7 +308,7 @@ enum GlobalValueSummarySymtabCodes {
   FS_PERMODULE_CALLSITE_INFO = 26,
   // Summary of per-module allocation memprof metadata.
   // [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex),
-  // [nummib x total size]?]
+  // [nummib x (numcontext x total size)]?]
   FS_PERMODULE_ALLOC_INFO = 27,
   // Summary of combined index memprof callsite metadata.
   // [valueid, numstackindices, numver,
@@ -317,9 +317,20 @@ enum GlobalValueSummarySymtabCodes {
   // Summary of combined index allocation memprof metadata.
   // [nummib, numver,
   //  nummib x (alloc type, numstackids, numstackids x stackidindex),
-  //  numver x version, [nummib x total size]?]
+  //  numver x version]
   FS_COMBINED_ALLOC_INFO = 29,
+  // List of all stack ids referenced by index in the callsite and alloc infos.
+  // [n x stack id]
   FS_STACK_IDS = 30,
+  // List of all full stack id pairs corresponding to the total sizes recorded
+  // at the end of the alloc info when reporting of hinted bytes is enabled.
+  // We use a fixed-width array, which is more efficient as these ids typically
+  // are close to 64 bits in size. The max fixed width value supported is 32
+  // bits so each 64-bit context id hash is recorded as a pair (upper 32 bits
+  // first). This record must immediately precede the associated alloc info, and
+  // the entries must be in the exact same order as the corresponding sizes.
+  // [nummib x (numcontext x full stack id)]
+  FS_ALLOC_CONTEXT_IDS = 31,
 };
 
 enum MetadataCodes {

diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 2ff5b4a42643d8..50def0eaf78867 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -302,6 +302,14 @@ template <> struct DenseMapInfo<ValueInfo> {
   static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.getRef(); }
 };
 
+// For optional hinted size reporting, holds a pair of the full stack id
+// (pre-trimming, from the full context in the profile), and the associated
+// total profiled size.
+struct ContextTotalSize {
+  uint64_t FullStackId;
+  uint64_t TotalSize;
+};
+
 /// Summary of memprof callsite metadata.
 struct CallsiteInfo {
   // Actual callee function.
@@ -408,9 +416,13 @@ struct AllocInfo {
   // Vector of MIBs in this memprof metadata.
   std::vector<MIBInfo> MIBs;
 
-  // If requested, keep track of total profiled sizes for each MIB. This will be
-  // a vector of the same length and order as the MIBs vector, if non-empty.
-  std::vector<uint64_t> TotalSizes;
+  // If requested, keep track of full stack contexts and total profiled sizes
+  // for each MIB. This will be a vector of the same length and order as the
+  // MIBs vector, if non-empty. Note that each MIB in the summary can have
+  // multiple of these as we trim the contexts when possible during matching.
+  // However, for hinted size reporting we want the original pre-trimmed full
+  // stack context id for better correlation with the profile.
+  std::vector<std::vector<ContextTotalSize>> ContextSizeInfos;
 
   AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
     Versions.push_back(0);
@@ -432,14 +444,18 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) {
   for (auto &M : AE.MIBs) {
     OS << "\t\t" << M << "\n";
   }
-  if (!AE.TotalSizes.empty()) {
-    OS << " TotalSizes per MIB:\n\t\t";
-    First = true;
-    for (uint64_t TS : AE.TotalSizes) {
-      if (!First)
-        OS << ", ";
-      First = false;
-      OS << TS << "\n";
+  if (!AE.ContextSizeInfos.empty()) {
+    OS << "\tContextSizeInfo per MIB:\n";
+    for (auto Infos : AE.ContextSizeInfos) {
+      OS << "\t\t";
+      bool FirstInfo = true;
+      for (auto [FullStackId, TotalSize] : Infos) {
+        if (!FirstInfo)
+          OS << ", ";
+        FirstInfo = false;
+        OS << "{ " << FullStackId << ", " << TotalSize << " }";
+      }
+      OS << "\n";
     }
   }
   return OS;

diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
index d3d125893377a8..0a6623c67d6bb3 100644
--- a/llvm/lib/Analysis/MemoryProfileInfo.cpp
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -100,12 +100,6 @@ AllocationType llvm::memprof::getMIBAllocType(const MDNode *MIB) {
   return AllocationType::NotCold;
 }
 
-uint64_t llvm::memprof::getMIBTotalSize(const MDNode *MIB) {
-  if (MIB->getNumOperands() < 3)
-    return 0;
-  return mdconst::dyn_extract<ConstantInt>(MIB->getOperand(2))->getZExtValue();
-}
-
 std::string llvm::memprof::getAllocTypeAttributeString(AllocationType Type) {
   switch (Type) {
   case AllocationType::NotCold:
@@ -136,22 +130,21 @@ bool llvm::memprof::hasSingleAllocType(uint8_t AllocTypes) {
   return NumAllocTypes == 1;
 }
 
-void CallStackTrie::addCallStack(AllocationType AllocType,
-                                 ArrayRef<uint64_t> StackIds,
-                                 uint64_t TotalSize) {
+void CallStackTrie::addCallStack(
+    AllocationType AllocType, ArrayRef<uint64_t> StackIds,
+    std::vector<ContextTotalSize> ContextSizeInfo) {
   bool First = true;
   CallStackTrieNode *Curr = nullptr;
   for (auto StackId : StackIds) {
-    // If this is the first stack frame, add or update alloc node.
+    //  If this is the first stack frame, add or update alloc node.
     if (First) {
       First = false;
       if (Alloc) {
         assert(AllocStackId == StackId);
         Alloc->AllocTypes |= static_cast<uint8_t>(AllocType);
-        Alloc->TotalSize += TotalSize;
       } else {
         AllocStackId = StackId;
-        Alloc = new CallStackTrieNode(AllocType, TotalSize);
+        Alloc = new CallStackTrieNode(AllocType);
       }
       Curr = Alloc;
       continue;
@@ -161,15 +154,18 @@ void CallStackTrie::addCallStack(AllocationType AllocType,
     if (Next != Curr->Callers.end()) {
       Curr = Next->second;
       Curr->AllocTypes |= static_cast<uint8_t>(AllocType);
-      Curr->TotalSize += TotalSize;
       continue;
     }
     // Otherwise add a new caller node.
-    auto *New = new CallStackTrieNode(AllocType, TotalSize);
+    auto *New = new CallStackTrieNode(AllocType);
     Curr->Callers[StackId] = New;
     Curr = New;
   }
   assert(Curr);
+  Curr->ContextSizeInfo.insert(Curr->ContextSizeInfo.end(),
+                               ContextSizeInfo.begin(), ContextSizeInfo.end());
+  std::vector<ContextTotalSize> AllContextSizeInfo;
+  collectContextSizeInfo(Curr, AllContextSizeInfo);
 }
 
 void CallStackTrie::addCallStack(MDNode *MIB) {
@@ -182,21 +178,52 @@ void CallStackTrie::addCallStack(MDNode *MIB) {
     assert(StackId);
     CallStack.push_back(StackId->getZExtValue());
   }
-  addCallStack(getMIBAllocType(MIB), CallStack, getMIBTotalSize(MIB));
+  std::vector<ContextTotalSize> ContextSizeInfo;
+  // Collect the context size information if it exists.
+  if (MIB->getNumOperands() > 2) {
+    for (unsigned I = 2; I < MIB->getNumOperands(); I++) {
+      MDNode *ContextSizePair = dyn_cast<MDNode>(MIB->getOperand(I));
+      assert(ContextSizePair->getNumOperands() == 2);
+      uint64_t FullStackId =
+          mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(0))
+              ->getZExtValue();
+      uint64_t TotalSize =
+          mdconst::dyn_extract<ConstantInt>(ContextSizePair->getOperand(1))
+              ->getZExtValue();
+      ContextSizeInfo.push_back({FullStackId, TotalSize});
+    }
+  }
+  addCallStack(getMIBAllocType(MIB), CallStack, std::move(ContextSizeInfo));
 }
 
 static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
-                             AllocationType AllocType, uint64_t TotalSize) {
+                             AllocationType AllocType,
+                             ArrayRef<ContextTotalSize> ContextSizeInfo) {
   SmallVector<Metadata *> MIBPayload(
       {buildCallstackMetadata(MIBCallStack, Ctx)});
   MIBPayload.push_back(
       MDString::get(Ctx, getAllocTypeAttributeString(AllocType)));
-  if (TotalSize)
-    MIBPayload.push_back(ValueAsMetadata::get(
-        ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize)));
+  if (!ContextSizeInfo.empty()) {
+    for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
+      auto *FullStackIdMD = ValueAsMetadata::get(
+          ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId));
+      auto *TotalSizeMD = ValueAsMetadata::get(
+          ConstantInt::get(Type::getInt64Ty(Ctx), TotalSize));
+      auto *ContextSizeMD = MDNode::get(Ctx, {FullStackIdMD, TotalSizeMD});
+      MIBPayload.push_back(ContextSizeMD);
+    }
+  }
   return MDNode::get(Ctx, MIBPayload);
 }
 
+void CallStackTrie::collectContextSizeInfo(
+    CallStackTrieNode *Node, std::vector<ContextTotalSize> &ContextSizeInfo) {
+  ContextSizeInfo.insert(ContextSizeInfo.end(), Node->ContextSizeInfo.begin(),
+                         Node->ContextSizeInfo.end());
+  for (auto &Caller : Node->Callers)
+    collectContextSizeInfo(Caller.second, ContextSizeInfo);
+}
+
 // Recursive helper to trim contexts and create metadata nodes.
 // Caller should have pushed Node's loc to MIBCallStack. Doing this in the
 // caller makes it simpler to handle the many early returns in this method.
@@ -207,8 +234,10 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
   // Trim context below the first node in a prefix with a single alloc type.
   // Add an MIB record for the current call stack prefix.
   if (hasSingleAllocType(Node->AllocTypes)) {
+    std::vector<ContextTotalSize> ContextSizeInfo;
+    collectContextSizeInfo(Node, ContextSizeInfo);
     MIBNodes.push_back(createMIBNode(
-        Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, Node->TotalSize));
+        Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo));
     return true;
   }
 
@@ -244,8 +273,10 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
   // non-cold allocation type.
   if (!CalleeHasAmbiguousCallerContext)
     return false;
+  std::vector<ContextTotalSize> ContextSizeInfo;
+  collectContextSizeInfo(Node, ContextSizeInfo);
   MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold,
-                                   Node->TotalSize));
+                                   ContextSizeInfo));
   return true;
 }
 
@@ -257,11 +288,15 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
   if (hasSingleAllocType(Alloc->AllocTypes)) {
     addAllocTypeAttribute(Ctx, CI, (AllocationType)Alloc->AllocTypes);
     if (MemProfReportHintedSizes) {
-      assert(Alloc->TotalSize);
-      errs() << "Total size for allocation with location hash " << AllocStackId
-             << " and single alloc type "
-             << getAllocTypeAttributeString((AllocationType)Alloc->AllocTypes)
-             << ": " << Alloc->TotalSize << "\n";
+      std::vector<ContextTotalSize> ContextSizeInfo;
+      collectContextSizeInfo(Alloc, ContextSizeInfo);
+      for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
+        errs()
+            << "MemProf hinting: Total size for full allocation context hash "
+            << FullStackId << " and single alloc type "
+            << getAllocTypeAttributeString((AllocationType)Alloc->AllocTypes)
+            << ": " << TotalSize << "\n";
+      }
     }
     return false;
   }

diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 04670f2e2d6351..611d4bfbc69e8f 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -522,6 +522,7 @@ static void computeFunctionSummary(
       if (MemProfMD) {
         std::vector<MIBInfo> MIBs;
         std::vector<uint64_t> TotalSizes;
+        std::vector<std::vector<ContextTotalSize>> ContextSizeInfos;
         for (auto &MDOp : MemProfMD->operands()) {
           auto *MIBMD = cast<const MDNode>(MDOp);
           MDNode *StackNode = getMIBStackNode(MIBMD);
@@ -539,18 +540,32 @@ static void computeFunctionSummary(
             if (StackIdIndices.empty() || StackIdIndices.back() != StackIdIdx)
               StackIdIndices.push_back(StackIdIdx);
           }
+          // If we have context size information, collect it for inclusion in
+          // the summary.
+          assert(MIBMD->getNumOperands() > 2 || !MemProfReportHintedSizes);
+          if (MIBMD->getNumOperands() > 2) {
+            std::vector<ContextTotalSize> ContextSizes;
+            for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
+              MDNode *ContextSizePair = dyn_cast<MDNode>(MIBMD->getOperand(I));
+              assert(ContextSizePair->getNumOperands() == 2);
+              uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
+                                         ContextSizePair->getOperand(0))
+                                         ->getZExtValue();
+              uint64_t TS = mdconst::dyn_extract<ConstantInt>(
+                                ContextSizePair->getOperand(1))
+                                ->getZExtValue();
+              ContextSizes.push_back({FullStackId, TS});
+            }
+            ContextSizeInfos.push_back(std::move(ContextSizes));
+          }
           MIBs.push_back(
               MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices)));
-          if (MemProfReportHintedSizes) {
-            auto TotalSize = getMIBTotalSize(MIBMD);
-            assert(TotalSize);
-            TotalSizes.push_back(TotalSize);
-          }
         }
         Allocs.push_back(AllocInfo(std::move(MIBs)));
-        if (MemProfReportHintedSizes) {
-          assert(Allocs.back().MIBs.size() == TotalSizes.size());
-          Allocs.back().TotalSizes = std::move(TotalSizes);
+        assert(!ContextSizeInfos.empty() || !MemProfReportHintedSizes);
+        if (!ContextSizeInfos.empty()) {
+          assert(Allocs.back().MIBs.size() == ContextSizeInfos.size());
+          Allocs.back().ContextSizeInfos = std::move(ContextSizeInfos);
         }
       } else if (!InstCallsite.empty()) {
         SmallVector<unsigned> StackIdIndices;

diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index b7ed9cdf631454..8f79ccdb9ff75f 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -328,6 +328,7 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO)
       STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
       STRINGIFY_CODE(FS, STACK_IDS)
+      STRINGIFY_CODE(FS, ALLOC_CONTEXT_IDS)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {

diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 91b1917a3c0c99..9ca76b54a88d9d 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7604,6 +7604,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
 
   std::vector<CallsiteInfo> PendingCallsites;
   std::vector<AllocInfo> PendingAllocs;
+  std::vector<uint64_t> PendingContextIds;
 
   while (true) {
     Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
@@ -8034,6 +8035,16 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       break;
     }
 
+    case bitc::FS_ALLOC_CONTEXT_IDS: {
+      // This is an array of 32-bit fixed-width values, holding each 64-bit
+      // context id as a pair of adjacent (most significant first) 32-bit words.
+      assert(Record.size() % 2 == 0);
+      PendingContextIds.reserve(Record.size() / 2);
+      for (auto R = Record.begin(); R != Record.end(); R += 2)
+        PendingContextIds.push_back(*R << 32 | *(R + 1));
+      break;
+    }
+
     case bitc::FS_PERMODULE_ALLOC_INFO: {
       unsigned I = 0;
       std::vector<MIBInfo> MIBs;
@@ -8055,18 +8066,41 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
         }
         MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
       }
-      std::vector<uint64_t> TotalSizes;
-      // We either have no sizes or NumMIBs of them.
-      assert(I == Record.size() || Record.size() - I == NumMIBs);
+      // We either have nothing left or at least NumMIBs context size info
+      // entries left (for the total sizes included when reporting of hinted
+      // bytes is enabled).
+      assert(I == Record.size() || Record.size() - I >= NumMIBs);
+      std::vector<std::vector<ContextTotalSize>> AllContextSizes;
       if (I < Record.size()) {
+        assert(!PendingContextIds.empty() &&
+               "Missing context ids for alloc sizes");
+        unsigned ContextIdIndex = 0;
         MIBsRead = 0;
-        while (MIBsRead++ < NumMIBs)
-          TotalSizes.push_back(Record[I++]);
+        // The sizes are a linearized array, with one or more sizes recorded
+        // for each MIB (due to context trimming, each MIB in the metadata
+        // and in the summary can correspond to more than one original
+        // context from the profile).
+        while (MIBsRead++ < NumMIBs) {
+          // First read the number of contexts recorded for this MIB.
+          unsigned NumContextSizeInfoEntries = Record[I++];
+          assert(Record.size() - I >= NumContextSizeInfoEntries);
+          std::vector<ContextTotalSize> ContextSizes;
+          ContextSizes.reserve(NumContextSizeInfoEntries);
+          for (unsigned J = 0; J < NumContextSizeInfoEntries; J++) {
+            assert(ContextIdIndex < PendingContextIds.size());
+            // PendingContextIds read from the preceding FS_ALLOC_CONTEXT_IDS
+            // should be in the same order as the total sizes.
+            ContextSizes.push_back(
+                {PendingContextIds[ContextIdIndex++], Record[I++]});
+          }
+          AllContextSizes.push_back(std::move(ContextSizes));
+        }
+        PendingContextIds.clear();
       }
       PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
-      if (!TotalSizes.empty()) {
-        assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
-        PendingAllocs.back().TotalSizes = std::move(TotalSizes);
+      if (!AllContextSizes.empty()) {
+        assert(PendingAllocs.back().MIBs.size() == AllContextSizes.size());
+        PendingAllocs.back().ContextSizeInfos = std::move(AllContextSizes);
       }
       break;
     }
@@ -8094,21 +8128,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       SmallVector<uint8_t> Versions;
       for (unsigned J = 0; J < NumVersions; J++)
         Versions.push_back(Record[I++]);
-      std::vector<uint64_t> TotalSizes;
-      // We either have no sizes or NumMIBs of them.
-      assert(I == Record.size() || Record.size() - I == NumMIBs);
-      if (I < Record.size()) {
-        MIBsRead = 0;
-        while (MIBsRead++ < NumMIBs) {
-          TotalSizes.push_back(Record[I++]);
-        }
-      }
-      PendingAllocs.push_back(
-          AllocInfo(std::move(Versions), std::move(MIBs)));
-      if (!TotalSizes.empty()) {
-        assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
-        PendingAllocs.back().TotalSizes = std::move(TotalSizes);
-      }
+      assert(I == Record.size());
+      PendingAllocs.push_back(AllocInfo(std::move(Versions), std::move(MIBs)));
       break;
     }
     }

diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index cd6541c2581994..5829af39cf5e2c 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -230,7 +230,8 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase {
   void writePerModuleFunctionSummaryRecord(
       SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
       unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
-      unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F);
+      unsigned CallsiteAbbrev, unsigned AllocAbbrev, unsigned ContextIdAbbvId,
+      const Function &F);
   void writeModuleLevelReferences(const GlobalVariable &V,
                                   SmallVector<uint64_t, 64> &NameVals,
                                   unsigned FSModRefsAbbrev,
@@ -4196,9 +4197,10 @@ static void writeTypeIdCompatibleVtableSummaryRecord(
 
 static void writeFunctionHeapProfileRecords(
     BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev,
-    unsigned AllocAbbrev, bool PerModule,
+    unsigned AllocAbbrev, unsigned ContextIdAbbvId, bool PerModule,
     std::function<unsigned(const ValueInfo &VI)> GetValueID,
-    std::function<unsigned(unsigned)> GetStackIndex) {
+    std::function<unsigned(unsigned)> GetStackIndex,
+    bool WriteContextSizeInfoIndex) {
   SmallVector<uint64_t> Record;
 
   for (auto &CI : FS->callsites()) {
@@ -4240,10 +4242,34 @@ static void writeFunctionHeapProfileRecords(
       for (auto V : AI.Versions)
         Record.push_back(V);
     }
-    assert(AI.TotalSizes.empty() || AI.TotalSizes.size() == AI.MIBs.size());
-    if (!AI.TotalSizes.empty()) {
-      for (auto Size : AI.TotalSizes)
-        Record.push_back(Size);
+    assert(AI.ContextSizeInfos.empty() ||
+           AI.ContextSizeInfos.size() == AI.MIBs.size());
+    // Optionally emit the context size information if it exists.
+    if (WriteContextSizeInfoIndex && !AI.ContextSizeInfos.empty()) {
+      // The abbreviation id for the context ids record should have been created
+      // if we are emitting the per-module index, which is where we write this
+      // info.
+      assert(ContextIdAbbvId);
+      SmallVector<uint32_t> ContextIds;
+      // At least one context id per ContextSizeInfos entry (MIB), broken into 2
+      // halves.
+      ContextIds.reserve(AI.ContextSizeInfos.size() * 2);
+      for (auto &Infos : AI.ContextSizeInfos) {
+        Record.push_back(Infos.size());
+        for (auto [FullStackId, TotalSize] : Infos) {
+          // The context ids are emitted separately as a fixed width array,
+          // which is more efficient than a VBR given that these hashes are
+          // typically close to 64-bits. The max fixed width entry is 32 bits so
+          // it is split into 2.
+          ContextIds.push_back(static_cast<uint32_t>(FullStackId >> 32));
+          ContextIds.push_back(static_cast<uint32_t>(FullStackId));
+          Record.push_back(TotalSize);
+        }
+      }
+      // The context ids are expected by the reader to immediately precede the
+      // associated alloc info record.
+      Stream.EmitRecord(bitc::FS_ALLOC_CONTEXT_IDS, ContextIds,
+                        ContextIdAbbvId);
     }
     Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
                                 : bitc::FS_COMBINED_ALLOC_INFO,
@@ -4256,7 +4282,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
     SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
     unsigned ValueID, unsigned FSCallsRelBFAbbrev,
     unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev,
-    unsigned AllocAbbrev, const Function &F) {
+    unsigned AllocAbbrev, unsigned ContextIdAbbvId, const Function &F) {
   NameVals.push_back(ValueID);
 
   FunctionSummary *FS = cast<FunctionSummary>(Summary);
@@ -4267,10 +4293,11 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
       });
 
   writeFunctionHeapProfileRecords(
-      Stream, FS, CallsiteAbbrev, AllocAbbrev,
+      Stream, FS, CallsiteAbbrev, AllocAbbrev, ContextIdAbbvId,
       /*PerModule*/ true,
       /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); },
-      /*GetStackIndex*/ [&](unsigned I) { return I; });
+      /*GetStackIndex*/ [&](unsigned I) { return I; },
+      /*WriteContextSizeInfoIndex*/ true);
 
   auto SpecialRefCnts = FS->specialRefCounts();
   NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
@@ -4402,11 +4429,23 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     StackIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_STACK_IDS));
     // numids x stackid
     StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+    // FIXME: The stack ids are hashes that are close to 64 bits in size, so
+    // emitting as a pair of 32-bit fixed-width values, as we do for context
+    // ids, would be more efficient.
     StackIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
     unsigned StackIdAbbvId = Stream.EmitAbbrev(std::move(StackIdAbbv));
     Stream.EmitRecord(bitc::FS_STACK_IDS, Index->stackIds(), StackIdAbbvId);
   }
 
+  // n x context id
+  auto ContextIdAbbv = std::make_shared<BitCodeAbbrev>();
+  ContextIdAbbv->Add(BitCodeAbbrevOp(bitc::FS_ALLOC_CONTEXT_IDS));
+  ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  // The context ids are hashes that are close to 64 bits in size, so emitting
+  // as a pair of 32-bit fixed-width values is more efficient than a VBR.
+  ContextIdAbbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32));
+  unsigned ContextIdAbbvId = Stream.EmitAbbrev(std::move(ContextIdAbbv));
+
   // Abbrev for FS_PERMODULE_PROFILE.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
@@ -4487,7 +4526,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
   // n x (alloc type, numstackids, numstackids x stackidindex)
-  // optional: nummib x total size
+  // optional: nummib x (numcontext x total size)
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
@@ -4511,7 +4550,7 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     auto *Summary = VI.getSummaryList()[0].get();
     writePerModuleFunctionSummaryRecord(
         NameVals, Summary, VE.getValueID(&F), FSCallsRelBFAbbrev,
-        FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, F);
+        FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, ContextIdAbbvId, F);
   }
 
   // Capture references from GlobalVariable initializers, which are outside
@@ -4740,7 +4779,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     getReferencedTypeIds(FS, ReferencedTypeIds);
 
     writeFunctionHeapProfileRecords(
-        Stream, FS, CallsiteAbbrev, AllocAbbrev,
+        Stream, FS, CallsiteAbbrev, AllocAbbrev, /*ContextIdAbbvId*/ 0,
         /*PerModule*/ false,
         /*GetValueId*/
         [&](const ValueInfo &VI) -> unsigned {
@@ -4760,7 +4799,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
           // the case of distributed indexes).
           assert(StackIdIndicesToIndex.contains(I));
           return StackIdIndicesToIndex[I];
-        });
+        },
+        /*WriteContextSizeInfoIndex*/ false);
 
     NameVals.push_back(*ValueId);
     assert(ModuleIdMap.count(FS->modulePath()));
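
As a standalone illustration of the fixed-width encoding described above (a
sketch in plain C++ with no LLVM dependencies, not part of this patch): each
64-bit full context hash is written to the FS_ALLOC_CONTEXT_IDS record as two
32-bit words, upper half first, and the reader recombines adjacent words into
the original id.

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Split each 64-bit context id into two 32-bit words, upper 32 bits first.
  std::vector<uint32_t> splitContextIds(const std::vector<uint64_t> &Ids) {
    std::vector<uint32_t> Words;
    Words.reserve(Ids.size() * 2);
    for (uint64_t Id : Ids) {
      Words.push_back(static_cast<uint32_t>(Id >> 32));
      Words.push_back(static_cast<uint32_t>(Id));
    }
    return Words;
  }

  // Recombine adjacent pairs of words back into the original 64-bit ids,
  // mirroring what the bitcode reader does with the record contents.
  std::vector<uint64_t> joinContextIds(const std::vector<uint32_t> &Words) {
    assert(Words.size() % 2 == 0 && "expect pairs of 32-bit words");
    std::vector<uint64_t> Ids;
    Ids.reserve(Words.size() / 2);
    for (size_t I = 0; I < Words.size(); I += 2)
      Ids.push_back(static_cast<uint64_t>(Words[I]) << 32 | Words[I + 1]);
    return Ids;
  }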

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 6ee0a5a16e2202..5cfcd21e508595 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4995,14 +4995,35 @@ void Verifier::visitMemProfMetadata(Instruction &I, MDNode *MD) {
     MDNode *StackMD = dyn_cast<MDNode>(MIB->getOperand(0));
     visitCallStackMetadata(StackMD);
 
-    // Check that remaining operands, except possibly the last, are MDString.
-    Check(llvm::all_of(MIB->operands().drop_front().drop_back(),
-                       [](const MDOperand &Op) { return isa<MDString>(Op); }),
-          "Not all !memprof MemInfoBlock operands 1 to N-1 are MDString", MIB);
-    // The last operand might be the total profiled size so can be an integer.
-    auto &LastOperand = MIB->operands().back();
-    Check(isa<MDString>(LastOperand) || mdconst::hasa<ConstantInt>(LastOperand),
-          "Last !memprof MemInfoBlock operand not MDString or int", MIB);
+    // The next set of 1 or more operands should be MDString.
+    unsigned I = 1;
+    for (; I < MIB->getNumOperands(); ++I) {
+      if (!isa<MDString>(MIB->getOperand(I))) {
+        Check(I > 1,
+              "!memprof MemInfoBlock second operand should be an MDString",
+              MIB);
+        break;
+      }
+    }
+
+    // Any remaining operands should be MDNodes that are pairs of integers.
+    for (; I < MIB->getNumOperands(); ++I) {
+      MDNode *OpNode = dyn_cast<MDNode>(MIB->getOperand(I));
+      Check(OpNode, "Not all !memprof MemInfoBlock operands 2 to N are MDNode",
+            MIB);
+      Check(OpNode->getNumOperands() == 2,
+            "Not all !memprof MemInfoBlock operands 2 to N are MDNode with 2 "
+            "operands",
+            MIB);
+      // Check that all of Op's operands are ConstantInt.
+      Check(llvm::all_of(OpNode->operands(),
+                         [](const MDOperand &Op) {
+                           return mdconst::hasa<ConstantInt>(Op);
+                         }),
+            "Not all !memprof MemInfoBlock operands 2 to N are MDNode with "
+            "ConstantInt operands",
+            MIB);
+    }
   }
 }
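
As a companion to the verifier checks above, the following minimal sketch
(not part of this patch; it reuses the same IR APIs as createMIBNode, with
hypothetical hash and size values) builds an MIB node in the shape the
updated verifier accepts: a call stack node, one alloc-type MDString, then
any number of MDNode pairs of ConstantInt {full stack id, total size}:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"
  #include "llvm/IR/Type.h"

  using namespace llvm;

  static MDNode *makeExampleMIB(LLVMContext &Ctx, MDNode *StackNode) {
    auto *I64 = Type::getInt64Ty(Ctx);
    auto MakeInt = [&](uint64_t V) {
      return ValueAsMetadata::get(ConstantInt::get(I64, V));
    };
    // Hypothetical full context hash (123) and total size in bytes (100).
    Metadata *Pair[] = {MakeInt(123), MakeInt(100)};
    Metadata *Ops[] = {StackNode, MDString::get(Ctx, "notcold"),
                       MDNode::get(Ctx, Pair)};
    return MDNode::get(Ctx, Ops);
  }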
 

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index a37e888cc04bc7..ea92c6e2f59e59 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -478,7 +478,8 @@ class CallsiteContextGraph {
   void addStackNodesForMIB(ContextNode *AllocNode,
                            CallStack<NodeT, IteratorT> &StackContext,
                            CallStack<NodeT, IteratorT> &CallsiteContext,
-                           AllocationType AllocType, uint64_t TotalSize);
+                           AllocationType AllocType,
+                           ArrayRef<ContextTotalSize> ContextSizeInfo);
 
   /// Matches all callsite metadata (or summary) to the nodes created for
   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
@@ -708,9 +709,10 @@ class CallsiteContextGraph {
   /// Map from each context ID to the AllocationType assigned to that context.
   DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
 
-  /// Map from each contextID to the profiled aggregate allocation size,
+  /// Map from each contextID to the profiled full contexts and their total
+  /// sizes (there may be more than one due to context trimming),
   /// optionally populated when requested (via MemProfReportHintedSizes).
-  DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
+  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
 
   /// Identifies the context node created for a stack id when adding the MIB
   /// contexts to the graph. This is used to locate the context nodes when
@@ -1206,8 +1208,7 @@ template <class NodeT, class IteratorT>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
-    uint64_t TotalSize) {
-  assert(!MemProfReportHintedSizes || TotalSize > 0);
+    ArrayRef<ContextTotalSize> ContextSizeInfo) {
   // Treating the hot alloc type as NotCold before the disambiguation for "hot"
   // is done.
   if (AllocType == AllocationType::Hot)
@@ -1216,8 +1217,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
   ContextIdToAllocationType[++LastContextId] = AllocType;
 
   if (MemProfReportHintedSizes) {
-    assert(TotalSize);
-    ContextIdToTotalSize[LastContextId] = TotalSize;
+    assert(!ContextSizeInfo.empty());
+    auto &Entry = ContextIdToContextSizeInfos[LastContextId];
+    Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
   }
 
   // Update alloc type and context ids for this MIB.
@@ -1262,10 +1264,6 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
     assert(ContextIdToAllocationType.count(OldId));
     // The new context has the same allocation type as original.
     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
-    // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
-    // up the size. Assume that if we are able to duplicate context ids that we
-    // will be able to disambiguate all copies.
-    ContextIdToTotalSize[LastContextId] = 0;
   }
   return NewContextIds;
 }
@@ -1964,12 +1962,28 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
           // Add all of the MIBs and their stack nodes.
           for (auto &MDOp : MemProfMD->operands()) {
             auto *MIBMD = cast<const MDNode>(MDOp);
+            std::vector<ContextTotalSize> ContextSizeInfo;
+            // Collect the context size information if it exists.
+            if (MIBMD->getNumOperands() > 2) {
+              for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
+                MDNode *ContextSizePair =
+                    dyn_cast<MDNode>(MIBMD->getOperand(I));
+                assert(ContextSizePair->getNumOperands() == 2);
+                uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
+                                           ContextSizePair->getOperand(0))
+                                           ->getZExtValue();
+                uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
+                                         ContextSizePair->getOperand(1))
+                                         ->getZExtValue();
+                ContextSizeInfo.push_back({FullStackId, TotalSize});
+              }
+            }
             MDNode *StackNode = getMIBStackNode(MIBMD);
             assert(StackNode);
             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
                 AllocNode, StackContext, CallsiteContext,
-                getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
+                getMIBAllocType(MIBMD), ContextSizeInfo);
           }
           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
           // Memprof and callsite metadata on memory allocations no longer
@@ -2045,17 +2059,19 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
               EmptyContext;
           unsigned I = 0;
           assert(!MemProfReportHintedSizes ||
-                 AN.TotalSizes.size() == AN.MIBs.size());
+                 AN.ContextSizeInfos.size() == AN.MIBs.size());
           // Now add all of the MIBs and their stack nodes.
           for (auto &MIB : AN.MIBs) {
             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                 StackContext(&MIB);
-            uint64_t TotalSize = 0;
-            if (MemProfReportHintedSizes)
-              TotalSize = AN.TotalSizes[I];
+            std::vector<ContextTotalSize> ContextSizeInfo;
+            if (MemProfReportHintedSizes) {
+              for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
+                ContextSizeInfo.push_back({FullStackId, TotalSize});
+            }
             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                 AllocNode, StackContext, EmptyContext, MIB.AllocType,
-                TotalSize);
+                ContextSizeInfo);
             I++;
           }
           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
@@ -2827,13 +2843,18 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
     std::sort(SortedIds.begin(), SortedIds.end());
     for (auto Id : SortedIds) {
-      auto SizeI = ContextIdToTotalSize.find(Id);
-      assert(SizeI != ContextIdToTotalSize.end());
       auto TypeI = ContextIdToAllocationType.find(Id);
       assert(TypeI != ContextIdToAllocationType.end());
-      OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
-         << " with total size " << SizeI->second << " is "
-         << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+      auto CSI = ContextIdToContextSizeInfos.find(Id);
+      if (CSI != ContextIdToContextSizeInfos.end()) {
+        for (auto &Info : CSI->second) {
+          OS << "MemProf hinting: "
+             << getAllocTypeString((uint8_t)TypeI->second)
+             << " full allocation context " << Info.FullStackId
+             << " with total size " << Info.TotalSize << " is "
+             << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+        }
+      }
     }
   }
 }

diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index eda07a3d40e153..d59e0d26487d4f 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -716,19 +716,22 @@ computeFullStackId(const std::vector<memprof::Frame> &CallStack) {
 }
 
 static AllocationType addCallStack(CallStackTrie &AllocTrie,
-                                   const AllocationInfo *AllocInfo) {
+                                   const AllocationInfo *AllocInfo,
+                                   uint64_t FullStackId) {
   SmallVector<uint64_t> StackIds;
   for (const auto &StackFrame : AllocInfo->CallStack)
     StackIds.push_back(computeStackId(StackFrame));
   auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
                                 AllocInfo->Info.getAllocCount(),
                                 AllocInfo->Info.getTotalLifetime());
-  uint64_t TotalSize = 0;
+  std::vector<ContextTotalSize> ContextSizeInfo;
   if (MemProfReportHintedSizes) {
-    TotalSize = AllocInfo->Info.getTotalSize();
+    auto TotalSize = AllocInfo->Info.getTotalSize();
     assert(TotalSize);
+    assert(FullStackId != 0);
+    ContextSizeInfo.push_back({FullStackId, TotalSize});
   }
-  AllocTrie.addCallStack(AllocType, StackIds, TotalSize);
+  AllocTrie.addCallStack(AllocType, StackIds, std::move(ContextSizeInfo));
   return AllocType;
 }
 
@@ -1022,11 +1025,14 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
           if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                                  InlinedCallStack)) {
             NumOfMemProfMatchedAllocContexts++;
-            auto AllocType = addCallStack(AllocTrie, AllocInfo);
+            uint64_t FullStackId = 0;
+            if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes)
+              FullStackId = computeFullStackId(AllocInfo->CallStack);
+            auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
             // Record information about the allocation if match info printing
             // was requested.
             if (ClPrintMemProfMatchInfo) {
-              auto FullStackId = computeFullStackId(AllocInfo->CallStack);
+              assert(FullStackId != 0);
               FullStackIdToAllocMatchInfo[FullStackId] = {
                   AllocInfo->Info.getTotalSize(), AllocType, /*Matched=*/true};
             }

diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll
index 42819d5421ca0f..3e8aa9766d6c5f 100644
--- a/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll
+++ b/llvm/test/ThinLTO/X86/memprof-aliased-location1.ll
@@ -62,9 +62,9 @@ attributes #0 = { noinline optnone }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold"}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold"}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}

diff --git a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll
index 663f8525043c2f..9169cc03d08d6f 100644
--- a/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll
+++ b/llvm/test/ThinLTO/X86/memprof-aliased-location2.ll
@@ -62,9 +62,9 @@ attributes #0 = { noinline optnone }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold"}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold"}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}

diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index 6922dbfd368467..6de301340acf35 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -45,7 +45,7 @@
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:	-memprof-report-hinted-sizes \
 ; RUN:	-stats -pass-remarks=memprof-context-disambiguation -save-temps \
-; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
+; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=DUMP-SIZES \
 ; RUN:	--check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS \
 ; RUN:  --check-prefix=SIZES
 
@@ -128,13 +128,16 @@ attributes #0 = { noinline optnone }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold", !10}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold", !11, !12}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
 !9 = !{i64 2732490490862098848}
+!10 = !{i64 123, i64 100}
+!11 = !{i64 456, i64 200}
+!12 = !{i64 789, i64 300}
 
 
 ; DUMP: CCG before cloning:
@@ -143,6 +146,9 @@ attributes #0 = { noinline optnone }
 ; DUMP: 	Versions: 1 MIB:
 ; DUMP: 		AllocType 1 StackIds: 2, 3, 0
 ; DUMP: 		AllocType 2 StackIds: 2, 3, 1
+; DUMP-SIZES:	ContextSizeInfo per MIB:
+; DUMP-SIZES:		{ 123, 100 }
+; DUMP-SIZES:		{ 456, 200 }, { 789, 300 }
 ; DUMP: 	(clone 0)
 ; DUMP: 	AllocTypes: NotColdCold
 ; DUMP: 	ContextIds: 1 2
@@ -267,8 +273,9 @@ attributes #0 = { noinline optnone }
 ; DUMP: 		Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2
 ; DUMP:		Clone of [[BAR]]
 
-; SIZES: NotCold context 1 with total size 100 is NotCold after cloning
-; SIZES: Cold context 2 with total size 400 is Cold after cloning
+; SIZES: NotCold full allocation context 123 with total size 100 is NotCold after cloning
+; SIZES: Cold full allocation context 456 with total size 200 is Cold after cloning
+; SIZES: Cold full allocation context 789 with total size 300 is Cold after cloning
 
 ; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
 ; REMARKS: created clone _Z3barv.memprof.1

diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll
index 8f9df20471e41c..c2810dfabffbd7 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location1.ll
@@ -66,9 +66,9 @@ attributes #6 = { builtin }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold"}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold"}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}

diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll
index c3c164d4928632..068e1f116519e8 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/aliased-location2.ll
@@ -66,9 +66,9 @@ attributes #6 = { builtin }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold"}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold"}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}

diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
index a82f872d51c7d5..952e2519bbf0b3 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
@@ -106,13 +106,16 @@ attributes #6 = { builtin }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold", i64 100}
+!3 = !{!4, !"notcold", !10}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold", i64 400}
+!5 = !{!6, !"cold", !11, !12}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
 !9 = !{i64 2732490490862098848}
+!10 = !{i64 123, i64 100}
+!11 = !{i64 456, i64 200}
+!12 = !{i64 789, i64 300}
 
 
 ; DUMP: CCG before cloning:
@@ -249,8 +252,9 @@ attributes #6 = { builtin }
 ; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv
 ; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
 
-; SIZES: NotCold context 1 with total size 100 is NotCold after cloning
-; SIZES: Cold context 2 with total size 400 is Cold after cloning
+; SIZES: NotCold full allocation context 123 with total size 100 is NotCold after cloning
+; SIZES: Cold full allocation context 456 with total size 200 is Cold after cloning
+; SIZES: Cold full allocation context 789 with total size 300 is Cold after cloning
 
 ; IR: define {{.*}} @main
 ;; The first call to foo does not allocate cold memory. It should call the

diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll
index 205eeb8878989d..f1b361de0fbba5 100644
--- a/llvm/test/Transforms/PGOProfile/memprof.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof.ll
@@ -347,17 +347,24 @@ for.end:                                          ; preds = %for.cond
 ; MEMPROF: ![[C11]] = !{i64 1544787832369987002}
 
 ;; For non-context sensitive allocations that get attributes we emit a message
-;; with the allocation hash, type, and size in bytes.
-; TOTALSIZES: Total size for allocation with location hash 6792096022461663180 and single alloc type notcold: 10
-; TOTALSIZES: Total size for allocation with location hash 15737101490731057601 and single alloc type cold: 10
-;; For context sensitive allocations the size in bytes is included on the MIB
-;; metadata.
-; TOTALSIZES: !"cold", i64 10}
-; TOTALSIZES: !"cold", i64 10}
-; TOTALSIZES: !"notcold", i64 10}
-; TOTALSIZES: !"cold", i64 20}
-; TOTALSIZES: !"notcold", i64 10}
-
+;; with the full allocation context hash, type, and size in bytes.
+; TOTALSIZES: Total size for full allocation context hash 6792096022461663180 and single alloc type notcold: 10
+; TOTALSIZES: Total size for full allocation context hash 15737101490731057601 and single alloc type cold: 10
+;; For context sensitive allocations the full context hash and size in bytes
+;; are in separate metadata nodes included on the MIB metadata.
+; TOTALSIZES: !"cold", ![[CONTEXT1:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT1]] = !{i64 8525406123785421946, i64 10}
+; TOTALSIZES: !"cold", ![[CONTEXT2:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT2]] = !{i64 -6732513409544482918, i64 10}
+; TOTALSIZES: !"notcold", ![[CONTEXT3:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT3]] = !{i64 5725971306423925017, i64 10}
+;; There can be more than one context id / size pair due to context trimming
+;; when we match.
+; TOTALSIZES: !"cold", ![[CONTEXT4:[0-9]+]], ![[CONTEXT5:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT4]] = !{i64 -2103941543456458045, i64 10}
+; TOTALSIZES: ![[CONTEXT5]] = !{i64 -191931298737547222, i64 10}
+; TOTALSIZES: !"notcold", ![[CONTEXT6:[0-9]+]]}
+; TOTALSIZES: ![[CONTEXT6]] = !{i64 1093248920606587996, i64 10}
 
 ; MEMPROFNOCOLINFO: #[[A1]] = { builtin allocsize(0) "memprof"="notcold" }
 ; MEMPROFNOCOLINFO: #[[A2]] = { builtin allocsize(0) "memprof"="cold" }

diff --git a/llvm/test/Verifier/memprof-metadata-bad.ll b/llvm/test/Verifier/memprof-metadata-bad.ll
index f4f1f6bb0a4635..b8c2c2d8a2c993 100644
--- a/llvm/test/Verifier/memprof-metadata-bad.ll
+++ b/llvm/test/Verifier/memprof-metadata-bad.ll
@@ -43,7 +43,7 @@ declare dso_local noalias noundef ptr @malloc(i64 noundef)
 !6 = !{i64 0}
 !7 = !{!8}
 ; CHECK: call stack metadata should have at least 1 operand
-; CHECK: Not all !memprof MemInfoBlock operands 1 to N-1 are MDString
+; CHECK: Not all !memprof MemInfoBlock operands 2 to N are MDNode
 !8 = !{!0, !"default", i64 0, i64 5}
 !9 = !{i64 123}
 ; CHECK: call stack metadata operand should be constant integer


        

