[llvm] [MemProf] Track and report profiled sizes through cloning (PR #98382)

Teresa Johnson via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 11 14:01:46 PDT 2024


https://github.com/teresajohnson updated https://github.com/llvm/llvm-project/pull/98382

>From 73358b0b8640744b1247c17ae25450ff9d52642a Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Wed, 10 Jul 2024 10:02:03 -0700
Subject: [PATCH 1/4] [MemProf] Track and report profiled sizes through cloning

If requested, via the -memprof-report-hinted-sizes option, track the
total profiled size of each MIB through the thin link, then report on
the corresponding allocation coldness after all cloning is complete.

To save size, a different bitcode record type is used for the allocation
info when the option is specified, and the sizes are kept separate from
the MIBs in the index.
---
 llvm/include/llvm/Bitcode/LLVMBitCodes.h      | 10 +++
 llvm/include/llvm/IR/ModuleSummaryIndex.h     | 14 ++++
 llvm/include/llvm/IR/ModuleSummaryIndexYAML.h |  3 +-
 llvm/lib/Analysis/ModuleSummaryAnalysis.cpp   | 16 +++-
 llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp   |  2 +
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     | 34 +++++++-
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     | 59 +++++++++++---
 .../IPO/MemProfContextDisambiguation.cpp      | 81 +++++++++++++++----
 llvm/test/ThinLTO/X86/memprof-basic.ll        | 17 ++--
 .../MemProfContextDisambiguation/basic.ll     | 11 ++-
 10 files changed, 205 insertions(+), 42 deletions(-)

diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 5b5e08b5cbc3f..9f989aaf22779 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -319,6 +319,16 @@ enum GlobalValueSummarySymtabCodes {
   //  numver x version]
   FS_COMBINED_ALLOC_INFO = 29,
   FS_STACK_IDS = 30,
+  // Summary of per-module allocation memprof metadata when we are tracking
+  // total sizes.
+  // [n x (alloc type, total size, nummib, nummib x stackidindex)]
+  FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES = 31,
+  // Summary of combined index allocation memprof metadata when we are tracking
+  // total sizes.
+  // [nummib, numver,
+  //  nummib x (alloc type, total size, numstackids,
+  //  numstackids x stackidindex), numver x version]
+  FS_COMBINED_ALLOC_INFO_TOTAL_SIZES = 32,
 };
 
 enum MetadataCodes {
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 31271ed388e54..950c0ed824dbb 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -403,6 +403,10 @@ struct AllocInfo {
   // Vector of MIBs in this memprof metadata.
   std::vector<MIBInfo> MIBs;
 
+  // If requested, keep track of total profiled sizes for each MIB. This will be
+  // a vector of the same length and order as the MIBs vector.
+  std::unique_ptr<std::vector<uint64_t>> TotalSizes;
+
   AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
     Versions.push_back(0);
   }
@@ -423,6 +427,16 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) {
   for (auto &M : AE.MIBs) {
     OS << "\t\t" << M << "\n";
   }
+  if (AE.TotalSizes) {
+    OS << " TotalSizes per MIB:\n\t\t";
+    First = true;
+    for (auto &TS : *AE.TotalSizes) {
+      if (!First)
+        OS << ", ";
+      First = false;
+      OS << TS << "\n";
+    }
+  }
   return OS;
 }
 
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
index b2747d24c5396..26bf08f9f7439 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -224,6 +224,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
           V.emplace(RefGUID, /*IsAnalysis=*/false);
         Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*V.find(RefGUID)));
       }
+      std::vector<AllocInfo> EmptyAllocInfo;
       Elem.SummaryList.push_back(std::make_unique<FunctionSummary>(
           GlobalValueSummary::GVFlags(
               static_cast<GlobalValue::LinkageTypes>(FSum.Linkage),
@@ -238,7 +239,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
           std::move(FSum.TypeTestAssumeConstVCalls),
           std::move(FSum.TypeCheckedLoadConstVCalls),
           ArrayRef<FunctionSummary::ParamAccess>{}, ArrayRef<CallsiteInfo>{},
-          ArrayRef<AllocInfo>{}));
+          std::move(EmptyAllocInfo)));
     }
   }
   static void output(IO &io, GlobalValueSummaryMapTy &V) {
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 94ac0484f5ec7..c3b6fd8bc20af 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -85,6 +85,8 @@ extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
 
 extern cl::opt<unsigned> MaxNumVTableAnnotations;
 
+extern cl::opt<bool> MemProfReportHintedSizes;
+
 // Walk through the operands of a given User via worklist iteration and populate
 // the set of GlobalValue references encountered. Invoked either on an
 // Instruction or a GlobalVariable (which walks its initializer).
@@ -517,6 +519,7 @@ static void computeFunctionSummary(
       auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
       if (MemProfMD) {
         std::vector<MIBInfo> MIBs;
+        std::vector<uint64_t> TotalSizes;
         for (auto &MDOp : MemProfMD->operands()) {
           auto *MIBMD = cast<const MDNode>(MDOp);
           MDNode *StackNode = getMIBStackNode(MIBMD);
@@ -536,8 +539,18 @@ static void computeFunctionSummary(
           }
           MIBs.push_back(
               MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices)));
+          if (MemProfReportHintedSizes) {
+            auto TotalSize = getMIBTotalSize(MIBMD);
+            assert(TotalSize);
+            TotalSizes.push_back(TotalSize);
+          }
         }
         Allocs.push_back(AllocInfo(std::move(MIBs)));
+        if (MemProfReportHintedSizes) {
+          assert(Allocs.back().MIBs.size() == TotalSizes.size());
+          Allocs.back().TotalSizes =
+              std::make_unique<std::vector<uint64_t>>(std::move(TotalSizes));
+        }
       } else if (!InstCallsite.empty()) {
         SmallVector<unsigned> StackIdIndices;
         for (auto StackId : InstCallsite)
@@ -935,6 +948,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
           CantBePromoted.insert(GV->getGUID());
           // Create the appropriate summary type.
           if (Function *F = dyn_cast<Function>(GV)) {
+            std::vector<AllocInfo> EmptyAllocInfo;
             std::unique_ptr<FunctionSummary> Summary =
                 std::make_unique<FunctionSummary>(
                     GVFlags, /*InstCount=*/0,
@@ -957,7 +971,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
                     ArrayRef<FunctionSummary::ConstVCall>{},
                     ArrayRef<FunctionSummary::ConstVCall>{},
                     ArrayRef<FunctionSummary::ParamAccess>{},
-                    ArrayRef<CallsiteInfo>{}, ArrayRef<AllocInfo>{});
+                    ArrayRef<CallsiteInfo>{}, std::move(EmptyAllocInfo));
             Index.addGlobalValueSummary(*GV, std::move(Summary));
           } else {
             std::unique_ptr<GlobalVarSummary> Summary =
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index b7ed9cdf63145..82aa7b7e077a1 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -328,6 +328,8 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO)
       STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
       STRINGIFY_CODE(FS, STACK_IDS)
+      STRINGIFY_CODE(FS, PERMODULE_ALLOC_INFO_TOTAL_SIZES)
+      STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO_TOTAL_SIZES)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index f56b2b32ff98f..1cba859a3fade 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7991,12 +7991,19 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       break;
     }
 
-    case bitc::FS_PERMODULE_ALLOC_INFO: {
+    case bitc::FS_PERMODULE_ALLOC_INFO:
+    case bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES: {
+      bool HasTotalSizes = BitCode == bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES;
       unsigned I = 0;
       std::vector<MIBInfo> MIBs;
+      std::vector<uint64_t> TotalSizes;
       while (I < Record.size()) {
-        assert(Record.size() - I >= 2);
+        assert(Record.size() - I >= (HasTotalSizes ? 3 : 2));
         AllocationType AllocType = (AllocationType)Record[I++];
+        if (HasTotalSizes) {
+          TotalSizes.push_back(Record[I++]);
+          assert(TotalSizes.back());
+        }
         unsigned NumStackEntries = Record[I++];
         assert(Record.size() - I >= NumStackEntries);
         SmallVector<unsigned> StackIdList;
@@ -8008,18 +8015,31 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
         MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
       }
       PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
+      assert(HasTotalSizes != TotalSizes.empty());
+      if (HasTotalSizes) {
+        assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
+        PendingAllocs.back().TotalSizes =
+            std::make_unique<std::vector<uint64_t>>(std::move(TotalSizes));
+      }
       break;
     }
 
-    case bitc::FS_COMBINED_ALLOC_INFO: {
+    case bitc::FS_COMBINED_ALLOC_INFO:
+    case bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES: {
+      bool HasTotalSizes = BitCode == bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES;
       unsigned I = 0;
       std::vector<MIBInfo> MIBs;
+      std::vector<uint64_t> TotalSizes;
       unsigned NumMIBs = Record[I++];
       unsigned NumVersions = Record[I++];
       unsigned MIBsRead = 0;
       while (MIBsRead++ < NumMIBs) {
-        assert(Record.size() - I >= 2);
+        assert(Record.size() - I >= (HasTotalSizes ? 3 : 2));
         AllocationType AllocType = (AllocationType)Record[I++];
+        if (HasTotalSizes) {
+          TotalSizes.push_back(Record[I++]);
+          assert(TotalSizes.back());
+        }
         unsigned NumStackEntries = Record[I++];
         assert(Record.size() - I >= NumStackEntries);
         SmallVector<unsigned> StackIdList;
@@ -8036,6 +8056,12 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
         Versions.push_back(Record[I++]);
       PendingAllocs.push_back(
           AllocInfo(std::move(Versions), std::move(MIBs)));
+      assert(HasTotalSizes != TotalSizes.empty());
+      if (HasTotalSizes) {
+        assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
+        PendingAllocs.back().TotalSizes =
+            std::make_unique<std::vector<uint64_t>>(std::move(TotalSizes));
+      }
       break;
     }
     }
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 3378931065f9b..7b8f6371b5698 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -230,7 +230,8 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase {
   void writePerModuleFunctionSummaryRecord(
       SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
       unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
-      unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F);
+      unsigned CallsiteAbbrev, unsigned AllocAbbrev,
+      unsigned AllocTotalSizeAbbrev, const Function &F);
   void writeModuleLevelReferences(const GlobalVariable &V,
                                   SmallVector<uint64_t, 64> &NameVals,
                                   unsigned FSModRefsAbbrev,
@@ -4158,7 +4159,7 @@ static void writeTypeIdCompatibleVtableSummaryRecord(
 
 static void writeFunctionHeapProfileRecords(
     BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev,
-    unsigned AllocAbbrev, bool PerModule,
+    unsigned AllocAbbrev, unsigned AllocTotalSizeAbbrev, bool PerModule,
     std::function<unsigned(const ValueInfo &VI)> GetValueID,
     std::function<unsigned(unsigned)> GetStackIndex) {
   SmallVector<uint64_t> Record;
@@ -4193,19 +4194,31 @@ static void writeFunctionHeapProfileRecords(
       Record.push_back(AI.MIBs.size());
       Record.push_back(AI.Versions.size());
     }
+    unsigned I = 0;
+    assert(!AI.TotalSizes || AI.TotalSizes->size() == AI.MIBs.size());
     for (auto &MIB : AI.MIBs) {
       Record.push_back((uint8_t)MIB.AllocType);
+      if (AI.TotalSizes) {
+        Record.push_back((*AI.TotalSizes)[I]);
+        assert((*AI.TotalSizes)[I]);
+      }
       Record.push_back(MIB.StackIdIndices.size());
       for (auto Id : MIB.StackIdIndices)
         Record.push_back(GetStackIndex(Id));
+      I++;
     }
     if (!PerModule) {
       for (auto V : AI.Versions)
         Record.push_back(V);
     }
-    Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
-                                : bitc::FS_COMBINED_ALLOC_INFO,
-                      Record, AllocAbbrev);
+    unsigned Code =
+        PerModule ? (AI.TotalSizes ? bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES
+                                   : bitc::FS_PERMODULE_ALLOC_INFO)
+                  : (AI.TotalSizes ? bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES
+                                   : bitc::FS_COMBINED_ALLOC_INFO);
+    unsigned AllocAbbrevToUse =
+        AI.TotalSizes ? AllocTotalSizeAbbrev : AllocAbbrev;
+    Stream.EmitRecord(Code, Record, AllocAbbrevToUse);
   }
 }
 
@@ -4214,7 +4227,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
     SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
     unsigned ValueID, unsigned FSCallsRelBFAbbrev,
     unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev,
-    unsigned AllocAbbrev, const Function &F) {
+    unsigned AllocAbbrev, unsigned AllocTotalSizeAbbrev, const Function &F) {
   NameVals.push_back(ValueID);
 
   FunctionSummary *FS = cast<FunctionSummary>(Summary);
@@ -4225,7 +4238,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
       });
 
   writeFunctionHeapProfileRecords(
-      Stream, FS, CallsiteAbbrev, AllocAbbrev,
+      Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev,
       /*PerModule*/ true,
       /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); },
       /*GetStackIndex*/ [&](unsigned I) { return I; });
@@ -4437,6 +4450,13 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES));
+  // n x (alloc type, total size, numstackids, numstackids x stackidindex)
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned AllocTotalSizeAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
   SmallVector<uint64_t, 64> NameVals;
   // Iterate over the list of functions instead of the Index to
   // ensure the ordering is stable.
@@ -4454,9 +4474,10 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
       continue;
     }
     auto *Summary = VI.getSummaryList()[0].get();
-    writePerModuleFunctionSummaryRecord(
-        NameVals, Summary, VE.getValueID(&F), FSCallsRelBFAbbrev,
-        FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, F);
+    writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F),
+                                        FSCallsRelBFAbbrev,
+                                        FSCallsProfileAbbrev, CallsiteAbbrev,
+                                        AllocAbbrev, AllocTotalSizeAbbrev, F);
   }
 
   // Capture references from GlobalVariable initializers, which are outside
@@ -4586,6 +4607,16 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     return DecSummaries->count(GVS);
   };
 
+  Abbv = std::make_shared<BitCodeAbbrev>();
+  Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
+  // nummib x (alloc type, total size, numstackids, numstackids x stackidindex),
+  // numver x version
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+  unsigned AllocTotalSizeAbbrev = Stream.EmitAbbrev(std::move(Abbv));
+
   // The aliases are emitted as a post-pass, and will point to the value
   // id of the aliasee. Save them in a vector for post-processing.
   SmallVector<AliasSummary *, 64> Aliases;
@@ -4673,9 +4704,10 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     getReferencedTypeIds(FS, ReferencedTypeIds);
 
     writeFunctionHeapProfileRecords(
-        Stream, FS, CallsiteAbbrev, AllocAbbrev,
+        Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev,
         /*PerModule*/ false,
-        /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned {
+        /*GetValueId*/
+        [&](const ValueInfo &VI) -> unsigned {
           std::optional<unsigned> ValueID = GetValueId(VI);
           // This can happen in shared index files for distributed ThinLTO if
           // the callee function summary is not included. Record 0 which we
@@ -4685,7 +4717,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
             return 0;
           return *ValueID;
         },
-        /*GetStackIndex*/ [&](unsigned I) {
+        /*GetStackIndex*/
+        [&](unsigned I) {
           // Get the corresponding index into the list of StackIds actually
           // being written for this combined index (which may be a subset in
           // the case of distributed indexes).
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index c1e5ab1a2b561..f5713d31e263f 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -134,6 +134,8 @@ cl::opt<bool> SupportsHotColdNew(
     cl::desc("Linking with hot/cold operator new interfaces"));
 } // namespace llvm
 
+extern cl::opt<bool> MemProfReportHintedSizes;
+
 namespace {
 /// CRTP base for graphs built from either IR or ThinLTO summary index.
 ///
@@ -172,6 +174,7 @@ class CallsiteContextGraph {
 
   void dump() const;
   void print(raw_ostream &OS) const;
+  void printTotalSizes(raw_ostream &OS) const;
 
   friend raw_ostream &operator<<(raw_ostream &OS,
                                  const CallsiteContextGraph &CCG) {
@@ -439,7 +442,7 @@ class CallsiteContextGraph {
   void addStackNodesForMIB(ContextNode *AllocNode,
                            CallStack<NodeT, IteratorT> &StackContext,
                            CallStack<NodeT, IteratorT> &CallsiteContext,
-                           AllocationType AllocType);
+                           AllocationType AllocType, uint64_t TotalSize);
 
   /// Matches all callsite metadata (or summary) to the nodes created for
   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
@@ -611,6 +614,8 @@ class CallsiteContextGraph {
   /// Map from each context ID to the AllocationType assigned to that context.
   DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
 
+  std::map<uint32_t, uint64_t> ContextIdToTotalSize;
+
   /// Identifies the context node created for a stack id when adding the MIB
   /// contexts to the graph. This is used to locate the context nodes when
   /// trying to assign the corresponding callsites with those stack ids to these
@@ -1004,11 +1009,24 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
   return AllocNode;
 }
 
+static std::string getAllocTypeString(uint8_t AllocTypes) {
+  if (!AllocTypes)
+    return "None";
+  std::string Str;
+  if (AllocTypes & (uint8_t)AllocationType::NotCold)
+    Str += "NotCold";
+  if (AllocTypes & (uint8_t)AllocationType::Cold)
+    Str += "Cold";
+  return Str;
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 template <class NodeT, class IteratorT>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
-    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
+    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
+    uint64_t TotalSize) {
+  assert(!MemProfReportHintedSizes || TotalSize > 0);
   // Treating the hot alloc type as NotCold before the disambiguation for "hot"
   // is done.
   if (AllocType == AllocationType::Hot)
@@ -1016,6 +1034,11 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
 
   ContextIdToAllocationType[++LastContextId] = AllocType;
 
+  if (MemProfReportHintedSizes) {
+    assert(TotalSize);
+    ContextIdToTotalSize[LastContextId] = TotalSize;
+  }
+
   // Update alloc type and context ids for this MIB.
   AllocNode->AllocTypes |= (uint8_t)AllocType;
 
@@ -1060,6 +1083,10 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
     assert(ContextIdToAllocationType.count(OldId));
     // The new context has the same allocation type as original.
     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
+    // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
+    // up the size. Assume that if we are able to duplicate context ids that we
+    // will be able to disambiguate all copies.
+    ContextIdToTotalSize[LastContextId] = 0;
   }
   return NewContextIds;
 }
@@ -1663,7 +1690,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
                 AllocNode, StackContext, CallsiteContext,
-                getMIBAllocType(MIBMD));
+                getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
           }
           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
           // Memprof and callsite metadata on memory allocations no longer
@@ -1735,12 +1762,20 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
           // stack ids on the allocation call during ModuleSummaryAnalysis.
           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
               EmptyContext;
+          unsigned I = 0;
+          assert(!MemProfReportHintedSizes ||
+                 (AN.TotalSizes && AN.TotalSizes->size() == AN.MIBs.size()));
           // Now add all of the MIBs and their stack nodes.
           for (auto &MIB : AN.MIBs) {
             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                 StackContext(&MIB);
+            uint64_t TotalSize = 0;
+            if (MemProfReportHintedSizes)
+              TotalSize = (*AN.TotalSizes)[I];
             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
-                AllocNode, StackContext, EmptyContext, MIB.AllocType);
+                AllocNode, StackContext, EmptyContext, MIB.AllocType,
+                TotalSize);
+            I++;
           }
           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
           // Initialize version 0 on the summary alloc node to the current alloc
@@ -2171,17 +2206,6 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc(
   return true;
 }
 
-static std::string getAllocTypeString(uint8_t AllocTypes) {
-  if (!AllocTypes)
-    return "None";
-  std::string Str;
-  if (AllocTypes & (uint8_t)AllocationType::NotCold)
-    Str += "NotCold";
-  if (AllocTypes & (uint8_t)AllocationType::Cold)
-    Str += "Cold";
-  return Str;
-}
-
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
     const {
@@ -2261,6 +2285,30 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
   }
 }
 
+template <typename DerivedCCG, typename FuncTy, typename CallTy>
+void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
+    raw_ostream &OS) const {
+  using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
+  for (const auto Node : nodes<GraphType>(this)) {
+    if (Node->isRemoved())
+      continue;
+    if (!Node->IsAllocation)
+      continue;
+    auto ContextIds = Node->getContextIds();
+    std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
+    std::sort(SortedIds.begin(), SortedIds.end());
+    for (auto Id : SortedIds) {
+      auto SizeI = ContextIdToTotalSize.find(Id);
+      assert(SizeI != ContextIdToTotalSize.end());
+      auto TypeI = ContextIdToAllocationType.find(Id);
+      assert(TypeI != ContextIdToAllocationType.end());
+      OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
+         << " with total size " << SizeI->second << " is "
+         << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
+    }
+  }
+}
+
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
@@ -3797,6 +3845,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
   if (ExportToDot)
     exportToDot("clonefuncassign");
 
+  if (MemProfReportHintedSizes)
+    printTotalSizes(errs());
+
   return Changed;
 }
 
diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll
index 54e01e5fcdf95..e080279a16412 100644
--- a/llvm/test/ThinLTO/X86/memprof-basic.ll
+++ b/llvm/test/ThinLTO/X86/memprof-basic.ll
@@ -34,7 +34,7 @@
 ;; -stats requires asserts
 ; REQUIRES: asserts
 
-; RUN: opt -thinlto-bc %s >%t.o
+; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o
 ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \
 ; RUN:	-supports-hot-cold-new \
 ; RUN:	-r=%t.o,main,plx \
@@ -43,9 +43,11 @@
 ; RUN:	-r=%t.o,_Znam, \
 ; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
+; RUN:	-memprof-report-hinted-sizes \
 ; RUN:	-stats -pass-remarks=memprof-context-disambiguation -save-temps \
 ; RUN:	-o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:	--check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS
+; RUN:	--check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS \
+; RUN:  --check-prefix=SIZES
 
 ; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
@@ -64,9 +66,10 @@
 ; RUN:	-r=%t.o,_Znam, \
 ; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \
+; RUN:	-memprof-report-hinted-sizes \
 ; RUN:	-stats -pass-remarks=memprof-context-disambiguation \
 ; RUN:	-o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \
-; RUN:	--check-prefix=STATS
+; RUN:	--check-prefix=STATS --check-prefix=SIZES
 
 ; RUN:	cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
@@ -125,9 +128,9 @@ attributes #0 = { noinline optnone }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold"}
+!3 = !{!4, !"notcold", i64 100}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold"}
+!5 = !{!6, !"cold", i64 400}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
@@ -265,6 +268,10 @@ attributes #0 = { noinline optnone }
 ; DUMP:		Clone of [[BAR]]
 
 
+; SIZES: NotCold context 1 with total size 100 is NotCold after cloning
+; SIZES: Cold context 2 with total size 400 is Cold after cloning
+
+
 ; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1
 ; REMARKS: created clone _Z3barv.memprof.1
 ; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
index 483582c6ced95..c685a413718a6 100644
--- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
+++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll
@@ -38,8 +38,9 @@
 ; RUN:	-memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \
 ; RUN:	-memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \
 ; RUN:	-stats -pass-remarks=memprof-context-disambiguation \
+; RUN:  -memprof-report-hinted-sizes \
 ; RUN:	%s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \
-; RUN:	--check-prefix=STATS --check-prefix=REMARKS
+; RUN:	--check-prefix=STATS --check-prefix=REMARKS --check-prefix=SIZES
 
 ; RUN:	cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT
 ;; We should have cloned bar, baz, and foo, for the cold memory allocation.
@@ -105,9 +106,9 @@ attributes #6 = { builtin }
 !0 = !{i64 8632435727821051414}
 !1 = !{i64 -3421689549917153178}
 !2 = !{!3, !5}
-!3 = !{!4, !"notcold"}
+!3 = !{!4, !"notcold", i64 100}
 !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414}
-!5 = !{!6, !"cold"}
+!5 = !{!6, !"cold", i64 400}
 !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178}
 !7 = !{i64 9086428284934609951}
 !8 = !{i64 -5964873800580613432}
@@ -249,6 +250,10 @@ attributes #6 = { builtin }
 ; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold
 
 
+; SIZES: NotCold context 1 with total size 100 is NotCold after cloning
+; SIZES: Cold context 2 with total size 400 is Cold after cloning
+
+
 ; IR: define {{.*}} @main
 ;; The first call to foo does not allocate cold memory. It should call the
 ;; original functions, which ultimately call the original allocation decorated

>From f320adfa050b3bec2e5094ef7d97f4cdc9d7c5e7 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 11 Jul 2024 08:52:20 -0700
Subject: [PATCH 2/4] Address comments

---
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp                | 6 ++----
 llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 6 ++++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 7b8f6371b5698..4463eb48b9811 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4706,8 +4706,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     writeFunctionHeapProfileRecords(
         Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev,
         /*PerModule*/ false,
-        /*GetValueId*/
-        [&](const ValueInfo &VI) -> unsigned {
+        /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned {
           std::optional<unsigned> ValueID = GetValueId(VI);
           // This can happen in shared index files for distributed ThinLTO if
           // the callee function summary is not included. Record 0 which we
@@ -4717,8 +4716,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
             return 0;
           return *ValueID;
         },
-        /*GetStackIndex*/
-        [&](unsigned I) {
+        /*GetStackIndex*/ [&](unsigned I) {
           // Get the corresponding index into the list of StackIds actually
           // being written for this combined index (which may be a subset in
           // the case of distributed indexes).
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index f5713d31e263f..01138555a8fbc 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -614,7 +614,9 @@ class CallsiteContextGraph {
   /// Map from each context ID to the AllocationType assigned to that context.
   DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
 
-  std::map<uint32_t, uint64_t> ContextIdToTotalSize;
+  /// Map from each contextID to the profiled aggregate allocation size,
+  /// optionally populated when requested (via MemProfReportHintedSizes).
+  DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
 
   /// Identifies the context node created for a stack id when adding the MIB
   /// contexts to the graph. This is used to locate the context nodes when
@@ -2294,7 +2296,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
       continue;
     if (!Node->IsAllocation)
       continue;
-    auto ContextIds = Node->getContextIds();
+    DenseSet<uint32_t> ContextIds = Node->getContextIds();
     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
     std::sort(SortedIds.begin(), SortedIds.end());
     for (auto Id : SortedIds) {

>From 8be1c3cc1eee302c15fe105ca1ac5f38bde2fcec Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 11 Jul 2024 13:59:18 -0700
Subject: [PATCH 3/4] Address comments

---
 llvm/include/llvm/Bitcode/LLVMBitCodes.h      | 15 +---
 llvm/include/llvm/IR/ModuleSummaryIndex.h     | 10 +--
 llvm/include/llvm/IR/ModuleSummaryIndexYAML.h |  3 +-
 llvm/lib/Analysis/ModuleSummaryAnalysis.cpp   |  6 +-
 llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp   |  2 -
 llvm/lib/Bitcode/Reader/BitcodeReader.cpp     | 58 ++++++++-------
 llvm/lib/Bitcode/Writer/BitcodeWriter.cpp     | 72 +++++++------------
 .../IPO/MemProfContextDisambiguation.cpp      |  4 +-
 llvm/test/Bitcode/summary_version.ll          |  2 +-
 .../thinlto-func-summary-vtableref-pgo.ll     |  2 +-
 10 files changed, 71 insertions(+), 103 deletions(-)

diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 9f989aaf22779..4e6ef4c1a8c31 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -307,7 +307,8 @@ enum GlobalValueSummarySymtabCodes {
   // [valueid, n x stackidindex]
   FS_PERMODULE_CALLSITE_INFO = 26,
   // Summary of per-module allocation memprof metadata.
-  // [n x (alloc type, nummib, nummib x stackidindex)]
+  // [nummib, nummib x (alloc type, nummib, nummib x stackidindex),
+  // [nummib x total size]?]
   FS_PERMODULE_ALLOC_INFO = 27,
   // Summary of combined index memprof callsite metadata.
   // [valueid, numstackindices, numver,
@@ -316,19 +317,9 @@ enum GlobalValueSummarySymtabCodes {
   // Summary of combined index allocation memprof metadata.
   // [nummib, numver,
   //  nummib x (alloc type, numstackids, numstackids x stackidindex),
-  //  numver x version]
+  //  numver x version, [nummib x total size]?]
   FS_COMBINED_ALLOC_INFO = 29,
   FS_STACK_IDS = 30,
-  // Summary of per-module allocation memprof metadata when we are tracking
-  // total sizes.
-  // [n x (alloc type, total size, nummib, nummib x stackidindex)]
-  FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES = 31,
-  // Summary of combined index allocation memprof metadata when we are tracking
-  // total sizes.
-  // [nummib, numver,
-  //  nummib x (alloc type, total size, numstackids,
-  //  numstackids x stackidindex), numver x version]
-  FS_COMBINED_ALLOC_INFO_TOTAL_SIZES = 32,
 };
 
 enum MetadataCodes {
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h
index 950c0ed824dbb..949e60ffd1dbf 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndex.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h
@@ -404,8 +404,8 @@ struct AllocInfo {
   std::vector<MIBInfo> MIBs;
 
   // If requested, keep track of total profiled sizes for each MIB. This will be
-  // a vector of the same length and order as the MIBs vector.
-  std::unique_ptr<std::vector<uint64_t>> TotalSizes;
+  // a vector of the same length and order as the MIBs vector, if non-empty.
+  std::vector<uint64_t> TotalSizes;
 
   AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
     Versions.push_back(0);
@@ -427,10 +427,10 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) {
   for (auto &M : AE.MIBs) {
     OS << "\t\t" << M << "\n";
   }
-  if (AE.TotalSizes) {
+  if (!AE.TotalSizes.empty()) {
     OS << " TotalSizes per MIB:\n\t\t";
     First = true;
-    for (auto &TS : *AE.TotalSizes) {
+    for (auto &TS : AE.TotalSizes) {
       if (!First)
         OS << ", ";
       First = false;
@@ -1445,7 +1445,7 @@ class ModuleSummaryIndex {
   // in the way some record are interpreted, like flags for instance.
   // Note that incrementing this may require changes in both BitcodeReader.cpp
   // and BitcodeWriter.cpp.
-  static constexpr uint64_t BitcodeSummaryVersion = 9;
+  static constexpr uint64_t BitcodeSummaryVersion = 10;
 
   // Regular LTO module name for ASM writer
   static constexpr const char *getRegularLTOModuleName() {
diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
index 26bf08f9f7439..b2747d24c5396 100644
--- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -224,7 +224,6 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
           V.emplace(RefGUID, /*IsAnalysis=*/false);
         Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*V.find(RefGUID)));
       }
-      std::vector<AllocInfo> EmptyAllocInfo;
       Elem.SummaryList.push_back(std::make_unique<FunctionSummary>(
           GlobalValueSummary::GVFlags(
               static_cast<GlobalValue::LinkageTypes>(FSum.Linkage),
@@ -239,7 +238,7 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
           std::move(FSum.TypeTestAssumeConstVCalls),
           std::move(FSum.TypeCheckedLoadConstVCalls),
           ArrayRef<FunctionSummary::ParamAccess>{}, ArrayRef<CallsiteInfo>{},
-          std::move(EmptyAllocInfo)));
+          ArrayRef<AllocInfo>{}));
     }
   }
   static void output(IO &io, GlobalValueSummaryMapTy &V) {
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index c3b6fd8bc20af..e9490ccba8215 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -548,8 +548,7 @@ static void computeFunctionSummary(
         Allocs.push_back(AllocInfo(std::move(MIBs)));
         if (MemProfReportHintedSizes) {
           assert(Allocs.back().MIBs.size() == TotalSizes.size());
-          Allocs.back().TotalSizes =
-              std::make_unique<std::vector<uint64_t>>(std::move(TotalSizes));
+          Allocs.back().TotalSizes = std::move(TotalSizes);
         }
       } else if (!InstCallsite.empty()) {
         SmallVector<unsigned> StackIdIndices;
@@ -948,7 +947,6 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
           CantBePromoted.insert(GV->getGUID());
           // Create the appropriate summary type.
           if (Function *F = dyn_cast<Function>(GV)) {
-            std::vector<AllocInfo> EmptyAllocInfo;
             std::unique_ptr<FunctionSummary> Summary =
                 std::make_unique<FunctionSummary>(
                     GVFlags, /*InstCount=*/0,
@@ -971,7 +969,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
                     ArrayRef<FunctionSummary::ConstVCall>{},
                     ArrayRef<FunctionSummary::ConstVCall>{},
                     ArrayRef<FunctionSummary::ParamAccess>{},
-                    ArrayRef<CallsiteInfo>{}, std::move(EmptyAllocInfo));
+                    ArrayRef<CallsiteInfo>{}, ArrayRef<AllocInfo>{});
             Index.addGlobalValueSummary(*GV, std::move(Summary));
           } else {
             std::unique_ptr<GlobalVarSummary> Summary =
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 82aa7b7e077a1..b7ed9cdf63145 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -328,8 +328,6 @@ GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO)
       STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO)
       STRINGIFY_CODE(FS, STACK_IDS)
-      STRINGIFY_CODE(FS, PERMODULE_ALLOC_INFO_TOTAL_SIZES)
-      STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO_TOTAL_SIZES)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch (CodeID) {
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 1cba859a3fade..6203c6e5119d1 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -7991,19 +7991,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       break;
     }
 
-    case bitc::FS_PERMODULE_ALLOC_INFO:
-    case bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES: {
-      bool HasTotalSizes = BitCode == bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES;
+    case bitc::FS_PERMODULE_ALLOC_INFO: {
       unsigned I = 0;
       std::vector<MIBInfo> MIBs;
-      std::vector<uint64_t> TotalSizes;
-      while (I < Record.size()) {
-        assert(Record.size() - I >= (HasTotalSizes ? 3 : 2));
+      unsigned NumMIBs = 0;
+      if (Version >= 10)
+        NumMIBs = Record[I++];
+      unsigned MIBsRead = 0;
+      while ((Version >= 10 && MIBsRead++ < NumMIBs) ||
+             (Version < 10 && I < Record.size())) {
+        assert(Record.size() - I >= 2);
         AllocationType AllocType = (AllocationType)Record[I++];
-        if (HasTotalSizes) {
-          TotalSizes.push_back(Record[I++]);
-          assert(TotalSizes.back());
-        }
         unsigned NumStackEntries = Record[I++];
         assert(Record.size() - I >= NumStackEntries);
         SmallVector<unsigned> StackIdList;
@@ -8014,32 +8012,31 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
         }
         MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
       }
+      std::vector<uint64_t> TotalSizes;
+      // We either have no sizes or NumMIBs of them.
+      assert(I == Record.size() || Record.size() - I == NumMIBs);
+      if (I < Record.size()) {
+        MIBsRead = 0;
+        while (MIBsRead++ < NumMIBs)
+          TotalSizes.push_back(Record[I++]);
+      }
       PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
-      assert(HasTotalSizes != TotalSizes.empty());
-      if (HasTotalSizes) {
+      if (!TotalSizes.empty()) {
         assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
-        PendingAllocs.back().TotalSizes =
-            std::make_unique<std::vector<uint64_t>>(std::move(TotalSizes));
+        PendingAllocs.back().TotalSizes = std::move(TotalSizes);
       }
       break;
     }
 
-    case bitc::FS_COMBINED_ALLOC_INFO:
-    case bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES: {
-      bool HasTotalSizes = BitCode == bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES;
+    case bitc::FS_COMBINED_ALLOC_INFO: {
       unsigned I = 0;
       std::vector<MIBInfo> MIBs;
-      std::vector<uint64_t> TotalSizes;
       unsigned NumMIBs = Record[I++];
       unsigned NumVersions = Record[I++];
       unsigned MIBsRead = 0;
       while (MIBsRead++ < NumMIBs) {
-        assert(Record.size() - I >= (HasTotalSizes ? 3 : 2));
+        assert(Record.size() - I >= 2);
         AllocationType AllocType = (AllocationType)Record[I++];
-        if (HasTotalSizes) {
-          TotalSizes.push_back(Record[I++]);
-          assert(TotalSizes.back());
-        }
         unsigned NumStackEntries = Record[I++];
         assert(Record.size() - I >= NumStackEntries);
         SmallVector<unsigned> StackIdList;
@@ -8054,13 +8051,20 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       SmallVector<uint8_t> Versions;
       for (unsigned J = 0; J < NumVersions; J++)
         Versions.push_back(Record[I++]);
+      std::vector<uint64_t> TotalSizes;
+      // We either have no sizes or NumMIBs of them.
+      assert(I == Record.size() || Record.size() - I == NumMIBs);
+      if (I < Record.size()) {
+        MIBsRead = 0;
+        while (MIBsRead++ < NumMIBs) {
+          TotalSizes.push_back(Record[I++]);
+        }
+      }
       PendingAllocs.push_back(
           AllocInfo(std::move(Versions), std::move(MIBs)));
-      assert(HasTotalSizes != TotalSizes.empty());
-      if (HasTotalSizes) {
+      if (!TotalSizes.empty()) {
         assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
-        PendingAllocs.back().TotalSizes =
-            std::make_unique<std::vector<uint64_t>>(std::move(TotalSizes));
+        PendingAllocs.back().TotalSizes = std::move(TotalSizes);
       }
       break;
     }
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 4463eb48b9811..b3ebe70e8c52f 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -230,8 +230,7 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase {
   void writePerModuleFunctionSummaryRecord(
       SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
       unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev,
-      unsigned CallsiteAbbrev, unsigned AllocAbbrev,
-      unsigned AllocTotalSizeAbbrev, const Function &F);
+      unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F);
   void writeModuleLevelReferences(const GlobalVariable &V,
                                   SmallVector<uint64_t, 64> &NameVals,
                                   unsigned FSModRefsAbbrev,
@@ -4159,7 +4158,7 @@ static void writeTypeIdCompatibleVtableSummaryRecord(
 
 static void writeFunctionHeapProfileRecords(
     BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev,
-    unsigned AllocAbbrev, unsigned AllocTotalSizeAbbrev, bool PerModule,
+    unsigned AllocAbbrev, bool PerModule,
     std::function<unsigned(const ValueInfo &VI)> GetValueID,
     std::function<unsigned(unsigned)> GetStackIndex) {
   SmallVector<uint64_t> Record;
@@ -4190,35 +4189,27 @@ static void writeFunctionHeapProfileRecords(
     // Per module alloc versions should always have a single entry of
     // value 0.
     assert(!PerModule || (AI.Versions.size() == 1 && AI.Versions[0] == 0));
-    if (!PerModule) {
-      Record.push_back(AI.MIBs.size());
+    Record.push_back(AI.MIBs.size());
+    if (!PerModule)
       Record.push_back(AI.Versions.size());
-    }
-    unsigned I = 0;
-    assert(!AI.TotalSizes || AI.TotalSizes->size() == AI.MIBs.size());
     for (auto &MIB : AI.MIBs) {
       Record.push_back((uint8_t)MIB.AllocType);
-      if (AI.TotalSizes) {
-        Record.push_back((*AI.TotalSizes)[I]);
-        assert((*AI.TotalSizes)[I]);
-      }
       Record.push_back(MIB.StackIdIndices.size());
       for (auto Id : MIB.StackIdIndices)
         Record.push_back(GetStackIndex(Id));
-      I++;
     }
     if (!PerModule) {
       for (auto V : AI.Versions)
         Record.push_back(V);
     }
-    unsigned Code =
-        PerModule ? (AI.TotalSizes ? bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES
-                                   : bitc::FS_PERMODULE_ALLOC_INFO)
-                  : (AI.TotalSizes ? bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES
-                                   : bitc::FS_COMBINED_ALLOC_INFO);
-    unsigned AllocAbbrevToUse =
-        AI.TotalSizes ? AllocTotalSizeAbbrev : AllocAbbrev;
-    Stream.EmitRecord(Code, Record, AllocAbbrevToUse);
+    assert(AI.TotalSizes.empty() || AI.TotalSizes.size() == AI.MIBs.size());
+    if (!AI.TotalSizes.empty()) {
+      for (auto Size : AI.TotalSizes)
+        Record.push_back(Size);
+    }
+    Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
+                                : bitc::FS_COMBINED_ALLOC_INFO,
+                      Record, AllocAbbrev);
   }
 }
 
@@ -4227,7 +4218,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
     SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
     unsigned ValueID, unsigned FSCallsRelBFAbbrev,
     unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev,
-    unsigned AllocAbbrev, unsigned AllocTotalSizeAbbrev, const Function &F) {
+    unsigned AllocAbbrev, const Function &F) {
   NameVals.push_back(ValueID);
 
   FunctionSummary *FS = cast<FunctionSummary>(Summary);
@@ -4238,7 +4229,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord(
       });
 
   writeFunctionHeapProfileRecords(
-      Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev,
+      Stream, FS, CallsiteAbbrev, AllocAbbrev,
       /*PerModule*/ true,
       /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); },
       /*GetStackIndex*/ [&](unsigned I) { return I; });
@@ -4445,18 +4436,13 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
 
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO));
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
   // n x (alloc type, numstackids, numstackids x stackidindex)
+  // optional: nummib x total size
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
-  Abbv = std::make_shared<BitCodeAbbrev>();
-  Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES));
-  // n x (alloc type, total size, numstackids, numstackids x stackidindex)
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
-  unsigned AllocTotalSizeAbbrev = Stream.EmitAbbrev(std::move(Abbv));
-
   SmallVector<uint64_t, 64> NameVals;
   // Iterate over the list of functions instead of the Index to
   // ensure the ordering is stable.
@@ -4474,10 +4460,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
       continue;
     }
     auto *Summary = VI.getSummaryList()[0].get();
-    writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F),
-                                        FSCallsRelBFAbbrev,
-                                        FSCallsProfileAbbrev, CallsiteAbbrev,
-                                        AllocAbbrev, AllocTotalSizeAbbrev, F);
+    writePerModuleFunctionSummaryRecord(
+        NameVals, Summary, VE.getValueID(&F), FSCallsRelBFAbbrev,
+        FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, F);
   }
 
   // Capture references from GlobalVariable initializers, which are outside
@@ -4597,6 +4582,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
   // nummib x (alloc type, numstackids, numstackids x stackidindex),
   // numver x version
+  // optional: nummib x total size
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
@@ -4607,16 +4593,6 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     return DecSummaries->count(GVS);
   };
 
-  Abbv = std::make_shared<BitCodeAbbrev>();
-  Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
-  // nummib x (alloc type, total size, numstackids, numstackids x stackidindex),
-  // numver x version
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
-  unsigned AllocTotalSizeAbbrev = Stream.EmitAbbrev(std::move(Abbv));
-
   // The aliases are emitted as a post-pass, and will point to the value
   // id of the aliasee. Save them in a vector for post-processing.
   SmallVector<AliasSummary *, 64> Aliases;
@@ -4704,9 +4680,10 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     getReferencedTypeIds(FS, ReferencedTypeIds);
 
     writeFunctionHeapProfileRecords(
-        Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev,
+        Stream, FS, CallsiteAbbrev, AllocAbbrev,
         /*PerModule*/ false,
-        /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned {
+        /*GetValueId*/
+        [&](const ValueInfo &VI) -> unsigned {
           std::optional<unsigned> ValueID = GetValueId(VI);
           // This can happen in shared index files for distributed ThinLTO if
           // the callee function summary is not included. Record 0 which we
@@ -4716,7 +4693,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
             return 0;
           return *ValueID;
         },
-        /*GetStackIndex*/ [&](unsigned I) {
+        /*GetStackIndex*/
+        [&](unsigned I) {
           // Get the corresponding index into the list of StackIds actually
           // being written for this combined index (which may be a subset in
           // the case of distributed indexes).
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 01138555a8fbc..ef9ddeaaab632 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1766,14 +1766,14 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
               EmptyContext;
           unsigned I = 0;
           assert(!MemProfReportHintedSizes ||
-                 (AN.TotalSizes && AN.TotalSizes->size() == AN.MIBs.size()));
+                 AN.TotalSizes.size() == AN.MIBs.size());
           // Now add all of the MIBs and their stack nodes.
           for (auto &MIB : AN.MIBs) {
             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
                 StackContext(&MIB);
             uint64_t TotalSize = 0;
             if (MemProfReportHintedSizes)
-              TotalSize = (*AN.TotalSizes)[I];
+              TotalSize = AN.TotalSizes[I];
             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
                 AllocNode, StackContext, EmptyContext, MIB.AllocType,
                 TotalSize);
diff --git a/llvm/test/Bitcode/summary_version.ll b/llvm/test/Bitcode/summary_version.ll
index 98feab6fe2f99..26c64f81a773f 100644
--- a/llvm/test/Bitcode/summary_version.ll
+++ b/llvm/test/Bitcode/summary_version.ll
@@ -2,7 +2,7 @@
 ; RUN: opt  -module-summary  %s -o - | llvm-bcanalyzer -dump | FileCheck %s
 
 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK
-; CHECK: <VERSION op0=9/>
+; CHECK: <VERSION op0=10/>
 
 
 
diff --git a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
index 19e228fd5355c..b3f1e770810d2 100644
--- a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
+++ b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll
@@ -11,7 +11,7 @@
 ; RUN: llvm-dis -o - %t.o | llvm-as -o - | llvm-dis -o - | FileCheck %s --check-prefix=DIS
 
 ; CHECK: <GLOBALVAL_SUMMARY_BLOCK
-; CHECK-NEXT:   <VERSION op0=9/>
+; CHECK-NEXT:   <VERSION op0=
 ; CHECK-NEXT:   <FLAGS op0=0/>
 ; The `VALUE_GUID` below represents the "_ZTV4Base" referenced by the instruction
 ; that loads vtable pointers.

>From 3ad6a4ee65c6a76e227f2e38b081cf9f7ff362e0 Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson at google.com>
Date: Thu, 11 Jul 2024 14:01:23 -0700
Subject: [PATCH 4/4] Fix typo in original comment

---
 llvm/include/llvm/Bitcode/LLVMBitCodes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 4e6ef4c1a8c31..184bbe32df695 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -307,7 +307,7 @@ enum GlobalValueSummarySymtabCodes {
   // [valueid, n x stackidindex]
   FS_PERMODULE_CALLSITE_INFO = 26,
   // Summary of per-module allocation memprof metadata.
-  // [nummib, nummib x (alloc type, nummib, nummib x stackidindex),
+  // [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex),
   // [nummib x total size]?]
   FS_PERMODULE_ALLOC_INFO = 27,
   // Summary of combined index memprof callsite metadata.



More information about the llvm-commits mailing list