[llvm] [MemProf] Add option to emit full call context for matched allocations (PR #170516)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 3 09:35:23 PST 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-pgo

Author: Teresa Johnson (teresajohnson)

<details>
<summary>Changes</summary>

Add the -memprof-print-matched-alloc-stack option, which emits the full
allocation call context (as a list of stack ids) for each matched allocation
reported by -memprof-print-match-info. The new option is a no-op unless
-memprof-print-match-info is also enabled.


---
Full diff: https://github.com/llvm/llvm-project/pull/170516.diff


2 Files Affected:

- (modified) llvm/lib/Transforms/Instrumentation/MemProfUse.cpp (+64-28) 
- (modified) llvm/test/Transforms/PGOProfile/memprof-dump-matched-alloc-site.ll (+13-4) 


``````````diff
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index b72d41a748857..bda1d4555af2d 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -62,6 +62,12 @@ static cl::opt<bool>
                                      "context in this module's profiles"),
                             cl::Hidden, cl::init(false));
 
+static cl::opt<bool> PrintMatchedAllocStack(
+    "memprof-print-matched-alloc-stack",
+    cl::desc("Print full stack context for matched "
+             "allocations with -memprof-print-match-info."),
+    cl::Hidden, cl::init(false));
+
 static cl::opt<bool>
     SalvageStaleProfile("memprof-salvage-stale-profile",
                         cl::desc("Salvage stale MemProf profile"),
@@ -222,9 +228,26 @@ static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
                     << Reason << ".\n");
 }
 
+// Structure for tracking info about matched allocation contexts for use with
+// -memprof-print-match-info and -memprof-print-matched-alloc-stack.
 struct AllocMatchInfo {
+  // Total size in bytes of matched context.
   uint64_t TotalSize = 0;
+  // Matched allocation's type.
   AllocationType AllocType = AllocationType::None;
+  // Number of frames matched to the allocation itself (values will be >1 in
+  // cases where allocation was already inlined). Use a set because there can
+  // be multiple inlined instances and each may have a different inline depth.
+  // Use std::set to iterate in sorted order when printing.
+  std::set<unsigned> MatchedFramesSet;
+  // The full call stack of the allocation, for cases where requested via
+  // -memprof-print-matched-alloc-stack.
+  std::vector<Frame> CallStack;
+
+  // Caller responsible for inserting the matched frames and the call stack when
+  // appropriate.
+  AllocMatchInfo(uint64_t TotalSize, AllocationType AllocType)
+      : TotalSize(TotalSize), AllocType(AllocType) {}
 };
 
 DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>>
@@ -402,13 +425,11 @@ static void addVPMetadata(Module &M, Instruction &I,
   }
 }
 
-static void
-handleAllocSite(Instruction &I, CallBase *CI,
-                ArrayRef<uint64_t> InlinedCallStack, LLVMContext &Ctx,
-                OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize,
-                const std::set<const AllocationInfo *> &AllocInfoSet,
-                std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
-                    &FullStackIdToAllocMatchInfo) {
+static void handleAllocSite(
+    Instruction &I, CallBase *CI, ArrayRef<uint64_t> InlinedCallStack,
+    LLVMContext &Ctx, OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize,
+    const std::set<const AllocationInfo *> &AllocInfoSet,
+    std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo) {
   // TODO: Remove this once the profile creation logic deduplicates contexts
   // that are the same other than the IsInlineFrame bool. Until then, keep the
   // largest.
@@ -450,9 +471,15 @@ handleAllocSite(Instruction &I, CallBase *CI,
       // was requested.
       if (ClPrintMemProfMatchInfo) {
         assert(FullStackId != 0);
-        FullStackIdToAllocMatchInfo[std::make_pair(FullStackId,
-                                                   InlinedCallStack.size())] = {
-            AllocInfo->Info.getTotalSize(), AllocType};
+        auto [Iter, Inserted] = FullStackIdToAllocMatchInfo.try_emplace(
+            FullStackId,
+            AllocMatchInfo(AllocInfo->Info.getTotalSize(), AllocType));
+        // Always insert the new matched frame count, since it may differ.
+        Iter->second.MatchedFramesSet.insert(InlinedCallStack.size());
+        if (Inserted && PrintMatchedAllocStack)
+          Iter->second.CallStack.insert(Iter->second.CallStack.begin(),
+                                        AllocInfo->CallStack.begin(),
+                                        AllocInfo->CallStack.end());
       }
     }
   }
@@ -544,14 +571,13 @@ static void handleCallSite(
   }
 }
 
-static void readMemprof(Module &M, Function &F,
-                        IndexedInstrProfReader *MemProfReader,
-                        const TargetLibraryInfo &TLI,
-                        std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
-                            &FullStackIdToAllocMatchInfo,
-                        std::set<std::vector<uint64_t>> &MatchedCallSites,
-                        DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
-                        OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize) {
+static void
+readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
+            const TargetLibraryInfo &TLI,
+            std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
+            std::set<std::vector<uint64_t>> &MatchedCallSites,
+            DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
+            OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize) {
   auto &Ctx = M.getContext();
   // Previously we used getIRPGOFuncName() here. If F is local linkage,
   // getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But
@@ -777,11 +803,11 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
   if (SalvageStaleProfile)
     UndriftMaps = computeUndriftMap(M, MemProfReader.get(), TLI);
 
-  // Map from the stack hash and matched frame count of each allocation context
-  // in the function profiles to the total profiled size (bytes) and allocation
-  // type.
-  std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
-      FullStackIdToAllocMatchInfo;
+  // Map from the stack hash of each matched allocation context in the function
+  // profiles to match info such as the total profiled size (bytes), allocation
+  // type, number of frames matched to the allocation itself, and the full array
+  // of call stack ids.
+  std::map<uint64_t, AllocMatchInfo> FullStackIdToAllocMatchInfo;
 
   // Set of the matched call sites, each expressed as a sequence of an inline
   // call stack.
@@ -802,11 +828,21 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
   }
 
   if (ClPrintMemProfMatchInfo) {
-    for (const auto &[IdLengthPair, Info] : FullStackIdToAllocMatchInfo) {
-      auto [Id, Length] = IdLengthPair;
-      errs() << "MemProf " << getAllocTypeAttributeString(Info.AllocType)
-             << " context with id " << Id << " has total profiled size "
-             << Info.TotalSize << " is matched with " << Length << " frames\n";
+    for (const auto &[Id, Info] : FullStackIdToAllocMatchInfo) {
+      for (auto Frames : Info.MatchedFramesSet) {
+        // TODO: To reduce verbosity, should we change the existing message
+        // so that we emit a list of matched frame counts in a single message
+        // about the context (instead of one message per frame count)?
+        errs() << "MemProf " << getAllocTypeAttributeString(Info.AllocType)
+               << " context with id " << Id << " has total profiled size "
+               << Info.TotalSize << " is matched with " << Frames << " frames";
+        if (PrintMatchedAllocStack) {
+          errs() << " and call stack";
+          for (auto &F : Info.CallStack)
+            errs() << " " << computeStackId(F);
+        }
+        errs() << "\n";
+      }
     }
 
     for (const auto &CallStack : MatchedCallSites) {
diff --git a/llvm/test/Transforms/PGOProfile/memprof-dump-matched-alloc-site.ll b/llvm/test/Transforms/PGOProfile/memprof-dump-matched-alloc-site.ll
index 2dcaa9d492869..2eec875d16488 100644
--- a/llvm/test/Transforms/PGOProfile/memprof-dump-matched-alloc-site.ll
+++ b/llvm/test/Transforms/PGOProfile/memprof-dump-matched-alloc-site.ll
@@ -26,7 +26,13 @@
 ; REQUIRES: x86_64-linux
 ; RUN: split-file %s %t
 ; RUN: llvm-profdata merge %t/memprof-dump-matched-alloc-site.yaml -o %t/memprof-dump-matched-alloc-site.memprofdata
-; RUN: opt < %t/memprof-dump-matched-alloc-site.ll -passes='memprof-use<profile-filename=%t/memprof-dump-matched-alloc-site.memprofdata>' -memprof-print-match-info -S 2>&1 | FileCheck %s
+; RUN: opt < %t/memprof-dump-matched-alloc-site.ll -passes='memprof-use<profile-filename=%t/memprof-dump-matched-alloc-site.memprofdata>' -memprof-print-match-info -S 2>&1 | FileCheck %s --check-prefix=MATCH
+;; Test that -memprof-print-matched-alloc-stack enables reporting of the full
+;; matched stack.
+; RUN: opt < %t/memprof-dump-matched-alloc-site.ll -passes='memprof-use<profile-filename=%t/memprof-dump-matched-alloc-site.memprofdata>' -memprof-print-match-info -memprof-print-matched-alloc-stack -S 2>&1 | FileCheck %s --check-prefixes=MATCH,STACK
+;; Test that -memprof-print-matched-alloc-stack without -memprof-print-match-info
+;; is a noop.
+; RUN: opt < %t/memprof-dump-matched-alloc-site.ll -passes='memprof-use<profile-filename=%t/memprof-dump-matched-alloc-site.memprofdata>' -memprof-print-matched-alloc-stack -S 2>&1 | FileCheck %s --implicit-check-not="context with id" --implicit-check-not="and call stack"
 
 ;--- memprof-dump-matched-alloc-site.yaml
 ---
@@ -77,9 +83,12 @@ HeapProfileRecords:
       # Kept empty here because this section is irrelevant for this test.
 ...
 ;--- memprof-dump-matched-alloc-site.ll
-; CHECK: MemProf notcold context with id 5736731103568718490 has total profiled size 3 is matched with 1 frames
-; CHECK: MemProf notcold context with id 5736731103568718490 has total profiled size 3 is matched with 2 frames
-; CHECK: MemProf notcold context with id 5736731103568718490 has total profiled size 3 is matched with 3 frames
+; MATCH: MemProf notcold context with id 5736731103568718490 has total profiled size 3 is matched with 1 frames
+; STACK-SAME: and call stack 16675831946704128299 1244320836757332728 8373967866436022208 5401059281181789382
+; MATCH: MemProf notcold context with id 5736731103568718490 has total profiled size 3 is matched with 2 frames
+; STACK-SAME: and call stack 16675831946704128299 1244320836757332728 8373967866436022208 5401059281181789382
+; MATCH: MemProf notcold context with id 5736731103568718490 has total profiled size 3 is matched with 3 frames
+; STACK-SAME: and call stack 16675831946704128299 1244320836757332728 8373967866436022208 5401059281181789382
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"

``````````

</details>


https://github.com/llvm/llvm-project/pull/170516


More information about the llvm-commits mailing list