[llvm] [memprof] Add YAML-based deserialization for MemProf profile (PR #117829)

Kazu Hirata via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 26 21:48:49 PST 2024


https://github.com/kazutakahirata updated https://github.com/llvm/llvm-project/pull/117829

From 1ee8dff20f26e5a39eb930d878675994a001f87b Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Fri, 19 Apr 2024 07:01:41 -0700
Subject: [PATCH 1/2] [memprof] Add YAML-based deserialization for MemProf
 profile

This patch adds YAML-based deserialization for the MemProf profile.

It's been painful to write tests for MemProf passes because we do not
have a text format for the MemProf profile.  We would write a test
case in C++, compile and run it to produce a binary MemProf profile,
and then finally run a test written in LLVM IR against that binary
profile.

This patch paves the way toward a YAML-based MemProf profile.
Specifically, it adds a new class, YAMLMemProfReader, derived from
MemProfReader.  For now, it only adds a function to parse a StringRef
pointing to YAML data.  Subsequent patches will wire it up to
llvm-profdata and read from a file.

The field names are based on the various printYAML functions in
MemProf.h.  I'm not aiming for compatibility with the format printYAML
produces, but I don't see a point in inventing different field names.
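
For concreteness, here is what a profile looks like in this format,
trimmed from the unit test added below (one heap profile record with a
single allocation site and a single call site):

HeapProfileRecords:
- GUID: 0xdeadbeef12345678
  AllocSites:
  - Callstack:
    - {Function: 0x100, LineOffset: 11, Column: 10, Inline: true}
    - {Function: 0x200, LineOffset: 22, Column: 20, Inline: false}
    MemInfoBlock:
      AllocCount: 777
      TotalSize: 888
  CallSites:
  - - {Function: 0x500, LineOffset: 55, Column: 50, Inline: true}
    - {Function: 0x600, LineOffset: 66, Column: 60, Inline: false}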
---
 llvm/include/llvm/ProfileData/MemProf.h       |  22 ++++
 llvm/include/llvm/ProfileData/MemProfReader.h |   6 +
 llvm/lib/ProfileData/MemProfReader.cpp        | 106 ++++++++++++++++++
 llvm/unittests/ProfileData/MemProfTest.cpp    |  77 +++++++++++++
 4 files changed, 211 insertions(+)

diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index d64addcfe3ed06..3b631fad4029e7 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -19,6 +19,10 @@
 #include <optional>
 
 namespace llvm {
+namespace yaml {
+template <typename T> struct CustomMappingTraits;
+} // namespace yaml
+
 namespace memprof {
 
 struct MemProfRecord;
@@ -193,6 +197,9 @@ struct PortableMemInfoBlock {
     return Result;
   }
 
+  // Give YAML access to the individual MIB fields.
+  friend struct yaml::CustomMappingTraits<memprof::PortableMemInfoBlock>;
+
 private:
   // The set of available fields, indexed by Meta::Name.
   std::bitset<llvm::to_underlying(Meta::Size)> Schema;
@@ -362,6 +369,8 @@ struct IndexedAllocationInfo {
   IndexedAllocationInfo(CallStackId CSId, const MemInfoBlock &MB,
                         const MemProfSchema &Schema = getFullSchema())
       : CSId(CSId), Info(MB, Schema) {}
+  IndexedAllocationInfo(CallStackId CSId, const PortableMemInfoBlock &MB)
+      : CSId(CSId), Info(MB) {}
 
   // Returns the size in bytes when this allocation info struct is serialized.
   size_t serializedSize(const MemProfSchema &Schema,
@@ -498,6 +507,19 @@ struct MemProfRecord {
   }
 };
 
+// Helper struct for AllMemProfData.  In YAML, we treat the GUID and the fields
+// within MemProfRecord at the same level as if the GUID were part of
+// MemProfRecord.
+struct GUIDMemProfRecordPair {
+  GlobalValue::GUID GUID;
+  MemProfRecord Record;
+};
+
+// The top-level data structure, only used with YAML for now.
+struct AllMemProfData {
+  std::vector<GUIDMemProfRecordPair> HeapProfileRecords;
+};
+
 // Reads a memprof schema from a buffer. All entries in the buffer are
 // interpreted as uint64_t. The first entry in the buffer denotes the number of
 // ids in the schema. Subsequent entries are integers which map to memprof::Meta
diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h
index bf2f91548fac07..0529f794606465 100644
--- a/llvm/include/llvm/ProfileData/MemProfReader.h
+++ b/llvm/include/llvm/ProfileData/MemProfReader.h
@@ -209,6 +209,12 @@ class RawMemProfReader final : public MemProfReader {
   // A mapping of the hash to symbol name, only used if KeepSymbolName is true.
   llvm::DenseMap<uint64_t, std::string> GuidToSymbolName;
 };
+
+class YAMLMemProfReader final : public MemProfReader {
+public:
+  YAMLMemProfReader() = default;
+  void parse(StringRef YAMLData);
+};
 } // namespace memprof
 } // namespace llvm
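
With this interface in place, usage mirrors the unit test below; a
minimal sketch, assuming the YAML document is already in memory
(takeMemProfData is inherited from the MemProfReader base class):

  llvm::memprof::YAMLMemProfReader Reader;
  Reader.parse(YAMLData); // YAMLData is a llvm::StringRef
  llvm::memprof::IndexedMemProfData Data = Reader.takeMemProfData();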
 
diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index 921ff3429fed74..e0b8207020dced 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -40,6 +40,71 @@
 #include "llvm/Support/Path.h"
 
 #define DEBUG_TYPE "memprof"
+
+namespace llvm {
+namespace yaml {
+template <> struct MappingTraits<memprof::Frame> {
+  static void mapping(IO &Io, memprof::Frame &F) {
+    Io.mapRequired("Function", F.Function);
+    Io.mapRequired("LineOffset", F.LineOffset);
+    Io.mapRequired("Column", F.Column);
+    Io.mapRequired("Inline", F.IsInlineFrame);
+  }
+};
+
+template <> struct CustomMappingTraits<memprof::PortableMemInfoBlock> {
+  static void inputOne(IO &Io, StringRef KeyStr,
+                       memprof::PortableMemInfoBlock &MIB) {
+    // PortableMemInfoBlock keeps track of the set of fields that actually have
+    // values.  We update the set here as we receive a key-value pair from the
+    // YAML document.
+#define MIBEntryDef(NameTag, Name, Type)                                       \
+  if (KeyStr == #Name) {                                                       \
+    Io.mapRequired(KeyStr.str().c_str(), MIB.Name);                            \
+    MIB.Schema.set(llvm::to_underlying(memprof::Meta::Name));                  \
+    return;                                                                    \
+  }
+#include "llvm/ProfileData/MIBEntryDef.inc"
+#undef MIBEntryDef
+    Io.setError("Key is not a valid MIB field name");
+  }
+
+  static void output(IO &Io, memprof::PortableMemInfoBlock &MIB) {
+    llvm_unreachable("To be implemented");
+  }
+};
+
+template <> struct MappingTraits<memprof::AllocationInfo> {
+  static void mapping(IO &Io, memprof::AllocationInfo &AI) {
+    Io.mapRequired("Callstack", AI.CallStack);
+    Io.mapRequired("MemInfoBlock", AI.Info);
+  }
+};
+
+// In YAML, we use GUIDMemProfRecordPair instead of MemProfRecord so that we can
+// treat the GUID and the fields within MemProfRecord at the same level as if
+// the GUID were part of MemProfRecord.
+template <> struct MappingTraits<memprof::GUIDMemProfRecordPair> {
+  static void mapping(IO &Io, memprof::GUIDMemProfRecordPair &Pair) {
+    Io.mapRequired("GUID", Pair.GUID);
+    Io.mapRequired("AllocSites", Pair.Record.AllocSites);
+    Io.mapRequired("CallSites", Pair.Record.CallSites);
+  }
+};
+
+template <> struct MappingTraits<memprof::AllMemProfData> {
+  static void mapping(IO &Io, memprof::AllMemProfData &Data) {
+    Io.mapRequired("HeapProfileRecords", Data.HeapProfileRecords);
+  }
+};
+} // namespace yaml
+} // namespace llvm
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(memprof::Frame)
+LLVM_YAML_IS_SEQUENCE_VECTOR(std::vector<memprof::Frame>)
+LLVM_YAML_IS_SEQUENCE_VECTOR(memprof::AllocationInfo)
+LLVM_YAML_IS_SEQUENCE_VECTOR(memprof::GUIDMemProfRecordPair)
+
 namespace llvm {
 namespace memprof {
 namespace {
@@ -756,5 +821,46 @@ Error RawMemProfReader::readNextRecord(
   };
   return MemProfReader::readNextRecord(GuidRecord, IdToFrameCallback);
 }
+
+void YAMLMemProfReader::parse(StringRef YAMLData) {
+  memprof::AllMemProfData Doc;
+  yaml::Input Yin(YAMLData);
+
+  Yin >> Doc;
+  if (Yin.error())
+    return;
+
+  // Add a call stack to MemProfData.CallStacks and return its CallStackId.
+  auto AddCallStack = [&](ArrayRef<Frame> CallStack) -> CallStackId {
+    SmallVector<FrameId> IndexedCallStack;
+    IndexedCallStack.reserve(CallStack.size());
+    for (const Frame &F : CallStack) {
+      FrameId Id = F.hash();
+      MemProfData.Frames.try_emplace(Id, F);
+      IndexedCallStack.push_back(Id);
+    }
+    CallStackId CSId = hashCallStack(IndexedCallStack);
+    MemProfData.CallStacks.try_emplace(CSId, std::move(IndexedCallStack));
+    return CSId;
+  };
+
+  for (const auto &[GUID, Record] : Doc.HeapProfileRecords) {
+    IndexedMemProfRecord IndexedRecord;
+
+    // Convert AllocationInfo to IndexedAllocationInfo.
+    for (const AllocationInfo &AI : Record.AllocSites) {
+      CallStackId CSId = AddCallStack(AI.CallStack);
+      IndexedRecord.AllocSites.emplace_back(CSId, AI.Info);
+    }
+
+    // Populate CallSiteIds.
+    for (const auto &CallSite : Record.CallSites) {
+      CallStackId CSId = AddCallStack(CallSite);
+      IndexedRecord.CallSiteIds.push_back(CSId);
+    }
+
+    MemProfData.Records.try_emplace(GUID, std::move(IndexedRecord));
+  }
+}
 } // namespace memprof
 } // namespace llvm
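
The conversion in parse is content-addressed: a frame is keyed by
Frame::hash() and a call stack by hashCallStack(), so the YAML document
never has to spell out FrameIds or CallStackIds; they are recomputed
during deserialization.  A minimal sketch of that mapping, using only
APIs that already exist in MemProf.h:

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ProfileData/MemProf.h"

  void internExample() {
    using namespace llvm::memprof;
    // Frame::hash() yields the FrameId keying MemProfData.Frames.
    Frame F(/*Function=*/0x100, /*LineOffset=*/11, /*Column=*/10,
            /*IsInlineFrame=*/true);
    FrameId Id = F.hash();
    // hashCallStack() yields the CallStackId keying
    // MemProfData.CallStacks.
    llvm::SmallVector<FrameId> CS = {Id};
    CallStackId CSId = hashCallStack(CS);
    (void)CSId;
  }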
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index b3b6249342d630..53a4c8a09472ae 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -742,4 +742,81 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
   EXPECT_THAT(Mappings, testing::Contains(testing::Pair(
                             llvm::memprof::hashCallStack(CS4), 10U)));
 }
+
+// Verify that we can parse YAML and retrieve IndexedMemProfData as expected.
+TEST(MemProf, YAMLParser) {
+  StringRef YAMLData = R"YAML(
+---
+HeapProfileRecords:
+- GUID: 0xdeadbeef12345678
+  AllocSites:
+  - Callstack:
+    - {Function: 0x100, LineOffset: 11, Column: 10, Inline: true}
+    - {Function: 0x200, LineOffset: 22, Column: 20, Inline: false}
+    MemInfoBlock:
+      AllocCount: 777
+      TotalSize: 888
+  - Callstack:
+    - {Function: 0x300, LineOffset: 33, Column: 30, Inline: false}
+    - {Function: 0x400, LineOffset: 44, Column: 40, Inline: true}
+    MemInfoBlock:
+      AllocCount: 666
+      TotalSize: 555
+  CallSites:
+  - - {Function: 0x500, LineOffset: 55, Column: 50, Inline: true}
+    - {Function: 0x600, LineOffset: 66, Column: 60, Inline: false}
+  - - {Function: 0x700, LineOffset: 77, Column: 70, Inline: true}
+    - {Function: 0x800, LineOffset: 88, Column: 80, Inline: false}
+)YAML";
+
+  llvm::memprof::YAMLMemProfReader YAMLReader;
+  YAMLReader.parse(YAMLData);
+  llvm::memprof::IndexedMemProfData MemProfData = YAMLReader.takeMemProfData();
+
+  Frame F1(0x100, 11, 10, true);
+  Frame F2(0x200, 22, 20, false);
+  Frame F3(0x300, 33, 30, false);
+  Frame F4(0x400, 44, 40, true);
+  Frame F5(0x500, 55, 50, true);
+  Frame F6(0x600, 66, 60, false);
+  Frame F7(0x700, 77, 70, true);
+  Frame F8(0x800, 88, 80, false);
+
+  llvm::SmallVector<FrameId> CS1 = {F1.hash(), F2.hash()};
+  llvm::SmallVector<FrameId> CS2 = {F3.hash(), F4.hash()};
+  llvm::SmallVector<FrameId> CS3 = {F5.hash(), F6.hash()};
+  llvm::SmallVector<FrameId> CS4 = {F7.hash(), F8.hash()};
+
+  // Verify the entire contents of MemProfData.Frames.
+  EXPECT_THAT(
+      MemProfData.Frames,
+      ::testing::UnorderedElementsAre(
+          ::testing::Pair(F1.hash(), F1), ::testing::Pair(F2.hash(), F2),
+          ::testing::Pair(F3.hash(), F3), ::testing::Pair(F4.hash(), F4),
+          ::testing::Pair(F5.hash(), F5), ::testing::Pair(F6.hash(), F6),
+          ::testing::Pair(F7.hash(), F7), ::testing::Pair(F8.hash(), F8)));
+
+  // Verify the entire contents of MemProfData.CallStacks.
+  EXPECT_THAT(MemProfData.CallStacks,
+              ::testing::UnorderedElementsAre(
+                  ::testing::Pair(llvm::memprof::hashCallStack(CS1), CS1),
+                  ::testing::Pair(llvm::memprof::hashCallStack(CS2), CS2),
+                  ::testing::Pair(llvm::memprof::hashCallStack(CS3), CS3),
+                  ::testing::Pair(llvm::memprof::hashCallStack(CS4), CS4)));
+
+  // Verify the entire contents of MemProfData.Records.
+  ASSERT_THAT(MemProfData.Records, SizeIs(1));
+  const auto &[GUID, Record] = *MemProfData.Records.begin();
+  EXPECT_EQ(GUID, 0xdeadbeef12345678ULL);
+  ASSERT_THAT(Record.AllocSites, SizeIs(2));
+  EXPECT_EQ(Record.AllocSites[0].CSId, llvm::memprof::hashCallStack(CS1));
+  EXPECT_EQ(Record.AllocSites[0].Info.getAllocCount(), 777U);
+  EXPECT_EQ(Record.AllocSites[0].Info.getTotalSize(), 888U);
+  EXPECT_EQ(Record.AllocSites[1].CSId, llvm::memprof::hashCallStack(CS2));
+  EXPECT_EQ(Record.AllocSites[1].Info.getAllocCount(), 666U);
+  EXPECT_EQ(Record.AllocSites[1].Info.getTotalSize(), 555U);
+  EXPECT_THAT(Record.CallSiteIds,
+              ::testing::ElementsAre(llvm::memprof::hashCallStack(CS3),
+                                     llvm::memprof::hashCallStack(CS4)));
+}
 } // namespace

From 46aedcef7db8acc2abfeff3adf5613dd121b603b Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Tue, 26 Nov 2024 19:22:28 -0800
Subject: [PATCH 2/2] Address comments.

---
 llvm/lib/ProfileData/MemProfReader.cpp     | 14 ++++++++++
 llvm/unittests/ProfileData/MemProfTest.cpp | 32 +++++++++++-----------
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp
index e0b8207020dced..16729c8652549b 100644
--- a/llvm/lib/ProfileData/MemProfReader.cpp
+++ b/llvm/lib/ProfileData/MemProfReader.cpp
@@ -49,6 +49,20 @@ template <> struct MappingTraits<memprof::Frame> {
     Io.mapRequired("LineOffset", F.LineOffset);
     Io.mapRequired("Column", F.Column);
     Io.mapRequired("Inline", F.IsInlineFrame);
+
+    // Assert that the definition of Frame matches what we expect.  The
+    // structured bindings below detect changes to the number of fields.
+    // static_assert checks the type of each field.
+    const auto &[Function, SymbolName, LineOffset, Column, IsInlineFrame] = F;
+    static_assert(
+        std::is_same_v<remove_cvref_t<decltype(Function)>, GlobalValue::GUID>);
+    static_assert(std::is_same_v<remove_cvref_t<decltype(SymbolName)>,
+                                 std::unique_ptr<std::string>>);
+    static_assert(
+        std::is_same_v<remove_cvref_t<decltype(LineOffset)>, uint32_t>);
+    static_assert(std::is_same_v<remove_cvref_t<decltype(Column)>, uint32_t>);
+    static_assert(
+        std::is_same_v<remove_cvref_t<decltype(IsInlineFrame)>, bool>);
   }
 };
 
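The structured-binding trick above is a general pattern for keeping
serialization code in sync with a struct definition: the binding stops
compiling when a field is added or removed, and the static_asserts pin
down each field's type.  A standalone sketch with a hypothetical Point
struct (not from the patch; the patch's remove_cvref_t is LLVM's
equivalent of the C++20 std::remove_cvref_t used here):

  #include <cstdint>
  #include <type_traits>

  struct Point {
    int32_t X;
    int32_t Y;
  };

  void checkPointLayout(const Point &P) {
    // Fails to compile if Point gains or loses a field...
    const auto &[X, Y] = P;
    // ...and these fail if a field's type changes.
    static_assert(std::is_same_v<std::remove_cvref_t<decltype(X)>, int32_t>);
    static_assert(std::is_same_v<std::remove_cvref_t<decltype(Y)>, int32_t>);
  }
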
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 53a4c8a09472ae..5ab860e025b8ce 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -34,6 +34,7 @@ using ::llvm::memprof::CallStackId;
 using ::llvm::memprof::CallStackMap;
 using ::llvm::memprof::Frame;
 using ::llvm::memprof::FrameId;
+using ::llvm::memprof::hashCallStack;
 using ::llvm::memprof::IndexedAllocationInfo;
 using ::llvm::memprof::IndexedMemProfRecord;
 using ::llvm::memprof::MemInfoBlock;
@@ -46,8 +47,11 @@ using ::llvm::memprof::RawMemProfReader;
 using ::llvm::memprof::SegmentEntry;
 using ::llvm::object::SectionedAddress;
 using ::llvm::symbolize::SymbolizableModule;
+using ::testing::ElementsAre;
+using ::testing::Pair;
 using ::testing::Return;
 using ::testing::SizeIs;
+using ::testing::UnorderedElementsAre;
 
 class MockSymbolizer : public SymbolizableModule {
 public:
@@ -788,35 +792,31 @@ TEST(MemProf, YAMLParser) {
   llvm::SmallVector<FrameId> CS4 = {F7.hash(), F8.hash()};
 
   // Verify the entire contents of MemProfData.Frames.
-  EXPECT_THAT(
-      MemProfData.Frames,
-      ::testing::UnorderedElementsAre(
-          ::testing::Pair(F1.hash(), F1), ::testing::Pair(F2.hash(), F2),
-          ::testing::Pair(F3.hash(), F3), ::testing::Pair(F4.hash(), F4),
-          ::testing::Pair(F5.hash(), F5), ::testing::Pair(F6.hash(), F6),
-          ::testing::Pair(F7.hash(), F7), ::testing::Pair(F8.hash(), F8)));
+  EXPECT_THAT(MemProfData.Frames,
+              UnorderedElementsAre(Pair(F1.hash(), F1), Pair(F2.hash(), F2),
+                                   Pair(F3.hash(), F3), Pair(F4.hash(), F4),
+                                   Pair(F5.hash(), F5), Pair(F6.hash(), F6),
+                                   Pair(F7.hash(), F7), Pair(F8.hash(), F8)));
 
   // Verify the entire contents of MemProfData.CallStacks.
   EXPECT_THAT(MemProfData.CallStacks,
-              ::testing::UnorderedElementsAre(
-                  ::testing::Pair(llvm::memprof::hashCallStack(CS1), CS1),
-                  ::testing::Pair(llvm::memprof::hashCallStack(CS2), CS2),
-                  ::testing::Pair(llvm::memprof::hashCallStack(CS3), CS3),
-                  ::testing::Pair(llvm::memprof::hashCallStack(CS4), CS4)));
+              UnorderedElementsAre(Pair(hashCallStack(CS1), CS1),
+                                   Pair(hashCallStack(CS2), CS2),
+                                   Pair(hashCallStack(CS3), CS3),
+                                   Pair(hashCallStack(CS4), CS4)));
 
   // Verify the entire contents of MemProfData.Records.
   ASSERT_THAT(MemProfData.Records, SizeIs(1));
   const auto &[GUID, Record] = *MemProfData.Records.begin();
   EXPECT_EQ(GUID, 0xdeadbeef12345678ULL);
   ASSERT_THAT(Record.AllocSites, SizeIs(2));
-  EXPECT_EQ(Record.AllocSites[0].CSId, llvm::memprof::hashCallStack(CS1));
+  EXPECT_EQ(Record.AllocSites[0].CSId, hashCallStack(CS1));
   EXPECT_EQ(Record.AllocSites[0].Info.getAllocCount(), 777U);
   EXPECT_EQ(Record.AllocSites[0].Info.getTotalSize(), 888U);
-  EXPECT_EQ(Record.AllocSites[1].CSId, llvm::memprof::hashCallStack(CS2));
+  EXPECT_EQ(Record.AllocSites[1].CSId, hashCallStack(CS2));
   EXPECT_EQ(Record.AllocSites[1].Info.getAllocCount(), 666U);
   EXPECT_EQ(Record.AllocSites[1].Info.getTotalSize(), 555U);
   EXPECT_THAT(Record.CallSiteIds,
-              ::testing::ElementsAre(llvm::memprof::hashCallStack(CS3),
-                                     llvm::memprof::hashCallStack(CS4)));
+              ElementsAre(hashCallStack(CS3), hashCallStack(CS4)));
 }
 } // namespace


