[llvm] [StaticDataLayout][PGO] Add profile format for static data layout, and the classes to operate on the profiles. (PR #138170)

Snehasish Kumar via llvm-commits llvm-commits at lists.llvm.org
Fri May 9 13:37:30 PDT 2025


================
@@ -0,0 +1,167 @@
+//===- DataAccessProf.h - Data access profile format support ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support to construct and use data access profiles.
+//
+// For the original RFC of this pass please see
+// https://discourse.llvm.org/t/rfc-profile-guided-static-data-partitioning/83744
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PROFILEDATA_DATAACCESSPROF_H_
+#define LLVM_PROFILEDATA_DATAACCESSPROF_H_
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfoVariant.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/StringSaver.h"
+
+#include <cstdint>
+#include <variant>
+
+namespace llvm {
+
+namespace data_access_prof {
+/// The location of a piece of data in the source code, expressed as a
+/// `filename:line` pair.
+struct DataLocation {
+  /// The filename where the data is located. StringRef is a non-owning view,
+  /// so the underlying characters must outlive this object.
+  StringRef FileName;
+  /// The line number in the source code. Zero-initialized so that a
+  /// default-constructed DataLocation never carries an indeterminate value.
+  uint32_t Line = 0;
+};
+
+/// The data access profile for a single symbol.
+struct DataAccessProfRecord {
+  /// Identifies the data symbol. Required.
+  ///
+  /// The meaning depends on `IsStringLiteral`:
+  ///  - false: an index for the symbol name. For most symbolizable static
+  ///    data the mangled symbol name remains stable relative to the source
+  ///    code, and is therefore used to identify symbols across binary
+  ///    releases.
+  ///  - true: the hash of the string content. String literals have unstable
+  ///    name patterns like `.str.N[.llvm.hash]`, so the content hash is used
+  ///    instead.
+  uint64_t SymbolID;
+
+  /// The access count of the symbol. Required.
+  uint64_t AccessCount;
+
+  /// True iff this is a record for a string literal (symbols with name pattern
+  /// `.str.*` in the symbol table). Required.
+  bool IsStringLiteral;
+
+  /// The locations of the data in the source code. Optional.
+  SmallVector<DataLocation, 0> Locations;
+
+  /// Builds a record from the three required fields. `Locations` is not
+  /// touched here and is left default-constructed.
+  DataAccessProfRecord(uint64_t SymbolID, uint64_t AccessCount,
+                       bool IsStringLiteral)
+      : SymbolID(SymbolID), AccessCount(AccessCount),
+        IsStringLiteral(IsStringLiteral) {}
+};
+
+/// Encapsulates the data access profile data and the methods to operate on it.
+/// This class provides profile look-up, serialization and deserialization.
+class DataAccessProfData {
+public:
+  // SymbolID is either a string representing symbol name if the symbol has
+  // stable mangled name relative to source code, or a uint64_t representing the
+  // content hash of a string literal (with unstable name patterns like
+  // `.str.N[.llvm.hash]`). The StringRef is owned by the class's saver object.
+  using SymbolHandle = std::variant<StringRef, uint64_t>;
+  using StringToIndexMap = llvm::MapVector<StringRef, uint64_t>;
+
+  DataAccessProfData() : Saver(Allocator) {}
+
+  /// Serialize profile data to the output stream.
+  /// Storage layout:
+  /// - Serialized strings.
+  /// - The encoded hashes.
+  /// - Records.
+  Error serialize(ProfOStream &OS) const;
+
+  /// Deserialize this class from the given buffer.
+  Error deserialize(const unsigned char *&Ptr);
+
+  /// Returns a pointer to the profile record for \p SymID, or nullptr if there
+  /// isn't a record. Internally, this function will canonicalize the symbol
+  /// name before the lookup.
+  const DataAccessProfRecord *getProfileRecord(const SymbolHandle SymID) const;
+
+  /// Returns true if \p SymID is seen in profiled binaries and cold.
+  bool isKnownColdSymbol(const SymbolHandle SymID) const;
+
+  /// Methods to set symbolized data access profile. Returns error if duplicated
+  /// symbol names or content hashes are seen. The user of this class should
+  /// aggregate counters that correspond to the same symbol name or with the
+  /// same string literal hash before calling 'set*' methods.
+  Error setDataAccessProfile(SymbolHandle SymbolID, uint64_t AccessCount);
+  /// Similar to the method above, for records with \p Locations representing
+  /// the `filename:line` where this symbol shows up. Note because of linker's
+  /// merge of identical symbols (e.g., unnamed_addr string literals), one
+  /// symbol is likely to have multiple locations.
+  Error setDataAccessProfile(SymbolHandle SymbolID, uint64_t AccessCount,
+                             ArrayRef<DataLocation> Locations);
----------------
snehasish wrote:

In actual usage (outside of the existing tests), will we need to accept an `ArrayRef<DataLocation>`, or can it be something like `setDataAccessProfile(SymbolHandle, AccessCount, ArrayRef<DILocation>)`?

https://github.com/llvm/llvm-project/pull/138170


More information about the llvm-commits mailing list