[llvm] [GSYM] Callsites: Add data format support and loading from YAML (PR #109781)

via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 27 12:08:35 PDT 2024


================
@@ -0,0 +1,275 @@
+//===- CallSiteInfo.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/GSYM/CallSiteInfo.h"
+#include "llvm/ADT/CachedHashString.h"
+#include "llvm/DebugInfo/GSYM/FileWriter.h"
+#include "llvm/DebugInfo/GSYM/FunctionInfo.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include <fstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+using namespace llvm;
+using namespace gsym;
+
+llvm::Error CallSiteInfo::encode(FileWriter &O) const {
+  O.writeU64(ReturnAddress);
+  O.writeU8(Flags);
+  O.writeU32(MatchRegex.size());
+  for (uint32_t Entry : MatchRegex)
+    O.writeU32(Entry);
+  return llvm::Error::success();
+}
+
+llvm::Expected<CallSiteInfo>
+CallSiteInfo::decode(DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr) {
+  CallSiteInfo CSI;
+
+  // Read ReturnAddress
+  if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(uint64_t)))
+    return createStringError(std::errc::io_error,
+                             "0x%8.8" PRIx64 ": missing ReturnAddress", Offset);
+  CSI.ReturnAddress = Data.getU64(&Offset);
+
+  // Read Flags
+  if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(uint8_t)))
+    return createStringError(std::errc::io_error,
+                             "0x%8.8" PRIx64 ": missing Flags", Offset);
+  CSI.Flags = Data.getU8(&Offset);
+
+  // Read number of MatchRegex entries
+  if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(uint32_t)))
+    return createStringError(std::errc::io_error,
+                             "0x%8.8" PRIx64 ": missing MatchRegex count",
+                             Offset);
+  uint32_t NumEntries = Data.getU32(&Offset);
+
+  CSI.MatchRegex.reserve(NumEntries);
+  for (uint32_t i = 0; i < NumEntries; ++i) {
+    if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(uint32_t)))
+      return createStringError(std::errc::io_error,
+                               "0x%8.8" PRIx64 ": missing MatchRegex entry",
+                               Offset);
+    uint32_t Entry = Data.getU32(&Offset);
+    CSI.MatchRegex.push_back(Entry);
+  }
+
+  return CSI;
+}
+
+llvm::Error CallSiteInfoCollection::encode(FileWriter &O) const {
+  O.writeU32(CallSites.size());
+  for (const CallSiteInfo &CSI : CallSites) {
+    if (llvm::Error Err = CSI.encode(O))
+      return Err;
+  }
+  return llvm::Error::success();
+}
+
+llvm::Expected<CallSiteInfoCollection>
+CallSiteInfoCollection::decode(DataExtractor &Data, uint64_t BaseAddr) {
+  CallSiteInfoCollection CSC;
+  uint64_t Offset = 0;
+
+  // Read number of CallSiteInfo entries
+  if (!Data.isValidOffsetForDataOfSize(Offset, sizeof(uint32_t)))
+    return createStringError(std::errc::io_error,
+                             "0x%8.8" PRIx64 ": missing CallSiteInfo count",
+                             Offset);
+  uint32_t NumCallSites = Data.getU32(&Offset);
+
+  CSC.CallSites.reserve(NumCallSites);
+  for (uint32_t i = 0; i < NumCallSites; ++i) {
+    llvm::Expected<CallSiteInfo> ECSI =
+        CallSiteInfo::decode(Data, Offset, BaseAddr);
+    if (!ECSI)
+      return ECSI.takeError();
+    CSC.CallSites.emplace_back(*ECSI);
+  }
+
+  return CSC;
+}
+
+/// Structures necessary for reading CallSiteInfo from YAML.
+namespace llvm {
+namespace yaml {
+
+struct CallSiteYAML {
+  // The offset of the return address of the call site - relative to the start
+  // of the function.
+  llvm::yaml::Hex64 return_offset;
+  std::vector<std::string> match_regex;
+  std::vector<std::string> flags;
+};
+
+struct FunctionYAML {
+  std::string name;
+  std::vector<CallSiteYAML> callsites;
+};
+
+struct FunctionsYAML {
+  std::vector<FunctionYAML> functions;
+};
+
+template <> struct MappingTraits<CallSiteYAML> {
+  static void mapping(IO &io, CallSiteYAML &callsite) {
+    io.mapRequired("return_offset", callsite.return_offset);
+    io.mapRequired("match_regex", callsite.match_regex);
+    io.mapOptional("flags", callsite.flags);
+  }
+};
+
+template <> struct MappingTraits<FunctionYAML> {
+  static void mapping(IO &io, FunctionYAML &func) {
+    io.mapRequired("name", func.name);
+    io.mapOptional("callsites", func.callsites);
+  }
+};
+
+template <> struct MappingTraits<FunctionsYAML> {
+  static void mapping(IO &io, FunctionsYAML &functionsYAML) {
+    io.mapRequired("functions", functionsYAML.functions);
+  }
+};
+
+} // namespace yaml
+} // namespace llvm
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(CallSiteYAML)
+LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionYAML)
+
+// Implementation of CallSiteInfoLoader
+StringRef CallSiteInfoLoader::stringFromOffset(uint32_t offset) const {
+  assert(StringOffsetMap.count(offset) &&
+         "expected function name offset to already be in StringOffsetMap");
+  return StringOffsetMap.find(offset)->second.val();
+}
+
+uint32_t CallSiteInfoLoader::offsetFromString(StringRef str) {
+  return StrTab.add(StringStorage.insert(str).first->getKey());
+}
+
+llvm::Error CallSiteInfoLoader::loadYAML(std::vector<FunctionInfo> &Funcs,
+                                         StringRef YAMLFile) {
+  // Step 1: Read YAML file
+  auto BufferOrError = llvm::MemoryBuffer::getFile(YAMLFile);
+  if (!BufferOrError)
+    return errorCodeToError(BufferOrError.getError());
+
+  std::unique_ptr<llvm::MemoryBuffer> Buffer = std::move(*BufferOrError);
+
+  // Step 2: Parse YAML content
+  llvm::yaml::FunctionsYAML functionsYAML;
+  llvm::yaml::Input yin(Buffer->getMemBufferRef());
+  yin >> functionsYAML;
----------------
alx32 wrote:

This would work, but it would require changing the format of the YAML file. Currently it's:

```
functions:
  - name: func1
    callsites:
      - return_offset: 0x0C
        match_regex: ["func_mainBin_dec_01"]
        flags:
          - "InternalCall"
      - return_offset: 0x10
        match_regex: ["func_mainBin_dec_02"]
        flags:
          - "InternalCall"

  - name: func2
    callsites:
      - return_offset: 0x20
        match_regex: ["func_validate_input"]
        flags:
          - "InternalCall"
      - return_offset: 0x28
        match_regex: ["func_transform_data"]
```

After we would have to get rid of the top level array and have the elements at the top level like this:
```

- name: func1
  callsites:
    - return_offset: 0x0C
      match_regex: ["func_mainBin_dec_01"]
      flags:
        - "InternalCall"
    - return_offset: 0x10
      match_regex: ["func_mainBin_dec_02"]
      flags:
        - "InternalCall"

- name: func2
  callsites:
    - return_offset: 0x20
      match_regex: ["func_validate_input"]
      flags:
        - "InternalCall"
    - return_offset: 0x28
      match_regex: ["func_transform_data"]
```

This is technically correct but the YAML contents may be less intuitive and will provide less opportunities for extension in the future (no adding sibling elements to the "functions" element).

https://github.com/llvm/llvm-project/pull/109781


More information about the llvm-commits mailing list