[llvm] 88e3358 - [ORC][JITLink] Non-debuginfo JITLink perf jitdump support.

Lang Hames via llvm-commits llvm-commits at lists.llvm.org
Sun Sep 17 21:10:35 PDT 2023


Author: Prem Chintalapudi
Date: 2023-09-18T04:10:29Z
New Revision: 88e3358f331d727e7bbcddb2821ef89d25d1ab7a

URL: https://github.com/llvm/llvm-project/commit/88e3358f331d727e7bbcddb2821ef89d25d1ab7a
DIFF: https://github.com/llvm/llvm-project/commit/88e3358f331d727e7bbcddb2821ef89d25d1ab7a.diff

LOG: [ORC][JITLink] Non-debuginfo JITLink perf jitdump support.

This patch ports PerfJITEventListener to a JITLink plugin, but adds unwind
record support and drops debuginfo support temporarily. Debuginfo can be
enabled in the future by providing a way to obtain a DWARFContext from a
LinkGraph.

Reviewed By: lhames

Differential Revision: https://reviews.llvm.org/D146169

Added: 
    llvm/include/llvm/ExecutionEngine/Orc/PerfSupportPlugin.h
    llvm/include/llvm/ExecutionEngine/Orc/Shared/PerfSharedStructs.h
    llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h
    llvm/lib/ExecutionEngine/Orc/PerfSupportPlugin.cpp
    llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
    llvm/test/ExecutionEngine/JITLink/x86-64/ELF_perf.s

Modified: 
    llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
    llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
    llvm/tools/llvm-jitlink/llvm-jitlink.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/ExecutionEngine/Orc/PerfSupportPlugin.h b/llvm/include/llvm/ExecutionEngine/Orc/PerfSupportPlugin.h
new file mode 100644
index 000000000000000..c663377b17b652b
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/PerfSupportPlugin.h
@@ -0,0 +1,65 @@
+//===----- PerfSupportPlugin.h ----- Utils for perf support -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Handles support for registering code with perf
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_PERFSUPPORTPLUGIN_H
+#define LLVM_EXECUTIONENGINE_ORC_PERFSUPPORTPLUGIN_H
+
+#include "llvm/ExecutionEngine/Orc/Shared/PerfSharedStructs.h"
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+
+namespace llvm {
+namespace orc {
+
+/// Log perf jitdump events for each object (see
+/// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/jitdump-specification.txt).
+/// Currently has support for dumping code load records and unwind info records.
+class PerfSupportPlugin : public ObjectLinkingLayer::Plugin {
+public:
+  PerfSupportPlugin(ExecutorProcessControl &EPC,
+                    ExecutorAddr RegisterPerfStartAddr,
+                    ExecutorAddr RegisterPerfEndAddr,
+                    ExecutorAddr RegisterPerfImplAddr, bool EmitUnwindInfo);
+  ~PerfSupportPlugin();
+
+  void modifyPassConfig(MaterializationResponsibility &MR,
+                        jitlink::LinkGraph &G,
+                        jitlink::PassConfiguration &Config) override;
+
+  Error notifyFailed(MaterializationResponsibility &MR) override {
+    return Error::success();
+  }
+
+  Error notifyRemovingResources(JITDylib &JD, ResourceKey K) override {
+    return Error::success();
+  }
+
+  void notifyTransferringResources(JITDylib &JD, ResourceKey DstKey,
+                                   ResourceKey SrcKey) override {}
+
+  static Expected<std::unique_ptr<PerfSupportPlugin>>
+  Create(ExecutorProcessControl &EPC, JITDylib &JD, bool EmitUnwindInfo);
+
+private:
+  ExecutorProcessControl &EPC;
+  ExecutorAddr RegisterPerfStartAddr;
+  ExecutorAddr RegisterPerfEndAddr;
+  ExecutorAddr RegisterPerfImplAddr;
+  std::atomic<uint64_t> CodeIndex;
+  bool EmitUnwindInfo;
+};
+
+} // namespace orc
+} // namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_PERFSUPPORTPLUGIN_H
\ No newline at end of file

diff  --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/PerfSharedStructs.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/PerfSharedStructs.h
new file mode 100644
index 000000000000000..f4788bcebc3cf97
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/PerfSharedStructs.h
@@ -0,0 +1,233 @@
+//===--- PerfSharedStructs.h --- RPC Structs for perf support ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Structs and serialization to share perf-related information
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_PERFSHAREDSTRUCTS_H
+#define LLVM_EXECUTIONENGINE_ORC_SHARED_PERFSHAREDSTRUCTS_H
+
+#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
+
+namespace llvm {
+
+namespace orc {
+
+// The following structs model the records of the perf jitdump specification
+
+enum class PerfJITRecordType {
+  JIT_CODE_LOAD = 0,           // emitted per named, callable symbol
+  JIT_CODE_MOVE = 1,           // not emitted, code isn't moved
+  JIT_CODE_DEBUG_INFO = 2,     // not emitted yet, needs a DWARFContext
+  JIT_CODE_CLOSE = 3,          // not emitted, unnecessary
+  JIT_CODE_UNWINDING_INFO = 4, // emitted only if unwind info is requested
+
+  JIT_CODE_MAX
+};
+
+struct PerfJITRecordPrefix {
+  PerfJITRecordType Id; // record type identifier, uint32_t
+  uint32_t TotalSize;
+};
+struct PerfJITCodeLoadRecord {
+  PerfJITRecordPrefix Prefix;
+
+  uint32_t Pid;
+  uint32_t Tid;
+  uint64_t Vma;
+  uint64_t CodeAddr;
+  uint64_t CodeSize;
+  uint64_t CodeIndex;
+  std::string Name;
+};
+
+struct PerfJITDebugEntry {
+  uint64_t Addr;
+  uint32_t Lineno;  // source line number starting at 1
+  uint32_t Discrim; // column discriminator, 0 is default
+  std::string Name;
+};
+
+struct PerfJITDebugInfoRecord {
+  PerfJITRecordPrefix Prefix;
+
+  uint64_t CodeAddr;
+  std::vector<PerfJITDebugEntry> Entries;
+};
+
+struct PerfJITCodeUnwindingInfoRecord {
+  // Defaults describe an absent record: it is only valid if TotalSize > 0.
+  PerfJITRecordPrefix Prefix{PerfJITRecordType::JIT_CODE_UNWINDING_INFO, 0};
+  uint64_t UnwindDataSize = 0;
+  uint64_t EHFrameHdrSize = 0;
+  uint64_t MappedSize = 0;
+  // Union, one will always be 0/"", the other has data
+  uint64_t EHFrameHdrAddr = 0;
+  std::string EHFrameHdr;
+
+  uint64_t EHFrameAddr = 0;
+  // size is UnwindDataSize - EHFrameHdrSize
+};
+
+// Batch vehicle for minimizing RPC calls for perf jit records
+struct PerfJITRecordBatch {
+  std::vector<PerfJITDebugInfoRecord> DebugInfoRecords;
+  std::vector<PerfJITCodeLoadRecord> CodeLoadRecords;
+  // only valid if record size > 0
+  PerfJITCodeUnwindingInfoRecord UnwindingRecord;
+};
+
+// SPS traits for Records
+
+namespace shared {
+
+using SPSPerfJITRecordPrefix = SPSTuple<uint32_t, uint32_t>;
+
+template <>
+class SPSSerializationTraits<SPSPerfJITRecordPrefix, PerfJITRecordPrefix> {
+public:
+  static size_t size(const PerfJITRecordPrefix &Val) {
+    return SPSPerfJITRecordPrefix::AsArgList::size(
+        static_cast<uint32_t>(Val.Id), Val.TotalSize);
+  }
+  static bool deserialize(SPSInputBuffer &IB, PerfJITRecordPrefix &Val) {
+    uint32_t Id;
+    if (!SPSPerfJITRecordPrefix::AsArgList::deserialize(IB, Id, Val.TotalSize))
+      return false;
+    Val.Id = static_cast<PerfJITRecordType>(Id);
+    return true;
+  }
+  static bool serialize(SPSOutputBuffer &OB, const PerfJITRecordPrefix &Val) {
+    return SPSPerfJITRecordPrefix::AsArgList::serialize(
+        OB, static_cast<uint32_t>(Val.Id), Val.TotalSize);
+  }
+};
+
+using SPSPerfJITCodeLoadRecord =
+    SPSTuple<SPSPerfJITRecordPrefix, uint32_t, uint32_t, uint64_t, uint64_t,
+             uint64_t, uint64_t, SPSString>;
+
+template <>
+class SPSSerializationTraits<SPSPerfJITCodeLoadRecord, PerfJITCodeLoadRecord> {
+public:
+  static size_t size(const PerfJITCodeLoadRecord &Val) {
+    return SPSPerfJITCodeLoadRecord::AsArgList::size(
+        Val.Prefix, Val.Pid, Val.Tid, Val.Vma, Val.CodeAddr, Val.CodeSize,
+        Val.CodeIndex, Val.Name);
+  }
+
+  static bool deserialize(SPSInputBuffer &IB, PerfJITCodeLoadRecord &Val) {
+    return SPSPerfJITCodeLoadRecord::AsArgList::deserialize(
+        IB, Val.Prefix, Val.Pid, Val.Tid, Val.Vma, Val.CodeAddr, Val.CodeSize,
+        Val.CodeIndex, Val.Name);
+  }
+
+  static bool serialize(SPSOutputBuffer &OB, const PerfJITCodeLoadRecord &Val) {
+    return SPSPerfJITCodeLoadRecord::AsArgList::serialize(
+        OB, Val.Prefix, Val.Pid, Val.Tid, Val.Vma, Val.CodeAddr, Val.CodeSize,
+        Val.CodeIndex, Val.Name);
+  }
+};
+
+using SPSPerfJITDebugEntry = SPSTuple<uint64_t, uint32_t, uint32_t, SPSString>;
+
+template <>
+class SPSSerializationTraits<SPSPerfJITDebugEntry, PerfJITDebugEntry> {
+public:
+  static size_t size(const PerfJITDebugEntry &Val) {
+    return SPSPerfJITDebugEntry::AsArgList::size(Val.Addr, Val.Lineno,
+                                                 Val.Discrim, Val.Name);
+  }
+
+  static bool deserialize(SPSInputBuffer &IB, PerfJITDebugEntry &Val) {
+    return SPSPerfJITDebugEntry::AsArgList::deserialize(
+        IB, Val.Addr, Val.Lineno, Val.Discrim, Val.Name);
+  }
+
+  static bool serialize(SPSOutputBuffer &OB, const PerfJITDebugEntry &Val) {
+    return SPSPerfJITDebugEntry::AsArgList::serialize(OB, Val.Addr, Val.Lineno,
+                                                      Val.Discrim, Val.Name);
+  }
+};
+
+using SPSPerfJITDebugInfoRecord = SPSTuple<SPSPerfJITRecordPrefix, uint64_t,
+                                           SPSSequence<SPSPerfJITDebugEntry>>;
+
+template <>
+class SPSSerializationTraits<SPSPerfJITDebugInfoRecord,
+                             PerfJITDebugInfoRecord> {
+public:
+  static size_t size(const PerfJITDebugInfoRecord &Val) {
+    return SPSPerfJITDebugInfoRecord::AsArgList::size(Val.Prefix, Val.CodeAddr,
+                                                      Val.Entries);
+  }
+  static bool deserialize(SPSInputBuffer &IB, PerfJITDebugInfoRecord &Val) {
+    return SPSPerfJITDebugInfoRecord::AsArgList::deserialize(
+        IB, Val.Prefix, Val.CodeAddr, Val.Entries);
+  }
+  static bool serialize(SPSOutputBuffer &OB,
+                        const PerfJITDebugInfoRecord &Val) {
+    return SPSPerfJITDebugInfoRecord::AsArgList::serialize(
+        OB, Val.Prefix, Val.CodeAddr, Val.Entries);
+  }
+};
+
+using SPSPerfJITCodeUnwindingInfoRecord =
+    SPSTuple<SPSPerfJITRecordPrefix, uint64_t, uint64_t, uint64_t, uint64_t,
+             SPSString, uint64_t>;
+template <>
+class SPSSerializationTraits<SPSPerfJITCodeUnwindingInfoRecord,
+                             PerfJITCodeUnwindingInfoRecord> {
+public:
+  static size_t size(const PerfJITCodeUnwindingInfoRecord &Val) {
+    return SPSPerfJITCodeUnwindingInfoRecord::AsArgList::size(
+        Val.Prefix, Val.UnwindDataSize, Val.EHFrameHdrSize, Val.MappedSize,
+        Val.EHFrameHdrAddr, Val.EHFrameHdr, Val.EHFrameAddr);
+  }
+  static bool deserialize(SPSInputBuffer &IB,
+                          PerfJITCodeUnwindingInfoRecord &Val) {
+    return SPSPerfJITCodeUnwindingInfoRecord::AsArgList::deserialize(
+        IB, Val.Prefix, Val.UnwindDataSize, Val.EHFrameHdrSize, Val.MappedSize,
+        Val.EHFrameHdrAddr, Val.EHFrameHdr, Val.EHFrameAddr);
+  }
+  static bool serialize(SPSOutputBuffer &OB,
+                        const PerfJITCodeUnwindingInfoRecord &Val) {
+    return SPSPerfJITCodeUnwindingInfoRecord::AsArgList::serialize(
+        OB, Val.Prefix, Val.UnwindDataSize, Val.EHFrameHdrSize, Val.MappedSize,
+        Val.EHFrameHdrAddr, Val.EHFrameHdr, Val.EHFrameAddr);
+  }
+};
+
+using SPSPerfJITRecordBatch = SPSTuple<SPSSequence<SPSPerfJITCodeLoadRecord>,
+                                       SPSSequence<SPSPerfJITDebugInfoRecord>,
+                                       SPSPerfJITCodeUnwindingInfoRecord>;
+template <>
+class SPSSerializationTraits<SPSPerfJITRecordBatch, PerfJITRecordBatch> {
+public:
+  static size_t size(const PerfJITRecordBatch &Val) {
+    return SPSPerfJITRecordBatch::AsArgList::size(
+        Val.CodeLoadRecords, Val.DebugInfoRecords, Val.UnwindingRecord);
+  }
+  static bool deserialize(SPSInputBuffer &IB, PerfJITRecordBatch &Val) {
+    return SPSPerfJITRecordBatch::AsArgList::deserialize(
+        IB, Val.CodeLoadRecords, Val.DebugInfoRecords, Val.UnwindingRecord);
+  }
+  static bool serialize(SPSOutputBuffer &OB, const PerfJITRecordBatch &Val) {
+    return SPSPerfJITRecordBatch::AsArgList::serialize(
+        OB, Val.CodeLoadRecords, Val.DebugInfoRecords, Val.UnwindingRecord);
+  }
+};
+
+} // namespace shared
+
+} // namespace orc
+
+} // namespace llvm
+
+#endif
\ No newline at end of file

diff  --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h
new file mode 100644
index 000000000000000..1d8e33f8013b9f8
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h
@@ -0,0 +1,28 @@
+//===------- JITLoaderPerf.h --- Register profiler objects ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Register objects for access by profilers via the perf JIT interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_JITLOADERPERF_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_JITLOADERPERF_H
+
+#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
+#include <cstdint>
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfImpl(const char *Data, uint64_t Size);
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfStart(const char *Data, uint64_t Size);
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfEnd(const char *Data, uint64_t Size);
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_JITLOADERPERF_H
\ No newline at end of file

diff  --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
index c15c2eac0d044d2..3256ed8b7362c66 100644
--- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt
@@ -41,6 +41,7 @@ add_llvm_component_library(LLVMOrcJIT
   ObjectTransformLayer.cpp
   OrcABISupport.cpp
   OrcV2CBindings.cpp
+  PerfSupportPlugin.cpp
   RTDyldObjectLinkingLayer.cpp
   SimpleRemoteEPC.cpp
   Speculation.cpp

diff  --git a/llvm/lib/ExecutionEngine/Orc/PerfSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/PerfSupportPlugin.cpp
new file mode 100644
index 000000000000000..fd7acbd4446c524
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/PerfSupportPlugin.cpp
@@ -0,0 +1,297 @@
+//===----- PerfSupportPlugin.cpp --- Utils for perf support -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Handles support for registering code with perf
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/PerfSupportPlugin.h"
+
+#include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
+
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/ExecutionEngine/JITLink/x86_64.h"
+#include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h"
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm;
+using namespace llvm::orc;
+using namespace llvm::jitlink;
+
+namespace {
+
+// Creates an .eh_frame_hdr whose eh_frame pointer, 4 bytes from the start, is
+// either a zeroed 32-bit slot awaiting a relative relocation to the start of
+// the .eh_frame section, or (if absolute) that section's 64-bit address.
+Expected<std::string> createX64EHFrameHeader(Section &EHFrame,
+                                             support::endianness endianness,
+                                             bool absolute) {
+  uint8_t Version = 1;
+  uint8_t EhFramePtrEnc = 0;
+  if (absolute) {
+    EhFramePtrEnc |= dwarf::DW_EH_PE_sdata8 | dwarf::DW_EH_PE_absptr;
+  } else {
+    EhFramePtrEnc |= dwarf::DW_EH_PE_sdata4 | dwarf::DW_EH_PE_datarel;
+  }
+  uint8_t FDECountEnc = dwarf::DW_EH_PE_omit;
+  uint8_t TableEnc = dwarf::DW_EH_PE_omit;
+  // X86_64_64 relocation to the start of the .eh_frame section
+  uint32_t EHFrameRelocation = 0;
+  // uint32_t FDECount = 0;
+  // Skip the FDE binary search table
+  // We'd have to reprocess the CIEs to get this information,
+  // which seems like more trouble than it's worth
+  // TODO consider implementing this.
+  // binary search table goes here
+
+  size_t HeaderSize =
+      (sizeof(Version) + sizeof(EhFramePtrEnc) + sizeof(FDECountEnc) +
+       sizeof(TableEnc) +
+       (absolute ? sizeof(uint64_t) : sizeof(EHFrameRelocation)));
+  std::string HeaderContent(HeaderSize, '\0');
+  BinaryStreamWriter Writer(
+      MutableArrayRef<uint8_t>(
+          reinterpret_cast<uint8_t *>(HeaderContent.data()), HeaderSize),
+      endianness);
+  if (auto Err = Writer.writeInteger(Version))
+    return std::move(Err);
+  if (auto Err = Writer.writeInteger(EhFramePtrEnc))
+    return std::move(Err);
+  if (auto Err = Writer.writeInteger(FDECountEnc))
+    return std::move(Err);
+  if (auto Err = Writer.writeInteger(TableEnc))
+    return std::move(Err);
+  if (absolute) {
+    uint64_t EHFrameAddr = SectionRange(EHFrame).getStart().getValue();
+    if (auto Err = Writer.writeInteger(EHFrameAddr))
+      return std::move(Err);
+  } else {
+    if (auto Err = Writer.writeInteger(EHFrameRelocation))
+      return std::move(Err);
+  }
+  return HeaderContent;
+}
+
+constexpr StringRef RegisterPerfStartSymbolName =
+    "llvm_orc_registerJITLoaderPerfStart";
+constexpr StringRef RegisterPerfEndSymbolName =
+    "llvm_orc_registerJITLoaderPerfEnd";
+constexpr StringRef RegisterPerfImplSymbolName =
+    "llvm_orc_registerJITLoaderPerfImpl";
+
+static PerfJITCodeLoadRecord
+getCodeLoadRecord(const Symbol &Sym, std::atomic<uint64_t> &CodeIndex) {
+  PerfJITCodeLoadRecord Record;
+  auto Name = Sym.getName();
+  auto Addr = Sym.getAddress();
+  auto Size = Sym.getSize();
+  Record.Prefix.Id = PerfJITRecordType::JIT_CODE_LOAD;
+  // Runtime sets PID
+  Record.Pid = 0;
+  // Runtime sets TID
+  Record.Tid = 0;
+  Record.Vma = Addr.getValue();
+  Record.CodeAddr = Addr.getValue();
+  Record.CodeSize = Size;
+  Record.CodeIndex = CodeIndex++;
+  Record.Name = Name.str();
+  // Initialize last, once all the other fields are filled
+  Record.Prefix.TotalSize =
+      (2 * sizeof(uint32_t)   // id, total_size
+       + sizeof(uint64_t)     // timestamp
+       + 2 * sizeof(uint32_t) // pid, tid
+       + 4 * sizeof(uint64_t) // vma, code_addr, code_size, code_index
+       + Name.size() + 1      // symbol name
+       + Record.CodeSize      // code
+      );
+  return Record;
+}
+
+static std::optional<PerfJITDebugInfoRecord>
+getDebugInfoRecord(const Symbol &Sym, DWARFContext *DC) {
+  if (!DC) {
+    LLVM_DEBUG(dbgs() << "No debug info available\n");
+    return std::nullopt;
+  }
+  auto &Section = Sym.getBlock().getSection();
+  auto Addr = Sym.getAddress();
+  auto Size = Sym.getSize();
+  auto SAddr = object::SectionedAddress{Addr.getValue(), Section.getOrdinal()};
+  LLVM_DEBUG(dbgs() << "Getting debug info for symbol " << Sym.getName()
+                    << " at address " << Addr.getValue() << " with size "
+                    << Size << "\n"
+                    << "Section ordinal: " << Section.getOrdinal() << "\n");
+  auto LInfo = DC->getLineInfoForAddressRange(
+      SAddr, Size, DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath);
+  if (LInfo.empty()) {
+    // No line info available
+    LLVM_DEBUG(dbgs() << "No line info available\n");
+    return std::nullopt;
+  }
+  PerfJITDebugInfoRecord Record;
+  Record.Prefix.Id = PerfJITRecordType::JIT_CODE_DEBUG_INFO;
+  Record.CodeAddr = Addr.getValue();
+  for (const auto &Entry : LInfo) {
+    auto Addr = Entry.first;
+    // The function re-created by perf is preceded by an ELF
+    // header. We need to adjust for that, otherwise the results are
+    // wrong.
+    Addr += 0x40;
+    Record.Entries.push_back({Addr, Entry.second.Line,
+                              Entry.second.Discriminator,
+                              Entry.second.FileName});
+  }
+  size_t EntriesBytes = (2   // record header
+                         + 2 // record fields
+                         ) *
+                        sizeof(uint64_t);
+  for (const auto &Entry : Record.Entries) {
+    EntriesBytes +=
+        sizeof(uint64_t) + 2 * sizeof(uint32_t); // Addr, Line/Discrim
+    EntriesBytes += Entry.Name.size() + 1;       // Name
+  }
+  Record.Prefix.TotalSize = EntriesBytes;
+  LLVM_DEBUG(dbgs() << "Created debug info record\n"
+                    << "Total size: " << Record.Prefix.TotalSize << "\n"
+                    << "Nr entries: " << Record.Entries.size() << "\n");
+  return Record;
+}
+
+static Expected<PerfJITCodeUnwindingInfoRecord>
+getUnwindingRecord(LinkGraph &G) {
+  PerfJITCodeUnwindingInfoRecord Record;
+  Record.Prefix.Id = PerfJITRecordType::JIT_CODE_UNWINDING_INFO;
+  Record.Prefix.TotalSize = 0;
+  auto Eh_frame = G.findSectionByName(".eh_frame");
+  if (!Eh_frame) {
+    LLVM_DEBUG(dbgs() << "No .eh_frame section found\n");
+    return Record;
+  }
+  if (!G.getTargetTriple().isOSBinFormatELF()) {
+    LLVM_DEBUG(dbgs() << "Not an ELF file, will not emit unwinding info\n");
+    return Record;
+  }
+  auto SR = SectionRange(*Eh_frame);
+  auto EHFrameSize = SR.getSize();
+  auto Eh_frame_hdr = G.findSectionByName(".eh_frame_hdr");
+  if (!Eh_frame_hdr) {
+    if (G.getTargetTriple().getArch() == Triple::x86_64) {
+      auto Hdr = createX64EHFrameHeader(*Eh_frame, G.getEndianness(), true);
+      if (!Hdr)
+        return Hdr.takeError();
+      Record.EHFrameHdr = std::move(*Hdr);
+    } else {
+      LLVM_DEBUG(dbgs() << "No .eh_frame_hdr section found\n");
+      return Record;
+    }
+    Record.EHFrameHdrAddr = 0;
+    Record.EHFrameHdrSize = Record.EHFrameHdr.size();
+    Record.UnwindDataSize = EHFrameSize + Record.EHFrameHdrSize;
+    Record.MappedSize = 0; // Because the EHFrame header was not mapped
+  } else {
+    auto SR = SectionRange(*Eh_frame_hdr);
+    Record.EHFrameHdrAddr = SR.getStart().getValue();
+    Record.EHFrameHdrSize = SR.getSize();
+    Record.UnwindDataSize = EHFrameSize + Record.EHFrameHdrSize;
+    Record.MappedSize = Record.UnwindDataSize;
+  }
+  Record.EHFrameAddr = SR.getStart().getValue();
+  Record.Prefix.TotalSize =
+      (2 * sizeof(uint32_t) // id, total_size
+       + sizeof(uint64_t)   // timestamp
+       +
+       3 * sizeof(uint64_t) // unwind_data_size, eh_frame_hdr_size, mapped_size
+       + Record.UnwindDataSize // eh_frame_hdr, eh_frame
+      );
+  LLVM_DEBUG(dbgs() << "Created unwind record\n"
+                    << "Total size: " << Record.Prefix.TotalSize << "\n"
+                    << "Unwind size: " << Record.UnwindDataSize << "\n"
+                    << "EHFrame size: " << EHFrameSize << "\n"
+                    << "EHFrameHdr size: " << Record.EHFrameHdrSize << "\n");
+  return Record;
+}
+
+// Collects code-load (and, given DC, debug-info) records for every named,
+// callable symbol in G. The unwinding record is left absent (TotalSize == 0)
+// unless EmitUnwindInfo is set and the record was created successfully.
+static PerfJITRecordBatch getRecords(ExecutionSession &ES, LinkGraph &G,
+                                     DWARFContext *DC,
+                                     std::atomic<uint64_t> &CodeIndex,
+                                     bool EmitUnwindInfo) {
+  PerfJITRecordBatch Batch;
+  // Set up front so the error path below cannot leave the size uninitialized.
+  Batch.UnwindingRecord.Prefix.TotalSize = 0;
+  for (auto *Sym : G.defined_symbols()) {
+    if (!Sym->hasName() || !Sym->isCallable())
+      continue;
+    if (auto DebugInfo = getDebugInfoRecord(*Sym, DC))
+      Batch.DebugInfoRecords.push_back(std::move(*DebugInfo));
+    Batch.CodeLoadRecords.push_back(getCodeLoadRecord(*Sym, CodeIndex));
+  }
+  if (EmitUnwindInfo) {
+    if (auto UWR = getUnwindingRecord(G))
+      Batch.UnwindingRecord = std::move(*UWR);
+    else
+      ES.reportError(UWR.takeError());
+  }
+  return Batch;
+}
+} // namespace
+
+PerfSupportPlugin::PerfSupportPlugin(ExecutorProcessControl &EPC,
+                                     ExecutorAddr RegisterPerfStartAddr,
+                                     ExecutorAddr RegisterPerfEndAddr,
+                                     ExecutorAddr RegisterPerfImplAddr,
+                                     bool EmitUnwindInfo)
+    : EPC(EPC), RegisterPerfStartAddr(RegisterPerfStartAddr),
+      RegisterPerfEndAddr(RegisterPerfEndAddr),
+      RegisterPerfImplAddr(RegisterPerfImplAddr), CodeIndex(0),
+      EmitUnwindInfo(EmitUnwindInfo) {
+  cantFail(EPC.callSPSWrapper<void()>(RegisterPerfStartAddr));
+}
+PerfSupportPlugin::~PerfSupportPlugin() {
+  cantFail(EPC.callSPSWrapper<void()>(RegisterPerfEndAddr));
+}
+
+void PerfSupportPlugin::modifyPassConfig(MaterializationResponsibility &MR,
+                                         LinkGraph &G,
+                                         PassConfiguration &Config) {
+  Config.PostFixupPasses.push_back([this](LinkGraph &G) {
+    // TODO get an actual DWARFContext for line info
+    DWARFContext *DWC = nullptr;
+    auto Batch = getRecords(EPC.getExecutionSession(), G, DWC, CodeIndex,
+                            EmitUnwindInfo);
+    G.allocActions().push_back(
+        {cantFail(shared::WrapperFunctionCall::Create<
+                  shared::SPSArgList<shared::SPSPerfJITRecordBatch>>(
+             RegisterPerfImplAddr, Batch)),
+         {}});
+    return Error::success();
+  });
+}
+
+Expected<std::unique_ptr<PerfSupportPlugin>>
+PerfSupportPlugin::Create(ExecutorProcessControl &EPC, JITDylib &JD,
+                          bool EmitUnwindInfo) {
+  if (!EPC.getTargetTriple().isOSBinFormatELF()) {
+    return make_error<StringError>(
+        "Perf support only available for ELF LinkGraphs!",
+        inconvertibleErrorCode());
+  }
+  auto &ES = EPC.getExecutionSession();
+  ExecutorAddr StartAddr, EndAddr, ImplAddr;
+  if (auto Err = lookupAndRecordAddrs(
+          ES, LookupKind::Static, makeJITDylibSearchOrder({&JD}),
+          {{ES.intern(RegisterPerfStartSymbolName), &StartAddr},
+           {ES.intern(RegisterPerfEndSymbolName), &EndAddr},
+           {ES.intern(RegisterPerfImplSymbolName), &ImplAddr}}))
+    return std::move(Err);
+  return std::make_unique<PerfSupportPlugin>(EPC, StartAddr, EndAddr, ImplAddr,
+                                             EmitUnwindInfo);
+}

diff  --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
index d9cd7b6dad98896..f2005dc1775e3c8 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
@@ -5,6 +5,7 @@ endif()
 add_llvm_component_library(LLVMOrcTargetProcess
   ExecutorSharedMemoryMapperService.cpp
   JITLoaderGDB.cpp
+  JITLoaderPerf.cpp
   OrcRTBootstrap.cpp
   RegisterEHFrames.cpp
   SimpleExecutorDylibManager.cpp

diff  --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
new file mode 100644
index 000000000000000..731b4a173850d6a
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp
@@ -0,0 +1,457 @@
+//===------- JITLoaderPerf.cpp - Register profiler objects ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Register objects for access by profilers via the perf JIT interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h"
+
+#include "llvm/ExecutionEngine/Orc/Shared/PerfSharedStructs.h"
+
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Threading.h"
+
+#include <mutex>
+#include <optional>
+
+#ifdef __linux__
+
+#include <sys/mman.h> // mmap()
+#include <time.h>     // clock_gettime(), time(), localtime_r() */
+#include <unistd.h>   // for read(), close()
+
+#define DEBUG_TYPE "orc"
+
+// language identifier (XXX: should we generate something better from debug
+// info?)
+#define JIT_LANG "llvm-IR"
+#define LLVM_PERF_JIT_MAGIC                                                    \
+  ((uint32_t)'J' << 24 | (uint32_t)'i' << 16 | (uint32_t)'T' << 8 |            \
+   (uint32_t)'D')
+#define LLVM_PERF_JIT_VERSION 1
+
+using namespace llvm;
+using namespace llvm::orc;
+
+struct PerfState {
+  // process id; copied into the file header and into each code load record
+  uint32_t Pid;
+
+  // directory chosen by InitDebuggingDir() to hold this process's dump file
+  std::string JitPath;
+
+  // descriptor of the dump file (mapped by OpenMarker()), closed via Dumpstream
+  int DumpFd = -1;
+
+  // output data stream constructed over DumpFd
+  std::unique_ptr<raw_fd_ostream> Dumpstream;
+
+  // address of the perf mmap marker; reset to null by CloseMarker()
+  void *MarkerAddr = NULL;
+};
+
+// prevent concurrent dumps from messing up the output file
+static std::mutex Mutex;
+static std::optional<PerfState> State;
+
+struct RecHeader { // prefix placed at the start of every record written
+  uint32_t Id;        // record type id
+  uint32_t TotalSize; // size reported for the record
+  uint64_t Timestamp; // perf_get_timestamp() taken when the record is built
+};
+
+struct DIR { // leading part of a debug info record (see writeDebugRecord)
+  RecHeader Prefix;
+  uint64_t CodeAddr;
+  uint64_t NrEntry; // number of DIE entries written after this header
+};
+
+struct DIE { // one debug entry; writeDebugRecord writes its name right after
+  uint64_t CodeAddr;
+  uint32_t Line;
+  uint32_t Discrim;
+};
+
+struct CLR { // leading part of a code load record (see writeCodeRecord)
+  RecHeader Prefix;
+  uint32_t Pid; // taken from PerfState::Pid
+  uint32_t Tid; // get_threadid() of the thread writing the record
+  uint64_t Vma;
+  uint64_t CodeAddr;
+  uint64_t CodeSize; // number of code bytes written after the name
+  uint64_t CodeIndex;
+};
+
+struct UWR { // leading part of an unwind record (see writeUnwindRecord)
+  RecHeader Prefix;
+  uint64_t UnwindDataSize;    // EH frame header bytes plus EH frame bytes
+  uint64_t EhFrameHeaderSize; // bytes of EH frame header that follow
+  uint64_t MappedSize;
+};
+
+static inline uint64_t timespec_to_ns(const struct timespec *TS) {
+  // Whole seconds scaled to nanoseconds, plus the sub-second remainder.
+  const uint64_t NanoSecPerSec = 1000000000;
+  const uint64_t WholeSeconds = (uint64_t)TS->tv_sec;
+  return (WholeSeconds * NanoSecPerSec) + TS->tv_nsec;
+}
+
+static inline uint64_t perf_get_timestamp() {
+  // CLOCK_MONOTONIC in nanoseconds; 0 when the clock cannot be read.
+  timespec Now;
+  const int Failed = clock_gettime(CLOCK_MONOTONIC, &Now);
+  return Failed ? 0 : timespec_to_ns(&Now);
+}
+
+// Emits one debug info record: a DIR header, then for each entry a DIE
+// followed by the entry's name including its terminating byte.
+static void writeDebugRecord(const PerfJITDebugInfoRecord &DebugRecord) {
+  assert(State && "PerfState not initialized");
+  LLVM_DEBUG(dbgs() << "Writing debug record with "
+                    << DebugRecord.Entries.size() << " entries\n");
+  raw_fd_ostream &OS = *State->Dumpstream;
+
+  DIR Head{RecHeader{static_cast<uint32_t>(DebugRecord.Prefix.Id),
+                     DebugRecord.Prefix.TotalSize, perf_get_timestamp()},
+           DebugRecord.CodeAddr, DebugRecord.Entries.size()};
+  OS.write(reinterpret_cast<const char *>(&Head), sizeof(Head));
+  size_t BytesOut = sizeof(Head);
+
+  for (auto &Entry : DebugRecord.Entries) {
+    DIE Line{Entry.Addr, Entry.Lineno, Entry.Discrim};
+    const size_t NameBytes = Entry.Name.size() + 1; // name plus terminator
+    OS.write(reinterpret_cast<const char *>(&Line), sizeof(Line));
+    OS.write(Entry.Name.data(), NameBytes);
+    BytesOut += sizeof(Line) + NameBytes;
+  }
+  LLVM_DEBUG(dbgs() << "wrote " << BytesOut << " bytes of debug info\n");
+}
+
+// Emits one code load record: a CLR header, the symbol name including its
+// terminating byte, then CodeSize bytes read from CodeAddr.
+static void writeCodeRecord(const PerfJITCodeLoadRecord &CodeRecord) {
+  assert(State && "PerfState not initialized");
+  const uint32_t ThreadId = get_threadid();
+  LLVM_DEBUG(dbgs() << "Writing code record with code size "
+                    << CodeRecord.CodeSize << " and code index "
+                    << CodeRecord.CodeIndex << "\n");
+  CLR Load{RecHeader{static_cast<uint32_t>(CodeRecord.Prefix.Id),
+                     CodeRecord.Prefix.TotalSize, perf_get_timestamp()},
+           State->Pid,
+           ThreadId,
+           CodeRecord.Vma,
+           CodeRecord.CodeAddr,
+           CodeRecord.CodeSize,
+           CodeRecord.CodeIndex};
+  LLVM_DEBUG(dbgs() << "wrote " << sizeof(Load) << " bytes of CLR, "
+                    << CodeRecord.Name.size() + 1 << " bytes of name, "
+                    << CodeRecord.CodeSize << " bytes of code\n");
+
+  raw_fd_ostream &OS = *State->Dumpstream;
+  OS.write(reinterpret_cast<const char *>(&Load), sizeof(Load));
+  OS.write(CodeRecord.Name.data(), CodeRecord.Name.size() + 1);
+  OS.write((const char *)CodeRecord.CodeAddr, CodeRecord.CodeSize);
+}
+
+// Emits the unwinding info record: a UWR header, then EHFrameHdrSize bytes of
+// EH frame header, then the remaining (UnwindDataSize - EHFrameHdrSize) bytes
+// read from EHFrameAddr.
+static void
+writeUnwindRecord(const PerfJITCodeUnwindingInfoRecord &UnwindRecord) {
+  assert(State && "PerfState not initialized");
+  // Diagnostic output only: keep it behind LLVM_DEBUG, as the other record
+  // writers do, so it is not printed on every registration.
+  LLVM_DEBUG(dbgs() << "Writing unwind record with unwind data size "
+                    << UnwindRecord.UnwindDataSize
+                    << " and EH frame header size "
+                    << UnwindRecord.EHFrameHdrSize << " and mapped size "
+                    << UnwindRecord.MappedSize << "\n");
+  UWR Uwr{RecHeader{static_cast<uint32_t>(UnwindRecord.Prefix.Id),
+                    UnwindRecord.Prefix.TotalSize, perf_get_timestamp()},
+          UnwindRecord.UnwindDataSize, UnwindRecord.EHFrameHdrSize,
+          UnwindRecord.MappedSize};
+  LLVM_DEBUG(dbgs() << "wrote " << sizeof(Uwr) << " bytes of UWR, "
+                    << UnwindRecord.EHFrameHdrSize
+                    << " bytes of EH frame header, "
+                    << UnwindRecord.UnwindDataSize - UnwindRecord.EHFrameHdrSize
+                    << " bytes of EH frame\n");
+  State->Dumpstream->write(reinterpret_cast<const char *>(&Uwr), sizeof(Uwr));
+  // The header comes from memory when an address is given, otherwise from the
+  // bytes carried inside the record itself.
+  if (UnwindRecord.EHFrameHdrAddr)
+    State->Dumpstream->write((const char *)UnwindRecord.EHFrameHdrAddr,
+                             UnwindRecord.EHFrameHdrSize);
+  else
+    State->Dumpstream->write(UnwindRecord.EHFrameHdr.data(),
+                             UnwindRecord.EHFrameHdrSize);
+  State->Dumpstream->write((const char *)UnwindRecord.EHFrameAddr,
+                           UnwindRecord.UnwindDataSize -
+                               UnwindRecord.EHFrameHdrSize);
+}
+
+// Writes every record in Batch to the dump file (unwind record first, then
+// debug info records, then code load records) and flushes the stream.
+// Returns an error if profiling has not been started.
+static Error registerJITLoaderPerfImpl(const PerfJITRecordBatch &Batch) {
+  // Take the lock before looking at State, so that the check and the writes
+  // below operate on the same State.
+  std::lock_guard<std::mutex> Lock(Mutex);
+  if (!State)
+    return make_error<StringError>("PerfState not initialized",
+                                   inconvertibleErrorCode());
+
+  // Serialize the batch
+  if (Batch.UnwindingRecord.Prefix.TotalSize > 0)
+    writeUnwindRecord(Batch.UnwindingRecord);
+
+  for (const auto &DebugInfo : Batch.DebugInfoRecords)
+    writeDebugRecord(DebugInfo);
+
+  for (const auto &CodeLoad : Batch.CodeLoadRecords)
+    writeCodeRecord(CodeLoad);
+
+  State->Dumpstream->flush();
+
+  return Error::success();
+}
+
+struct Header { // file header, written once by registerJITLoaderPerfStartImpl
+  uint32_t Magic;     // LLVM_PERF_JIT_MAGIC, the characters "JiTD"
+  uint32_t Version;   // LLVM_PERF_JIT_VERSION
+  uint32_t TotalSize; // sizeof(Header)
+  uint32_t ElfMach;   // e_machine read from /proc/self/exe
+  uint32_t Pad1;      // reserved
+  uint32_t Pid;       // taken from PerfState::Pid
+  uint64_t Timestamp; // perf_get_timestamp() when the header is filled in
+  uint64_t Flags;     // flags
+};
+
+static Error OpenMarker(PerfState &State) {
+  // Mapping the jitdump file produces an MMAP record in perf.data, either
+  // live (perf record is running at mmap time) or deferred via
+  // /proc/PID/maps. That record marks the jitdump file as the source of extra
+  // metadata about the jitted code; perf report/annotate recognise the
+  // special filename and process the jitdump file.
+  //
+  // The mapping has to be PROT_EXEC so perf record captures it even without
+  // the -d option.
+  void *Addr =
+      ::mmap(NULL, sys::Process::getPageSizeEstimate(), PROT_READ | PROT_EXEC,
+             MAP_PRIVATE, State.DumpFd, 0);
+  State.MarkerAddr = Addr;
+
+  if (Addr != MAP_FAILED)
+    return Error::success();
+
+  return make_error<llvm::StringError>("could not mmap JIT marker",
+                                       inconvertibleErrorCode());
+}
+
+void CloseMarker(PerfState &State) {
+  // Unmap the marker page if one is recorded, then forget its address.
+  if (void *Addr = State.MarkerAddr) {
+    munmap(Addr, sys::Process::getPageSizeEstimate());
+    State.MarkerAddr = nullptr;
+  }
+}
+
+// Builds the dump file header for this process. The ELF machine id is taken
+// from the start of /proc/self/exe. Fails if that file cannot be read or
+// does not begin with the ELF signature.
+static Expected<Header> FillMachine(PerfState &State) {
+  // Value-initialize so the fields that are not assigned below (Pad1, Flags)
+  // reach the dump file as zero instead of indeterminate bytes.
+  Header Hdr{};
+  Hdr.Magic = LLVM_PERF_JIT_MAGIC;
+  Hdr.Version = LLVM_PERF_JIT_VERSION;
+  Hdr.TotalSize = sizeof(Hdr);
+  Hdr.Pid = State.Pid;
+  Hdr.Timestamp = perf_get_timestamp();
+
+  // Leading bytes of the executable: 16 identification bytes followed by the
+  // two 16-bit fields copied into Info.
+  char Id[16];
+  struct {
+    uint16_t e_type;
+    uint16_t e_machine;
+  } Info;
+
+  size_t RequiredMemory = sizeof(Id) + sizeof(Info);
+
+  ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
+      MemoryBuffer::getFileSlice("/proc/self/exe", RequiredMemory, 0);
+
+  // This'll not guarantee that enough data was actually read from the
+  // underlying file. Instead the trailing part of the buffer would be
+  // zeroed. Given the ELF signature check below that seems ok though,
+  // it's unlikely that the file ends just after that, and the
+  // consequence would just be that perf wouldn't recognize the
+  // signature.
+  if (!MB)
+    return make_error<llvm::StringError>("could not open /proc/self/exe",
+                                         MB.getError());
+
+  memcpy(&Id, (*MB)->getBufferStart(), sizeof(Id));
+  memcpy(&Info, (*MB)->getBufferStart() + sizeof(Id), sizeof(Info));
+
+  // check ELF signature
+  if (Id[0] != 0x7f || Id[1] != 'E' || Id[2] != 'L' || Id[3] != 'F')
+    return make_error<llvm::StringError>("invalid ELF signature",
+                                         inconvertibleErrorCode());
+
+  Hdr.ElfMach = Info.e_machine;
+
+  return Hdr;
+}
+
+static Error InitDebuggingDir(PerfState &State) {
+  time_t Time;
+  struct tm LocalTime;
+  char TimeBuffer[sizeof("YYYYMMDD")]; // room for the "%Y%m%d" date below
+  SmallString<64> Path;
+
+  // base directory: $JITDUMPDIR if set, else the home directory, else "."
+  if (const char *BaseDir = getenv("JITDUMPDIR"))
+    Path.append(BaseDir);
+  else if (!sys::path::home_directory(Path))
+    Path = ".";
+
+  // make sure <base>/.debug/jit/ exists
+  Path += "/.debug/jit/";
+  if (auto EC = sys::fs::create_directories(Path)) {
+    std::string ErrStr;
+    raw_string_ostream ErrStream(ErrStr);
+    ErrStream << "could not create jit cache directory " << Path << ": "
+              << EC.message() << "\n";
+    return make_error<StringError>(std::move(ErrStr), inconvertibleErrorCode());
+  }
+
+  // build the "<JIT_LANG>-jit-<local date>" prefix for this process's directory
+  time(&Time);
+  localtime_r(&Time, &LocalTime);
+  strftime(TimeBuffer, sizeof(TimeBuffer), "%Y%m%d", &LocalTime);
+  Path += JIT_LANG "-jit-";
+  Path += TimeBuffer;
+
+  SmallString<128> UniqueDebugDir; // receives the directory actually created
+
+  using sys::fs::createUniqueDirectory;
+  if (auto EC = createUniqueDirectory(Path, UniqueDebugDir)) {
+    std::string ErrStr;
+    raw_string_ostream ErrStream(ErrStr);
+    ErrStream << "could not create unique jit cache directory "
+              << UniqueDebugDir << ": " << EC.message() << "\n";
+    return make_error<StringError>(std::move(ErrStr), inconvertibleErrorCode());
+  }
+
+  State.JitPath = std::string(UniqueDebugDir.str()); // dump file goes in here
+
+  return Error::success();
+}
+
+// Starts profiling support: creates the dump directory and file, maps the
+// perf marker, writes the file header and, only if all of that succeeded,
+// publishes the new state. Returns an error describing the first step that
+// failed.
+static Error registerJITLoaderPerfStartImpl() {
+  PerfState Tentative;
+  Tentative.Pid = sys::Process::getProcessId();
+  // check if clock-source is supported
+  if (!perf_get_timestamp())
+    return make_error<StringError>("kernel does not support CLOCK_MONOTONIC",
+                                   inconvertibleErrorCode());
+
+  if (auto Err = InitDebuggingDir(Tentative))
+    return Err;
+
+  std::string Filename;
+  raw_string_ostream FilenameBuf(Filename);
+  FilenameBuf << Tentative.JitPath << "/jit-" << Tentative.Pid << ".dump";
+
+  // Need to open ourselves, because we need to hand the FD to OpenMarker() and
+  // raw_fd_ostream doesn't expose the FD. The using-declaration names the
+  // function that is actually called.
+  using sys::fs::openFileForReadWrite;
+  if (auto EC = openFileForReadWrite(FilenameBuf.str(), Tentative.DumpFd,
+                                     sys::fs::CD_CreateNew, sys::fs::OF_None)) {
+    std::string ErrStr;
+    raw_string_ostream ErrStream(ErrStr);
+    ErrStream << "could not open JIT dump file " << FilenameBuf.str() << ": "
+              << EC.message() << "\n";
+    return make_error<StringError>(std::move(ErrStr), inconvertibleErrorCode());
+  }
+
+  // From here on the stream owns the descriptor and closes it on destruction.
+  Tentative.Dumpstream =
+      std::make_unique<raw_fd_ostream>(Tentative.DumpFd, true);
+
+  auto Header = FillMachine(Tentative);
+  if (!Header)
+    return Header.takeError();
+
+  // signal this process emits JIT information
+  if (auto Err = OpenMarker(Tentative))
+    return Err;
+
+  Tentative.Dumpstream->write(reinterpret_cast<const char *>(&Header.get()),
+                              sizeof(*Header));
+  // Push the header out now so that a failed write is visible to has_error()
+  // below rather than sitting unnoticed in the stream's buffer.
+  Tentative.Dumpstream->flush();
+
+  if (Tentative.Dumpstream->has_error()) {
+    // The error is reported to the caller, so clear it on the stream before
+    // the stream is destroyed, and release the marker mapping made above.
+    Tentative.Dumpstream->clear_error();
+    CloseMarker(Tentative);
+    return make_error<StringError>("could not write JIT dump header",
+                                   inconvertibleErrorCode());
+  }
+
+  // Everything initialized, can do profiling now. Publish under the mutex
+  // that guards State in the record writer.
+  std::lock_guard<std::mutex> Lock(Mutex);
+  State = std::move(Tentative);
+  return Error::success();
+}
+
+// Stops profiling support: appends a JIT_CODE_CLOSE record, unmaps the perf
+// marker and destroys the state. Returns an error if profiling was never
+// started.
+static Error registerJITLoaderPerfEndImpl() {
+  // Hold the mutex for the whole teardown so State cannot be reset while a
+  // batch is being written.
+  std::lock_guard<std::mutex> Lock(Mutex);
+  if (!State)
+    return make_error<StringError>("PerfState not initialized",
+                                   inconvertibleErrorCode());
+
+  RecHeader Close;
+  Close.Id = static_cast<uint32_t>(PerfJITRecordType::JIT_CODE_CLOSE);
+  Close.TotalSize = sizeof(Close);
+  Close.Timestamp = perf_get_timestamp();
+  State->Dumpstream->write(reinterpret_cast<const char *>(&Close),
+                           sizeof(Close));
+  if (State->MarkerAddr)
+    CloseMarker(*State);
+
+  State.reset();
+  return Error::success();
+}
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfImpl(const char *Data, uint64_t Size) {
+  using namespace orc::shared;
+  // Decode a record batch from the argument buffer, run the handler on it,
+  // and return the serialized result to the caller.
+  using Wrapper = WrapperFunction<SPSError(SPSPerfJITRecordBatch)>;
+  auto Result = Wrapper::handle(Data, Size, registerJITLoaderPerfImpl);
+  return Result.release();
+}
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfStart(const char *Data, uint64_t Size) {
+  using namespace orc::shared;
+  // No arguments to decode: run the start handler and return its result.
+  using Wrapper = WrapperFunction<SPSError()>;
+  auto Result = Wrapper::handle(Data, Size, registerJITLoaderPerfStartImpl);
+  return Result.release();
+}
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfEnd(const char *Data, uint64_t Size) {
+  using namespace orc::shared;
+  // No arguments to decode: run the end handler and return its result.
+  using Wrapper = WrapperFunction<SPSError()>;
+  auto Result = Wrapper::handle(Data, Size, registerJITLoaderPerfEndImpl);
+  return Result.release();
+}
+
+#else
+
+using namespace llvm;
+using namespace llvm::orc;
+
+static Error badOS() {
+  // Single failure value shared by every entry point in the fallback build.
+  return make_error<StringError>(
+      "unsupported OS (perf support is only available on linux!)",
+      inconvertibleErrorCode());
+}
+
+// Batch handler for the fallback build: ignores the batch and reports the
+// unsupported-OS error. Takes the batch by const reference, matching the
+// handler signature used in the linux build.
+static Error badOSBatch(const PerfJITRecordBatch &) { return badOS(); }
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfImpl(const char *Data, uint64_t Size) {
+  using namespace shared;
+  // Fallback build: decode the batch and answer with the unsupported-OS error.
+  using Wrapper = WrapperFunction<SPSError(SPSPerfJITRecordBatch)>;
+  auto Result = Wrapper::handle(Data, Size, badOSBatch);
+  return Result.release();
+}
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfStart(const char *Data, uint64_t Size) {
+  using namespace shared;
+  // Fallback build: always answers with the unsupported-OS error.
+  auto Result = WrapperFunction<SPSError()>::handle(Data, Size, badOS);
+  return Result.release();
+}
+
+extern "C" llvm::orc::shared::CWrapperFunctionResult
+llvm_orc_registerJITLoaderPerfEnd(const char *Data, uint64_t Size) {
+  using namespace shared;
+  // Fallback build: always answers with the unsupported-OS error.
+  auto Result = WrapperFunction<SPSError()>::handle(Data, Size, badOS);
+  return Result.release();
+}
+
+#endif

diff  --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_perf.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_perf.s
new file mode 100644
index 000000000000000..6eb612acd5d6e4a
--- /dev/null
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_perf.s
@@ -0,0 +1,204 @@
+# REQUIRES: native && x86_64-linux
+
+# RUN: rm -rf %t && mkdir -p %t
+# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent \
+# RUN:     -filetype=obj -o %t/ELF_x86-64_perf.o %s
+# RUN: JITDUMPDIR="%t" llvm-jitlink -perf-support \
+# RUN:     %t/ELF_x86-64_perf.o
+# RUN: test -f %t/.debug/jit/llvm-IR-jit-*/jit-*.dump
+
+# Test ELF perf support for code load records and unwind info
+
+        .text
+        .file   "example.c"
+        .section        .text.source,"ax",@progbits
+        .globl  source                          # -- Begin function source
+        .p2align        4, 0x90
+        .type   source,@function
+source:                                 # @source
+.Lfunc_begin0:
+        .file   1 "/app" "example.c"
+        .loc    1 1 0                           # example.c:1:0
+        .cfi_startproc
+# %bb.0:
+        .loc    1 2 5 prologue_end              # example.c:2:5
+        movl    $1, %eax
+        retq
+.Ltmp0:
+.Lfunc_end0:
+        .size   source, .Lfunc_end0-source
+        .cfi_endproc
+                                        # -- End function
+        .section        .text.passthrough,"ax",@progbits
+        .globl  passthrough                     # -- Begin function passthrough
+        .p2align        4, 0x90
+        .type   passthrough,@function
+passthrough:                            # @passthrough
+.Lfunc_begin1:
+        .loc    1 5 0                           # example.c:5:0
+        .cfi_startproc
+# %bb.0:
+        .loc    1 6 5 prologue_end              # example.c:6:5
+        movl    $1, %eax
+        retq
+.Ltmp1:
+.Lfunc_end1:
+        .size   passthrough, .Lfunc_end1-passthrough
+        .cfi_endproc
+                                        # -- End function
+        .section        .text.main,"ax",@progbits
+        .globl  main                            # -- Begin function main
+        .p2align        4, 0x90
+        .type   main,@function
+main:                                   # @main
+.Lfunc_begin2:
+        .loc    1 9 0                           # example.c:9:0
+        .cfi_startproc
+# %bb.0:
+        .loc    1 10 5 prologue_end             # example.c:10:5
+        xorl    %eax, %eax
+        retq
+.Ltmp2:
+.Lfunc_end2:
+        .size   main, .Lfunc_end2-main
+        .cfi_endproc
+                                        # -- End function
+        .section        .debug_abbrev,"",@progbits
+        .byte   1                               # Abbreviation Code
+        .byte   17                              # DW_TAG_compile_unit
+        .byte   1                               # DW_CHILDREN_yes
+        .byte   37                              # DW_AT_producer
+        .byte   14                              # DW_FORM_strp
+        .byte   19                              # DW_AT_language
+        .byte   5                               # DW_FORM_data2
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   16                              # DW_AT_stmt_list
+        .byte   23                              # DW_FORM_sec_offset
+        .byte   27                              # DW_AT_comp_dir
+        .byte   14                              # DW_FORM_strp
+        .byte   17                              # DW_AT_low_pc
+        .byte   1                               # DW_FORM_addr
+        .byte   85                              # DW_AT_ranges
+        .byte   23                              # DW_FORM_sec_offset
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   2                               # Abbreviation Code
+        .byte   46                              # DW_TAG_subprogram
+        .byte   0                               # DW_CHILDREN_no
+        .byte   17                              # DW_AT_low_pc
+        .byte   1                               # DW_FORM_addr
+        .byte   18                              # DW_AT_high_pc
+        .byte   6                               # DW_FORM_data4
+        .byte   64                              # DW_AT_frame_base
+        .byte   24                              # DW_FORM_exprloc
+        .ascii  "\227B"                         # DW_AT_GNU_all_call_sites
+        .byte   25                              # DW_FORM_flag_present
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   58                              # DW_AT_decl_file
+        .byte   11                              # DW_FORM_data1
+        .byte   59                              # DW_AT_decl_line
+        .byte   11                              # DW_FORM_data1
+        .byte   73                              # DW_AT_type
+        .byte   19                              # DW_FORM_ref4
+        .byte   63                              # DW_AT_external
+        .byte   25                              # DW_FORM_flag_present
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   3                               # Abbreviation Code
+        .byte   36                              # DW_TAG_base_type
+        .byte   0                               # DW_CHILDREN_no
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   62                              # DW_AT_encoding
+        .byte   11                              # DW_FORM_data1
+        .byte   11                              # DW_AT_byte_size
+        .byte   11                              # DW_FORM_data1
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   0                               # EOM(3)
+        .section        .debug_info,"",@progbits
+.Lcu_begin0:
+        .long   .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+        .short  4                               # DWARF version number
+        .long   .debug_abbrev                   # Offset Into Abbrev. Section
+        .byte   8                               # Address Size (in bytes)
+        .byte   1                               # Abbrev [1] 0xb:0x72 DW_TAG_compile_unit
+        .long   .Linfo_string0                  # DW_AT_producer
+        .short  12                              # DW_AT_language
+        .long   .Linfo_string1                  # DW_AT_name
+        .long   .Lline_table_start0             # DW_AT_stmt_list
+        .long   .Linfo_string2                  # DW_AT_comp_dir
+        .quad   0                               # DW_AT_low_pc
+        .long   .Ldebug_ranges0                 # DW_AT_ranges
+        .byte   2                               # Abbrev [2] 0x2a:0x19 DW_TAG_subprogram
+        .quad   .Lfunc_begin0                   # DW_AT_low_pc
+        .long   .Lfunc_end0-.Lfunc_begin0       # DW_AT_high_pc
+        .byte   1                               # DW_AT_frame_base
+        .byte   87
+                                        # DW_AT_GNU_all_call_sites
+        .long   .Linfo_string3                  # DW_AT_name
+        .byte   1                               # DW_AT_decl_file
+        .byte   1                               # DW_AT_decl_line
+        .long   117                             # DW_AT_type
+                                        # DW_AT_external
+        .byte   2                               # Abbrev [2] 0x43:0x19 DW_TAG_subprogram
+        .quad   .Lfunc_begin1                   # DW_AT_low_pc
+        .long   .Lfunc_end1-.Lfunc_begin1       # DW_AT_high_pc
+        .byte   1                               # DW_AT_frame_base
+        .byte   87
+                                        # DW_AT_GNU_all_call_sites
+        .long   .Linfo_string5                  # DW_AT_name
+        .byte   1                               # DW_AT_decl_file
+        .byte   5                               # DW_AT_decl_line
+        .long   117                             # DW_AT_type
+                                        # DW_AT_external
+        .byte   2                               # Abbrev [2] 0x5c:0x19 DW_TAG_subprogram
+        .quad   .Lfunc_begin2                   # DW_AT_low_pc
+        .long   .Lfunc_end2-.Lfunc_begin2       # DW_AT_high_pc
+        .byte   1                               # DW_AT_frame_base
+        .byte   87
+                                        # DW_AT_GNU_all_call_sites
+        .long   .Linfo_string6                  # DW_AT_name
+        .byte   1                               # DW_AT_decl_file
+        .byte   9                               # DW_AT_decl_line
+        .long   117                             # DW_AT_type
+                                        # DW_AT_external
+        .byte   3                               # Abbrev [3] 0x75:0x7 DW_TAG_base_type
+        .long   .Linfo_string4                  # DW_AT_name
+        .byte   5                               # DW_AT_encoding
+        .byte   4                               # DW_AT_byte_size
+        .byte   0                               # End Of Children Mark
+.Ldebug_info_end0:
+        .section        .debug_ranges,"",@progbits
+.Ldebug_ranges0:
+        .quad   .Lfunc_begin0
+        .quad   .Lfunc_end0
+        .quad   .Lfunc_begin1
+        .quad   .Lfunc_end1
+        .quad   .Lfunc_begin2
+        .quad   .Lfunc_end2
+        .quad   0
+        .quad   0
+        .section        .debug_str,"MS",@progbits,1
+.Linfo_string0:
+        .asciz  "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 4ba6a9c9f65bbc8bd06e3652cb20fd4dfc846137)" # string offset=0
+.Linfo_string1:
+        .asciz  "/app/example.c"                # string offset=105
+.Linfo_string2:
+        .asciz  "/app"                          # string offset=120
+.Linfo_string3:
+        .asciz  "source"                        # string offset=125
+.Linfo_string4:
+        .asciz  "int"                           # string offset=132
+.Linfo_string5:
+        .asciz  "passthrough"                   # string offset=136
+.Linfo_string6:
+        .asciz  "main"                          # string offset=148
+        .ident  "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 4ba6a9c9f65bbc8bd06e3652cb20fd4dfc846137)"
+        .section        ".note.GNU-stack","",@progbits
+        .addrsig
+        .section        .debug_line,"",@progbits
+.Lline_table_start0:

diff  --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 64806b554ee7c0b..42ef651851a4ac5 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -28,8 +28,10 @@
 #include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
 #include "llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h"
+#include "llvm/ExecutionEngine/Orc/PerfSupportPlugin.h"
 #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.h"
 #include "llvm/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -140,6 +142,11 @@ static cl::opt<bool>
                     cl::desc("Enable debugger suppport (default = !-noexec)"),
                     cl::init(true), cl::Hidden, cl::cat(JITLinkCategory));
 
+static cl::opt<bool> PerfSupport("perf-support",
+                                 cl::desc("Enable perf profiling support"),
+                                 cl::init(false), cl::Hidden,
+                                 cl::cat(JITLinkCategory));
+
 static cl::opt<bool>
     NoProcessSymbols("no-process-syms",
                      cl::desc("Do not resolve to llvm-jitlink process symbols"),
@@ -243,10 +250,14 @@ static cl::opt<bool> UseSharedMemory(
 static ExitOnError ExitOnErr;
 
 static LLVM_ATTRIBUTE_USED void linkComponents() {
-  errs() << (void *)&llvm_orc_registerEHFrameSectionWrapper
-         << (void *)&llvm_orc_deregisterEHFrameSectionWrapper
-         << (void *)&llvm_orc_registerJITLoaderGDBWrapper
-         << (void *)&llvm_orc_registerJITLoaderGDBAllocAction;
+  errs() << "Linking in runtime functions\n"
+         << (void *)&llvm_orc_registerEHFrameSectionWrapper << '\n'
+         << (void *)&llvm_orc_deregisterEHFrameSectionWrapper << '\n'
+         << (void *)&llvm_orc_registerJITLoaderGDBWrapper << '\n'
+         << (void *)&llvm_orc_registerJITLoaderGDBAllocAction << '\n'
+         << (void *)&llvm_orc_registerJITLoaderPerfStart << '\n'
+         << (void *)&llvm_orc_registerJITLoaderPerfEnd << '\n'
+         << (void *)&llvm_orc_registerJITLoaderPerfImpl << '\n';
 }
 
 static bool UseTestResultOverride = false;
@@ -979,6 +990,10 @@ Session::Session(std::unique_ptr<ExecutorProcessControl> EPC, Error &Err)
     ObjLayer.addPlugin(ExitOnErr(
         GDBJITDebugInfoRegistrationPlugin::Create(this->ES, *MainJD, TT)));
 
+  if (PerfSupport && TT.isOSBinFormatELF())
+    ObjLayer.addPlugin(ExitOnErr(PerfSupportPlugin::Create(
+        this->ES.getExecutorProcessControl(), *MainJD, true)));
+
   // Set up the platform.
   if (TT.isOSBinFormatMachO() && !OrcRuntime.empty()) {
     if (auto P =


        


More information about the llvm-commits mailing list