[llvm] 40ca411 - [llvm-profgen] Switch to DWARF-based symbol and ranges

via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 29 09:59:55 PDT 2021


Author: wlei
Date: 2021-10-29T09:59:12-07:00
New Revision: 40ca4112515d03bbcf594bd2dfa6b4394d5b00d6

URL: https://github.com/llvm/llvm-project/commit/40ca4112515d03bbcf594bd2dfa6b4394d5b00d6
DIFF: https://github.com/llvm/llvm-project/commit/40ca4112515d03bbcf594bd2dfa6b4394d5b00d6.diff

LOG: [llvm-profgen] Switch to DWARF-based symbol and ranges

We hit a bug where some callsite names in the profile were not real functions. It turned out that the ELF text section contains some non-function symbols, e.g. globally accessible branch labels; in addition, a single function can be split into multiple ranges. We shouldn't count samples for symbols that are not the entry of a real function.

So this change fixes the issue by switching to the names and ranges from DWARF-based debug info, whose ranges ensure that a range start is a real function start. For split functions, we assume that the real entry function's DWARF name always matches the symbol table name.

The switch is also consistent with the body samples' symbols, which come from DWARF.

Reviewed By: hoy, wenlei

Differential Revision: https://reviews.llvm.org/D112282

Added: 
    

Modified: 
    llvm/tools/llvm-profgen/CMakeLists.txt
    llvm/tools/llvm-profgen/ProfileGenerator.cpp
    llvm/tools/llvm-profgen/ProfiledBinary.cpp
    llvm/tools/llvm-profgen/ProfiledBinary.h

Removed: 
    


################################################################################
diff  --git a/llvm/tools/llvm-profgen/CMakeLists.txt b/llvm/tools/llvm-profgen/CMakeLists.txt
index 125b36b401292..b3e05a94856eb 100644
--- a/llvm/tools/llvm-profgen/CMakeLists.txt
+++ b/llvm/tools/llvm-profgen/CMakeLists.txt
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
   AllTargetsDescs
   AllTargetsDisassemblers
   AllTargetsInfos
+  DebugInfoDWARF
   Core
   MC
   IPO

diff  --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
index ac8cdec66eb32..ea0d7a1c29339 100644
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -348,26 +348,14 @@ FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
 RangeSample
 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
   RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
-  // For each range, we search for the range of the function it belongs to and
+  // For each range, we search for all ranges of the function it belongs to and
   // initialize it with zero count, so it remains zero if doesn't hit any
   // samples. This is to be consistent with compiler that interpret zero count
   // as unexecuted(cold).
   for (auto I : RangeCounter) {
-    uint64_t RangeBegin = I.first.first;
-    uint64_t RangeEnd = I.first.second;
-    // Find the function offset range the current range begin belongs to.
-    auto FuncRange = Binary->findFuncOffsetRange(RangeBegin);
-    if (FuncRange.second == 0)
-      WithColor::warning()
-          << "[" << format("%8" PRIx64, RangeBegin) << " - "
-          << format("%8" PRIx64, RangeEnd)
-          << "]: Invalid range or disassembling error in profiled binary.\n";
-    else if (RangeEnd > FuncRange.second)
-      WithColor::warning() << "[" << format("%8" PRIx64, RangeBegin) << " - "
-                           << format("%8" PRIx64, RangeEnd)
-                           << "]: Range is across different functions.\n";
-    else
-      Ranges[FuncRange] += 0;
+    uint64_t StartOffset = I.first.first;
+    for (const auto &Range : Binary->getRangesForOffset(StartOffset))
+      Ranges[{Range.first, Range.second - 1}] += 0;
   }
   RangeSample DisjointRanges;
   findDisjointRanges(DisjointRanges, Ranges);
@@ -401,21 +389,16 @@ void ProfileGenerator::populateBodySamplesForAllFunctions(
   }
 }
 
-static bool isOutlinedFunction(StringRef CalleeName) {
-  // Check whether it's from hot-cold func split or coro split.
-  return CalleeName.contains(".resume") || CalleeName.contains(".cold");
-}
-
 StringRef ProfileGeneratorBase::getCalleeNameForOffset(uint64_t TargetOffset) {
-  // Get the callee name by branch target if it's a call branch.
-  StringRef CalleeName = FunctionSamples::getCanonicalFnName(
-      Binary->getFuncFromStartOffset(TargetOffset));
+  // Get the function range by branch target if it's a call branch.
+  auto *FRange = Binary->findFuncRangeForStartOffset(TargetOffset);
 
-  // We won't accumulate sample count againt outlined function.
-  if (CalleeName.size() == 0 || isOutlinedFunction(CalleeName))
+  // We won't accumulate sample count for a range whose start is not the real
+  // function entry such as outlined function or inner labels.
+  if (!FRange || !FRange->IsFuncEntry)
     return StringRef();
 
-  return CalleeName;
+  return FunctionSamples::getCanonicalFnName(FRange->getFuncName());
 }
 
 void ProfileGenerator::populateBoundarySamplesForAllFunctions(
@@ -482,20 +465,21 @@ void CSProfileGenerator::generateProfile() {
 void CSProfileGenerator::computeSizeForProfiledFunctions() {
   // Hash map to deduplicate the function range and the item is a pair of
   // function start and end offset.
-  std::unordered_map<uint64_t, uint64_t> FuncRanges;
+  std::unordered_map<uint64_t, uint64_t> AggregatedRanges;
   // Go through all the ranges in the CS counters, use the start of the range to
   // look up the function it belongs and record the function range.
   for (const auto &CI : SampleCounters) {
     for (auto Item : CI.second.RangeCounter) {
       // FIXME: Filter the bogus crossing function range.
-      uint64_t RangeStartOffset = Item.first.first;
-      auto FuncRange = Binary->findFuncOffsetRange(RangeStartOffset);
-      if (FuncRange.second != 0)
-        FuncRanges[FuncRange.first] = FuncRange.second;
+      uint64_t StartOffset = Item.first.first;
+      // Note that a function can be spilt into multiple ranges, so get all
+      // ranges of the function.
+      for (const auto &Range : Binary->getRangesForOffset(StartOffset))
+        AggregatedRanges[Range.first] = Range.second;
     }
   }
 
-  for (auto I : FuncRanges) {
+  for (auto I : AggregatedRanges) {
     uint64_t StartOffset = I.first;
     uint64_t EndOffset = I.second;
     Binary->computeInlinedContextSizeForRange(StartOffset, EndOffset);

diff  --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
index bbc26d1e6315e..e9c18cd4b1fac 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp
@@ -175,6 +175,9 @@ void ProfiledBinary::load() {
   // Decode pseudo probe related section
   decodePseudoProbe(Obj);
 
+  // Load debug info of subprograms from DWARF section.
+  loadSymbolsFromDWARF(*dyn_cast<ObjectFile>(&Binary));
+
   // Disassemble the text sections.
   disassemble(Obj);
 
@@ -183,7 +186,7 @@ void ProfiledBinary::load() {
     FuncSizeTracker.trackInlineesOptimizedAway(ProbeDecoder);
 
   // Use function start and return address to infer prolog and epilog
-  ProEpilogTracker.inferPrologOffsets(FuncStartOffsetMap);
+  ProEpilogTracker.inferPrologOffsets(StartOffset2FuncRangeMap);
   ProEpilogTracker.inferEpilogOffsets(RetAddrs);
 
   // TODO: decode other sections.
@@ -306,6 +309,20 @@ void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
     ProbeDecoder.printGUID2FuncDescMap(outs());
 }
 
+void ProfiledBinary::setIsFuncEntry(uint64_t Offset, StringRef RangeSymName) {
+  // Note that the start offset of each ELF section can be a non-function
+  // symbol, we need to binary search for the start of a real function range.
+  auto *FuncRange = findFuncRangeForOffset(Offset);
+  // Skip external function symbol.
+  if (!FuncRange)
+    return;
+
+  // Set IsFuncEntry to ture if the RangeSymName from ELF is equal to its
+  // DWARF-based function name.
+  if (!FuncRange->IsFuncEntry && FuncRange->getFuncName() == RangeSymName)
+    FuncRange->IsFuncEntry = true;
+}
+
 bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
                                         SectionSymbolsTy &Symbols,
                                         const SectionRef &Section) {
@@ -316,7 +333,7 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
   uint64_t NextStartOffset =
       (SI + 1 < SE) ? Symbols[SI + 1].Addr - getPreferredBaseAddress()
                     : SectionOffset + SectSize;
-  if (StartOffset >= NextStartOffset)
+  if (StartOffset > NextStartOffset)
     return true;
 
   StringRef SymbolName =
@@ -404,8 +421,8 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
   if (ShowDisassembly)
     outs() << "\n";
 
-  FuncStartOffsetMap.emplace(StartOffset,
-                             std::make_pair(Symbols[SI].Name.str(), EndOffset));
+  setIsFuncEntry(StartOffset, Symbols[SI].Name);
+
   return true;
 }
 
@@ -517,6 +534,71 @@ void ProfiledBinary::disassemble(const ELFObjectFileBase *Obj) {
   }
 }
 
+void ProfiledBinary::loadSymbolsFromDWARF(ObjectFile &Obj) {
+  auto DebugContext = llvm::DWARFContext::create(Obj);
+  if (!DebugContext)
+    exitWithError("Misssing debug info.", Path);
+
+  for (const auto &CompilationUnit : DebugContext->compile_units()) {
+    for (const auto &DieInfo : CompilationUnit->dies()) {
+      llvm::DWARFDie Die(CompilationUnit.get(), &DieInfo);
+
+      if (!Die.isSubprogramDIE())
+        continue;
+      auto Name = Die.getName(llvm::DINameKind::LinkageName);
+      if (!Name)
+        Name = Die.getName(llvm::DINameKind::ShortName);
+      if (!Name)
+        continue;
+
+      auto RangesOrError = Die.getAddressRanges();
+      if (!RangesOrError)
+        continue;
+      const DWARFAddressRangesVector &Ranges = RangesOrError.get();
+
+      if (Ranges.empty())
+        continue;
+
+      // Different DWARF symbols can have same function name, search or create
+      // BinaryFunction indexed by the name.
+      auto Ret = BinaryFunctions.emplace(Name, BinaryFunction());
+      auto &Func = Ret.first->second;
+      if (Ret.second)
+        Func.FuncName = Ret.first->first;
+
+      for (const auto &Range : Ranges) {
+        uint64_t FuncStart = Range.LowPC;
+        uint64_t FuncSize = Range.HighPC - FuncStart;
+
+        if (FuncSize == 0 || FuncStart < getPreferredBaseAddress())
+          continue;
+
+        uint64_t StartOffset = FuncStart - getPreferredBaseAddress();
+        uint64_t EndOffset = Range.HighPC - getPreferredBaseAddress();
+
+        // We may want to know all ranges for one function. Here group the
+        // ranges and store them into BinaryFunction.
+        Func.Ranges.emplace_back(StartOffset, EndOffset);
+
+        auto R = StartOffset2FuncRangeMap.emplace(StartOffset, FuncRange());
+        if (R.second) {
+          FuncRange &FRange = R.first->second;
+          FRange.Func = &Func;
+          FRange.StartOffset = StartOffset;
+          FRange.EndOffset = EndOffset;
+        } else {
+          WithColor::warning()
+              << "Duplicated symbol start address at "
+              << format("%8" PRIx64, StartOffset + getPreferredBaseAddress())
+              << " " << R.first->second.getFuncName() << " and " << Name
+              << "\n";
+        }
+      }
+    }
+  }
+  assert(!StartOffset2FuncRangeMap.empty() && "Misssing debug info.");
+}
+
 void ProfiledBinary::setupSymbolizer() {
   symbolize::LLVMSymbolizer::Options SymbolizerOpts;
   SymbolizerOpts.PrintFunctions =
@@ -576,7 +658,7 @@ void ProfiledBinary::computeInlinedContextSizeForRange(uint64_t StartOffset,
                          << format("%8" PRIx64, StartOffset) << "\n";
 
   uint64_t Offset = CodeAddrOffsets[Index];
-  while (Offset <= EndOffset) {
+  while (Offset < EndOffset) {
     const SampleContextFrameVector &SymbolizedCallStack =
         getFrameLocationStack(Offset, UsePseudoProbes);
     uint64_t Size = Offset2InstSizeMap[Offset];

diff  --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h
index 64759cad8fa92..ba7c70343132c 100644
--- a/llvm/tools/llvm-profgen/ProfiledBinary.h
+++ b/llvm/tools/llvm-profgen/ProfiledBinary.h
@@ -12,6 +12,7 @@
 #include "CallContext.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/Symbolize/Symbolize.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -68,6 +69,27 @@ struct InstructionPointer {
   void update(uint64_t Addr);
 };
 
+using RangesTy = std::vector<std::pair<uint64_t, uint64_t>>;
+
+struct BinaryFunction {
+  StringRef FuncName;
+  RangesTy Ranges;
+};
+
+// Info about function range. A function can be split into multiple
+// non-continuous ranges, each range corresponds to one FuncRange.
+struct FuncRange {
+  uint64_t StartOffset;
+  // EndOffset is a exclusive bound.
+  uint64_t EndOffset;
+  // Function the range belongs to
+  BinaryFunction *Func;
+  // Whether the start offset is the real entry of the function.
+  bool IsFuncEntry = false;
+
+  StringRef getFuncName() { return Func->FuncName; }
+};
+
 // PrologEpilog offset tracker, used to filter out broken stack samples
 // Currently we use a heuristic size (two) to infer prolog and epilog
 // based on the start address and return address. In the future,
@@ -79,8 +101,7 @@ struct PrologEpilogTracker {
   PrologEpilogTracker(ProfiledBinary *Bin) : Binary(Bin){};
 
   // Take the two addresses from the start of function as prolog
-  void inferPrologOffsets(std::map<uint64_t, std::pair<std::string, uint64_t>>
-                              &FuncStartOffsetMap) {
+  void inferPrologOffsets(std::map<uint64_t, FuncRange> &FuncStartOffsetMap) {
     for (auto I : FuncStartOffsetMap) {
       PrologEpilogSet.insert(I.first);
       InstructionPointer IP(Binary, I.first);
@@ -164,9 +185,15 @@ class ProfiledBinary {
   // A list of text sections sorted by start RVA and size. Used to check
   // if a given RVA is a valid code address.
   std::set<std::pair<uint64_t, uint64_t>> TextSections;
-  // An ordered map of mapping function's start offset to its name and
-  // end offset.
-  std::map<uint64_t, std::pair<std::string, uint64_t>> FuncStartOffsetMap;
+
+  // A map of mapping function name to BinaryFunction info.
+  std::unordered_map<std::string, BinaryFunction> BinaryFunctions;
+
+  // An ordered map of mapping function's start offset to function range
+  // relevant info. Currently to determine if the offset of ELF is the start of
+  // a real function, we leverage the function range info from DWARF.
+  std::map<uint64_t, FuncRange> StartOffset2FuncRangeMap;
+
   // Offset to context location map. Used to expand the context.
   std::unordered_map<uint64_t, SampleContextFrameVector> Offset2LocStackMap;
 
@@ -221,6 +248,14 @@ class ProfiledBinary {
   void setUpDisassembler(const ELFObjectFileBase *Obj);
   void setupSymbolizer();
 
+  // Load debug info of subprograms from DWARF section.
+  void loadSymbolsFromDWARF(ObjectFile &Obj);
+
+  // A function may be spilt into multiple non-continuous address ranges. We use
+  // this to set whether start offset of a function is the real entry of the
+  // function and also set false to the non-function label.
+  void setIsFuncEntry(uint64_t Offset, StringRef RangeSymName);
+
   /// Dissassemble the text section and build various address maps.
   void disassemble(const ELFObjectFileBase *O);
 
@@ -313,19 +348,34 @@ class ProfiledBinary {
     return 0;
   }
 
-  StringRef getFuncFromStartOffset(uint64_t Offset) {
-    auto I = FuncStartOffsetMap.find(Offset);
-    if (I == FuncStartOffsetMap.end())
-      return StringRef();
-    return I->second.first;
+  FuncRange *findFuncRangeForStartOffset(uint64_t Offset) {
+    auto I = StartOffset2FuncRangeMap.find(Offset);
+    if (I == StartOffset2FuncRangeMap.end())
+      return nullptr;
+    return &I->second;
   }
 
-  OffsetRange findFuncOffsetRange(uint64_t Offset) {
-    auto I = FuncStartOffsetMap.upper_bound(Offset);
-    if (I == FuncStartOffsetMap.begin())
-      return {0, 0};
+  // Binary search the function range which includes the input offset.
+  FuncRange *findFuncRangeForOffset(uint64_t Offset) {
+    auto I = StartOffset2FuncRangeMap.upper_bound(Offset);
+    if (I == StartOffset2FuncRangeMap.begin())
+      return nullptr;
     I--;
-    return {I->first, I->second.second};
+
+    if (Offset >= I->second.EndOffset)
+      return nullptr;
+
+    return &I->second;
+  }
+
+  // Get all ranges of one function.
+  RangesTy getRangesForOffset(uint64_t Offset) {
+    auto *FRange = findFuncRangeForOffset(Offset);
+    // Ignore the range which falls into plt section or system lib.
+    if (!FRange)
+      return RangesTy();
+
+    return FRange->Func->Ranges;
   }
 
   uint32_t getFuncSizeForContext(SampleContext &Context) {


        


More information about the llvm-commits mailing list