[llvm] 18eefc1 - Modify llvm-gsymutil lookups to handle overlapping ranges correctly. (#72350)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 17 10:31:16 PST 2023


Author: Greg Clayton
Date: 2023-11-17T10:31:12-08:00
New Revision: 18eefc186d75f60c9a828e13a8379769eb5c1812

URL: https://github.com/llvm/llvm-project/commit/18eefc186d75f60c9a828e13a8379769eb5c1812
DIFF: https://github.com/llvm/llvm-project/commit/18eefc186d75f60c9a828e13a8379769eb5c1812.diff

LOG: Modify llvm-gsymutil lookups to handle overlapping ranges correctly. (#72350)

llvm-gsymutil allows address ranges to overlap. There was a bug where if
we had debug info for a function with a range like [0x100-0x200) and a
symbol at the same start address yet with a larger range like
[0x100-0x300), we would randomly get either only information from the
first or second entry. This could cause lookups to fail due to the way
the binary search worked.

This patch makes sure that when lookups happen we find the first address
table entry that can match an address, and also ensures that we always
select the first FunctionInfo that could match. FunctionInfo entries are
sorted such that the most debug info rich entries come first. And if we
have two ranges that have the same start address, the smaller range
comes first and the larger one comes next. This patch also adds the
ability to iterate over all function infos with the same start address
to always find a range that contains the address.

Added a unit test to test this functionality that failed prior to this
fix and now succeeds.

Also fix an issue when dumping an entire GSYM file that has duplicate address entries where it used to always print out the binary search match for the FunctionInfo, not the actual data for the address index.

Added: 
    

Modified: 
    llvm/include/llvm/DebugInfo/GSYM/GsymReader.h
    llvm/lib/DebugInfo/GSYM/GsymReader.cpp
    llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h b/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h
index cd4fdfa0e9e37d9..c1bdc68d808c7ab 100644
--- a/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h
+++ b/llvm/include/llvm/DebugInfo/GSYM/GsymReader.h
@@ -106,6 +106,15 @@ class GsymReader {
   /// address.
   llvm::Expected<FunctionInfo> getFunctionInfo(uint64_t Addr) const;
 
+  /// Get the full function info given an address index.
+  ///
+  /// \param AddrIdx A address index for an address in the address table.
+  ///
+  /// \returns An expected FunctionInfo that contains the function info object
+  /// or an error object that indicates reason for failing get the function
+  /// info object.
+  llvm::Expected<FunctionInfo> getFunctionInfoAtIndex(uint64_t AddrIdx) const;
+
   /// Lookup an address in the a GSYM.
   ///
   /// Lookup just the information needed for a specific address \a Addr. This
@@ -266,6 +275,19 @@ class GsymReader {
       return std::nullopt;
     if (Iter == End || AddrOffset < *Iter)
       --Iter;
+
+    // GSYM files have sorted function infos with the most information (line
+    // table and/or inline info) first in the array of function infos, so
+    // always backup as much as possible as long as the address offset is the
+    // same as the previous entry.
+    while (Iter != Begin) {
+      auto Prev = Iter - 1;
+      if (*Prev == *Iter)
+        Iter = Prev;
+      else
+        break;
+    }
+
     return std::distance(Begin, Iter);
   }
 
@@ -303,6 +325,38 @@ class GsymReader {
   /// \returns An optional GSYM data offset for the offset of the FunctionInfo
   /// that needs to be decoded.
   std::optional<uint64_t> getAddressInfoOffset(size_t Index) const;
+
+  /// Given an address, find the correct function info data and function
+  /// address.
+  ///
+  /// Binary search the address table and find the matching address info
+  /// and make sure that the function info contains the address. GSYM allows
+  /// functions to overlap, and the most debug info is contained in the first
+  /// entries due to the sorting when GSYM files are created. We can have
+  /// multiple function info that start at the same address only if their
+  /// address range doesn't match. So find the first entry that matches \a Addr
+  /// and iterate forward until we find one that contains the address.
+  ///
+  /// \param[in] Addr A virtual address that matches the original object file
+  /// to lookup.
+  ///
+  /// \param[out] FuncStartAddr A virtual address that is the base address of
+  /// the function that is used for decoding the FunctionInfo.
+  ///
+  /// \returns An valid data extractor on success, or an error if we fail to
+  /// find the address in a function info or corrrectly decode the data
+  llvm::Expected<llvm::DataExtractor>
+  getFunctionInfoDataForAddress(uint64_t Addr, uint64_t &FuncStartAddr) const;
+
+  /// Get the function data and address given an address index.
+  ///
+  /// \param AddrIdx A address index from the address table.
+  ///
+  /// \returns An expected FunctionInfo that contains the function info object
+  /// or an error object that indicates reason for failing to lookup the
+  /// address.
+  llvm::Expected<llvm::DataExtractor>
+  getFunctionInfoDataAtIndex(uint64_t AddrIdx, uint64_t &FuncStartAddr) const;
 };
 
 } // namespace gsym

diff  --git a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
index 1fe90ef579a3d91..4b1b352466175c6 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
@@ -253,49 +253,94 @@ GsymReader::getAddressIndex(const uint64_t Addr) const {
 
 }
 
-llvm::Expected<FunctionInfo> GsymReader::getFunctionInfo(uint64_t Addr) const {
-  Expected<uint64_t> AddressIndex = getAddressIndex(Addr);
-  if (!AddressIndex)
-    return AddressIndex.takeError();
-  // Address info offsets size should have been checked in parse().
-  assert(*AddressIndex < AddrInfoOffsets.size());
-  auto AddrInfoOffset = AddrInfoOffsets[*AddressIndex];
-  assert(
-      (Endian == llvm::endianness::big || Endian == llvm::endianness::little) &&
-      "Endian must be either big or little");
-  DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset),
-                     Endian == llvm::endianness::little, 4);
-  if (std::optional<uint64_t> OptAddr = getAddress(*AddressIndex)) {
-    auto ExpectedFI = FunctionInfo::decode(Data, *OptAddr);
-    if (ExpectedFI) {
-      if (ExpectedFI->Range.contains(Addr) || ExpectedFI->Range.size() == 0)
-        return ExpectedFI;
-      return createStringError(std::errc::invalid_argument,
-                                "address 0x%" PRIx64 " is not in GSYM", Addr);
+llvm::Expected<DataExtractor>
+GsymReader::getFunctionInfoDataForAddress(uint64_t Addr,
+                                          uint64_t &FuncStartAddr) const {
+  Expected<uint64_t> ExpectedAddrIdx = getAddressIndex(Addr);
+  if (!ExpectedAddrIdx)
+    return ExpectedAddrIdx.takeError();
+  const uint64_t FirstAddrIdx = *ExpectedAddrIdx;
+  // The AddrIdx is the first index of the function info entries that match
+  // \a Addr. We need to iterate over all function info objects that start with
+  // the same address until we find a range that contains \a Addr.
+  std::optional<uint64_t> FirstFuncStartAddr;
+  const size_t NumAddresses = getNumAddresses();
+  for (uint64_t AddrIdx = FirstAddrIdx; AddrIdx < NumAddresses; ++AddrIdx) {
+    auto ExpextedData = getFunctionInfoDataAtIndex(AddrIdx, FuncStartAddr);
+    // If there was an error, return the error.
+    if (!ExpextedData)
+      return ExpextedData;
+
+    // Remember the first function start address if it hasn't already been set.
+    // If it is already valid, check to see if it matches the first function
+    // start address and only continue if it matches.
+    if (FirstFuncStartAddr.has_value()) {
+      if (*FirstFuncStartAddr != FuncStartAddr)
+        break; // Done with consecutive function entries with same address.
+    } else {
+      FirstFuncStartAddr = FuncStartAddr;
     }
+    // Make sure the current function address ranges contains \a Addr.
+    // Some symbols on Darwin don't have valid sizes, so if we run into a
+    // symbol with zero size, then we have found a match for our address.
+
+    // The first thing the encoding of a FunctionInfo object is the function
+    // size.
+    uint64_t Offset = 0;
+    uint32_t FuncSize = ExpextedData->getU32(&Offset);
+    if (FuncSize == 0 ||
+        AddressRange(FuncStartAddr, FuncStartAddr + FuncSize).contains(Addr))
+      return ExpextedData;
   }
   return createStringError(std::errc::invalid_argument,
-                           "failed to extract address[%" PRIu64 "]",
-                           *AddressIndex);
+                           "address 0x%" PRIx64 " is not in GSYM", Addr);
+}
+
+llvm::Expected<DataExtractor>
+GsymReader::getFunctionInfoDataAtIndex(uint64_t AddrIdx,
+                                       uint64_t &FuncStartAddr) const {
+  if (AddrIdx >= getNumAddresses())
+    return createStringError(std::errc::invalid_argument,
+                             "invalid address index %" PRIu64, AddrIdx);
+  const uint32_t AddrInfoOffset = AddrInfoOffsets[AddrIdx];
+  assert((Endian == endianness::big || Endian == endianness::little) &&
+         "Endian must be either big or little");
+  StringRef Bytes = MemBuffer->getBuffer().substr(AddrInfoOffset);
+  if (Bytes.empty())
+    return createStringError(std::errc::invalid_argument,
+                             "invalid address info offset 0x%" PRIx32,
+                             AddrInfoOffset);
+  std::optional<uint64_t> OptFuncStartAddr = getAddress(AddrIdx);
+  if (!OptFuncStartAddr)
+    return createStringError(std::errc::invalid_argument,
+                             "failed to extract address[%" PRIu64 "]", AddrIdx);
+  FuncStartAddr = *OptFuncStartAddr;
+  return DataExtractor(Bytes, Endian == llvm::endianness::little, 4);
+}
+
+llvm::Expected<FunctionInfo> GsymReader::getFunctionInfo(uint64_t Addr) const {
+  uint64_t FuncStartAddr = 0;
+  if (auto ExpectedData = getFunctionInfoDataForAddress(Addr, FuncStartAddr))
+    return FunctionInfo::decode(*ExpectedData, FuncStartAddr);
+  else
+    return ExpectedData.takeError();
+}
+
+llvm::Expected<FunctionInfo>
+GsymReader::getFunctionInfoAtIndex(uint64_t Idx) const {
+  uint64_t FuncStartAddr = 0;
+  if (auto ExpectedData = getFunctionInfoDataAtIndex(Idx, FuncStartAddr))
+    return FunctionInfo::decode(*ExpectedData, FuncStartAddr);
+  else
+    return ExpectedData.takeError();
 }
 
 llvm::Expected<LookupResult> GsymReader::lookup(uint64_t Addr) const {
-  Expected<uint64_t> AddressIndex = getAddressIndex(Addr);
-  if (!AddressIndex)
-    return AddressIndex.takeError();
-  // Address info offsets size should have been checked in parse().
-  assert(*AddressIndex < AddrInfoOffsets.size());
-  auto AddrInfoOffset = AddrInfoOffsets[*AddressIndex];
-  assert(
-      (Endian == llvm::endianness::big || Endian == llvm::endianness::little) &&
-      "Endian must be either big or little");
-  DataExtractor Data(MemBuffer->getBuffer().substr(AddrInfoOffset),
-                     Endian == llvm::endianness::little, 4);
-  if (std::optional<uint64_t> OptAddr = getAddress(*AddressIndex))
-    return FunctionInfo::lookup(Data, *this, *OptAddr, Addr);
-  return createStringError(std::errc::invalid_argument,
-                           "failed to extract address[%" PRIu64 "]",
-                           *AddressIndex);
+  uint64_t FuncStartAddr = 0;
+  if (auto ExpectedData = getFunctionInfoDataForAddress(Addr, FuncStartAddr))
+    return FunctionInfo::lookup(*ExpectedData, *this, FuncStartAddr, Addr);
+  else
+    return ExpectedData.takeError();
 }
 
 void GsymReader::dump(raw_ostream &OS) {
@@ -346,7 +391,7 @@ void GsymReader::dump(raw_ostream &OS) {
 
   for (uint32_t I = 0; I < Header.NumAddresses; ++I) {
     OS << "FunctionInfo @ " << HEX32(AddrInfoOffsets[I]) << ": ";
-    if (auto FI = getFunctionInfo(*getAddress(I)))
+    if (auto FI = getFunctionInfoAtIndex(I))
       dump(OS, *FI);
     else
       logAllUnhandledErrors(FI.takeError(), OS, "FunctionInfo:");

diff  --git a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
index 53de96cc6953c2d..f7ee7693ac2eead 100644
--- a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
+++ b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
@@ -4681,3 +4681,200 @@ TEST(GSYMTest, TestHandlingOfInvalidFileIndexes) {
   for (const auto &Error : ExpectedLogErrors)
     EXPECT_TRUE(errors.find(Error) != std::string::npos);
 }
+
+TEST(GSYMTest, TestLookupsOfOverlappingAndUnequalRanges) {
+  // Test that llvm-gsymutil lookup the correct funtion info when address
+  // ranges overlap. When functions overlap we always want to pick the first
+  // function info when symbolicating if there are multiple entries with the
+  // same address. Previous to this fix we would just binary search the address
+  // table and pick the first function info that matched the address. After
+  // this fix we now always select the first matching entry whose address range
+  // contains the lookup address to ensure we have the most debug info. We have
+  // seen case where the debug info would contain a small range and a symbol
+  // would have the same start address but the range was larger and sometimes,
+  // depending on how the binary search of the address table happened, we would
+  // pick these latter entries. We want the first entries because they always
+  // have the most debug info.
+  //
+  // To repro this case, we just make some simple DWARF that has two
+  // overlapping ranges and ensure that any lookups between 0x1000 and 0x104f
+  // match "foo", and any ranges between 0x1050 and 0x1fff match "bar".
+  //
+  // 0x0000000b: DW_TAG_compile_unit
+  //               DW_AT_name	("/tmp/main.cpp")
+  //               DW_AT_language	(DW_LANG_C)
+  //               DW_AT_stmt_list	(0x00000000)
+  //
+  // 0x00000015:   DW_TAG_subprogram
+  //                 DW_AT_name	("foo")
+  //                 DW_AT_low_pc	(0x0000000000001000)
+  //                 DW_AT_high_pc	(0x0000000000001050)
+  //
+  // 0x0000002a:   DW_TAG_subprogram
+  //                 DW_AT_name	("bar")
+  //                 DW_AT_low_pc	(0x0000000000001000)
+  //                 DW_AT_high_pc	(0x0000000000001100)
+  //
+  // 0x0000003f:   NULL
+
+  StringRef yamldata = R"(
+  debug_str:
+    - ''
+    - '/tmp/main.cpp'
+    - foo
+    - bar
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_udata
+            - Attribute:       DW_AT_stmt_list
+              Form:            DW_FORM_sec_offset
+        - Code:            0x2
+          Tag:             DW_TAG_subprogram
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_low_pc
+              Form:            DW_FORM_addr
+            - Attribute:       DW_AT_high_pc
+              Form:            DW_FORM_addr
+  debug_info:
+    - Length:          0x3C
+      Version:         4
+      AbbrevTableID:   0
+      AbbrOffset:      0x0
+      AddrSize:        8
+      Entries:
+        - AbbrCode:        0x1
+          Values:
+            - Value:           0x1
+            - Value:           0x2
+            - Value:           0x0
+        - AbbrCode:        0x2
+          Values:
+            - Value:           0xF
+            - Value:           0x1000
+            - Value:           0x1050
+        - AbbrCode:        0x2
+          Values:
+            - Value:           0x13
+            - Value:           0x1000
+            - Value:           0x1100
+        - AbbrCode:        0x0
+  debug_line:
+    - Length:          71
+      Version:         2
+      PrologueLength:  36
+      MinInstLength:   1
+      DefaultIsStmt:   1
+      LineBase:        251
+      LineRange:       14
+      OpcodeBase:      13
+      StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ]
+      IncludeDirs:
+        - '/tmp'
+      Files:
+        - Name:            main.cpp
+          DirIdx:          1
+          ModTime:         0
+          Length:          0
+      Opcodes:
+        - Opcode:          DW_LNS_extended_op
+          ExtLen:          9
+          SubOpcode:       DW_LNE_set_address
+          Data:            4096
+        - Opcode:          DW_LNS_advance_line
+          SData:           9
+          Data:            0
+        - Opcode:          DW_LNS_copy
+          Data:            0
+        - Opcode:          DW_LNS_advance_pc
+          Data:            16
+        - Opcode:          DW_LNS_advance_line
+          SData:           1
+          Data:            0
+        - Opcode:          DW_LNS_copy
+          Data:            0
+        - Opcode:          DW_LNS_advance_line
+          SData:           1
+          Data:            0
+        - Opcode:          DW_LNS_copy
+          Data:            0
+        - Opcode:          DW_LNS_advance_pc
+          Data:            64
+        - Opcode:          DW_LNS_advance_line
+          SData:           1
+          Data:            0
+        - Opcode:          DW_LNS_extended_op
+          ExtLen:          1
+          SubOpcode:       DW_LNE_end_sequence
+          Data:            0
+  )";
+  auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata);
+  ASSERT_THAT_EXPECTED(ErrOrSections, Succeeded());
+  std::unique_ptr<DWARFContext> DwarfContext =
+      DWARFContext::create(*ErrOrSections, 8);
+  ASSERT_TRUE(DwarfContext.get() != nullptr);
+  std::string errors;
+  raw_string_ostream OS(errors);
+  GsymCreator GC;
+  DwarfTransformer DT(*DwarfContext, GC);
+  const uint32_t ThreadCount = 1;
+  ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
+  ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+  OS.flush();
+  SmallString<512> Str;
+  raw_svector_ostream OutStrm(Str);
+  const auto ByteOrder = llvm::endianness::native;
+  FileWriter FW(OutStrm, ByteOrder);
+  ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
+  Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
+  ASSERT_THAT_EXPECTED(GR, Succeeded());
+  // There should be two functions in our GSYM.
+  EXPECT_EQ(GR->getNumAddresses(), 2u);
+  // Verify "foo" is correctly looked up for each of its addresses.
+  for (uint64_t Addr = 0x1000; Addr < 0x1050; ++Addr) {
+    auto ExpFI = GR->getFunctionInfo(Addr);
+    ASSERT_THAT_EXPECTED(ExpFI, Succeeded());
+    ASSERT_EQ(ExpFI->Range, AddressRange(0x1000, 0x1050));
+    StringRef FuncName = GR->getString(ExpFI->Name);
+    EXPECT_EQ(FuncName, "foo");
+  }
+
+  // Verify "bar" is correctly looked up for each of its addresses.
+  for (uint64_t Addr = 0x1050; Addr < 0x1100; ++Addr) {
+    auto ExpFI = GR->getFunctionInfo(Addr);
+    ASSERT_THAT_EXPECTED(ExpFI, Succeeded());
+    ASSERT_EQ(ExpFI->Range, AddressRange(0x1000, 0x1100));
+    StringRef FuncName = GR->getString(ExpFI->Name);
+    EXPECT_EQ(FuncName, "bar");
+  }
+
+  // Prior to the fix for this issue when we dumped an entire GSYM file, we
+  // were using a function that would extract a FunctionInfo object for a
+  // given address which caused us to always dump the first FunctionInfo
+  // entry for a given address. We now dump it correctly using an address
+  // index. Below we verify that we dump the right FunctionInfo gets dumped.
+
+  SmallString<512> DumpStr;
+  raw_svector_ostream DumpStrm(DumpStr);
+  GR->dump(DumpStrm);
+
+  // Make sure we see both "foo" and "bar" in the output of an entire GSYM
+  // dump. Prior to this fix we would two "foo" entries.
+  std::vector<std::string> ExpectedDumpLines = {
+      "@ 0x00000068: [0x0000000000001000 - 0x0000000000001050) \"foo\"",
+      "@ 0x00000088: [0x0000000000001000 - 0x0000000000001100) \"bar\""};
+  // Make sure all expected errors are in the error stream for the two invalid
+  // inlined functions that we removed due to invalid range scoping.
+  for (const auto &Line : ExpectedDumpLines)
+    EXPECT_TRUE(DumpStr.find(Line) != std::string::npos);
+}


        


More information about the llvm-commits mailing list