[Lldb-commits] [lldb] d77ccfd - Create synthetic symbol names on demand to improve memory consumption and startup times.
    Greg Clayton via lldb-commits 
    lldb-commits at lists.llvm.org
       
    Mon Jun 28 18:05:00 PDT 2021
    
    
  
Author: Greg Clayton
Date: 2021-06-28T18:04:51-07:00
New Revision: d77ccfdc72182cf7ca1bbf6b8b47e062766a9f1f
URL: https://github.com/llvm/llvm-project/commit/d77ccfdc72182cf7ca1bbf6b8b47e062766a9f1f
DIFF: https://github.com/llvm/llvm-project/commit/d77ccfdc72182cf7ca1bbf6b8b47e062766a9f1f.diff
LOG: Create synthetic symbol names on demand to improve memory consumption and startup times.
This fix was created after profiling the target creation of a large C/C++/ObjC application that contained almost 4,000,000 redacted symbol names. The symbol table parsing code was creating names for each of these synthetic symbols and adding them to the name indexes. The code was also adding the object file basename to the end of the symbol name which doesn't allow symbols from different shared libraries to share the names in the constant string pool.
Prior to this fix this was creating 180MB of "___lldb_unnamed_symbol" symbol names and was taking a long time to generate each name, add them to the string pool and then add each of these names to the name index.
This patch fixes the issue by:
- not adding a name to synthetic symbols at creation time, and allows name to be dynamically generated when accessed
- doesn't add synthetic symbol names to the name indexes, but catches this special case as name lookup time. Users won't typically set breakpoints or lookup these synthetic names, but support was added to do the lookup in case it does happen
- removes the object file baseanme from the generated names to allow the names to be shared in the constant string pool
Prior to this fix the startup times for a large application was:
12.5 seconds (cold file caches)
8.5 seconds (warm file caches)
After this fix:
9.7 seconds (cold file caches)
5.7 seconds (warm file caches)
The names of the symbols are auto generated by appending the symbol's UserID to the end of the "___lldb_unnamed_symbol" string and is only done when the name is requested from a synthetic symbol if it has no name.
Differential Revision: https://reviews.llvm.org/D104488
Added: 
    
Modified: 
    lldb/include/lldb/Symbol/ObjectFile.h
    lldb/include/lldb/Symbol/Symbol.h
    lldb/include/lldb/Symbol/Symtab.h
    lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
    lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
    lldb/source/Symbol/ObjectFile.cpp
    lldb/source/Symbol/Symbol.cpp
    lldb/source/Symbol/Symtab.cpp
Removed: 
    
################################################################################
diff  --git a/lldb/include/lldb/Symbol/ObjectFile.h b/lldb/include/lldb/Symbol/ObjectFile.h
index 1e29cf53b78b3..dc83565c7db52 100644
--- a/lldb/include/lldb/Symbol/ObjectFile.h
+++ b/lldb/include/lldb/Symbol/ObjectFile.h
@@ -712,8 +712,6 @@ class ObjectFile : public std::enable_shared_from_this<ObjectFile>,
   ///     false otherwise.
   bool SetModulesArchitecture(const ArchSpec &new_arch);
 
-  ConstString GetNextSyntheticSymbolName();
-
   static lldb::DataBufferSP MapFileData(const FileSpec &file, uint64_t Size,
                                         uint64_t Offset);
 
diff  --git a/lldb/include/lldb/Symbol/Symbol.h b/lldb/include/lldb/Symbol/Symbol.h
index 3abe3114863de..be3e8abefa490 100644
--- a/lldb/include/lldb/Symbol/Symbol.h
+++ b/lldb/include/lldb/Symbol/Symbol.h
@@ -113,14 +113,20 @@ class Symbol : public SymbolContextScope {
   lldb::LanguageType GetLanguage() const {
     // TODO: See if there is a way to determine the language for a symbol
     // somehow, for now just return our best guess
-    return m_mangled.GuessLanguage();
+    return GetMangled().GuessLanguage();
   }
 
   void SetID(uint32_t uid) { m_uid = uid; }
 
-  Mangled &GetMangled() { return m_mangled; }
+  Mangled &GetMangled() {
+    SynthesizeNameIfNeeded();
+    return m_mangled;
+  }
 
-  const Mangled &GetMangled() const { return m_mangled; }
+  const Mangled &GetMangled() const {
+    SynthesizeNameIfNeeded();
+    return m_mangled;
+  }
 
   ConstString GetReExportedSymbolName() const;
 
@@ -166,9 +172,9 @@ class Symbol : public SymbolContextScope {
   bool IsTrampoline() const;
 
   bool IsIndirect() const;
-  
+
   bool IsWeak() const { return m_is_weak; }
-  
+
   void SetIsWeak (bool b) { m_is_weak = b; }
 
   bool GetByteSizeIsValid() const { return m_size_is_valid; }
@@ -223,6 +229,10 @@ class Symbol : public SymbolContextScope {
 
   bool ContainsFileAddress(lldb::addr_t file_addr) const;
 
+  static llvm::StringRef GetSyntheticSymbolPrefix() {
+    return "___lldb_unnamed_symbol";
+  }
+
 protected:
   // This is the internal guts of ResolveReExportedSymbol, it assumes
   // reexport_name is not null, and that module_spec is valid.  We track the
@@ -233,6 +243,8 @@ class Symbol : public SymbolContextScope {
       lldb_private::ModuleSpec &module_spec,
       lldb_private::ModuleList &seen_modules) const;
 
+  void SynthesizeNameIfNeeded() const;
+
   uint32_t m_uid =
       UINT32_MAX;           // User ID (usually the original symbol table index)
   uint16_t m_type_data = 0; // data specific to m_type
@@ -258,7 +270,7 @@ class Symbol : public SymbolContextScope {
                                          // doing name lookups
       m_is_weak : 1,
       m_type : 6;            // Values from the lldb::SymbolType enum.
-  Mangled m_mangled;         // uniqued symbol name/mangled name pair
+  mutable Mangled m_mangled; // uniqued symbol name/mangled name pair
   AddressRange m_addr_range; // Contains the value, or the section offset
                              // address when the value is an address in a
                              // section, and the size (if any)
diff  --git a/lldb/include/lldb/Symbol/Symtab.h b/lldb/include/lldb/Symbol/Symtab.h
index fbfa3a5e0cec7..e1ad0dfd2eb8d 100644
--- a/lldb/include/lldb/Symbol/Symtab.h
+++ b/lldb/include/lldb/Symbol/Symtab.h
@@ -219,6 +219,26 @@ class Symtab {
     return false;
   }
 
+  /// A helper function that looks up full function names.
+  ///
+  /// We generate unique names for synthetic symbols so that users can look
+  /// them up by name when needed. But because doing so is uncommon in normal
+  /// debugger use, we trade off some performance at lookup time for faster
+  /// symbol table building by detecting these symbols and generating their
+  /// names lazily, rather than adding them to the normal symbol indexes. This
+  /// function does the job of first consulting the name indexes, and if that
+  /// fails it extracts the information it needs from the synthetic name and
+  /// locates the symbol.
+  ///
+  /// @param[in] symbol_name The symbol name to search for.
+  ///
+  /// @param[out] indexes The vector if symbol indexes to update with results.
+  ///
+  /// @returns The number of indexes added to the index vector. Zero if no
+  /// matches were found.
+  uint32_t GetNameIndexes(ConstString symbol_name,
+                          std::vector<uint32_t> &indexes);
+
   void SymbolIndicesToSymbolContextList(std::vector<uint32_t> &symbol_indexes,
                                         SymbolContextList &sc_list);
 
diff  --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
index be73d38961ea6..edf87f036f0f5 100644
--- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
+++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -1880,7 +1880,7 @@ void ObjectFileELF::CreateSections(SectionList &unified_section_list) {
           unified_section_list.AddSection(symtab_section_sp);
       }
     }
-  }  
+  }
 }
 
 std::shared_ptr<ObjectFileELF> ObjectFileELF::GetGnuDebugDataObjectFile() {
@@ -2813,20 +2813,24 @@ Symtab *ObjectFileELF::GetSymtab() {
       if (is_valid_entry_point && !m_symtab_up->FindSymbolContainingFileAddress(
                                       entry_point_file_addr)) {
         uint64_t symbol_id = m_symtab_up->GetNumSymbols();
-        Symbol symbol(symbol_id,
-                      GetNextSyntheticSymbolName().GetCString(), // Symbol name.
-                      eSymbolTypeCode, // Type of this symbol.
-                      true,            // Is this globally visible?
-                      false,           // Is this symbol debug info?
-                      false,           // Is this symbol a trampoline?
-                      true,            // Is this symbol artificial?
-                      entry_point_addr.GetSection(), // Section where this
-                                                     // symbol is defined.
-                      0,     // Offset in section or symbol value.
-                      0,     // Size.
-                      false, // Size is valid.
-                      false, // Contains linker annotations?
-                      0);    // Symbol flags.
+        // Don't set the name for any synthetic symbols, the Symbol
+        // object will generate one if needed when the name is accessed
+        // via accessors.
+        SectionSP section_sp = entry_point_addr.GetSection();
+        Symbol symbol(
+            /*symID=*/symbol_id,
+            /*name=*/llvm::StringRef(), // Name will be auto generated.
+            /*type=*/eSymbolTypeCode,
+            /*external=*/true,
+            /*is_debug=*/false,
+            /*is_trampoline=*/false,
+            /*is_artificial=*/true,
+            /*section_sp=*/section_sp,
+            /*offset=*/entry_point_addr.GetOffset(),
+            /*size=*/0, // FDE can span multiple symbols so don't use its size.
+            /*size_is_valid=*/false,
+            /*contains_linker_annotations=*/false,
+            /*flags=*/0);
         m_symtab_up->AddSymbol(symbol);
         // When the entry point is arm thumb we need to explicitly set its
         // class address to reflect that. This is important because expression
@@ -2917,22 +2921,24 @@ void ObjectFileELF::ParseUnwindSymbols(Symtab *symbol_table,
           section_list->FindSectionContainingFileAddress(file_addr);
       if (section_sp) {
         addr_t offset = file_addr - section_sp->GetFileAddress();
-        const char *symbol_name = GetNextSyntheticSymbolName().GetCString();
         uint64_t symbol_id = ++last_symbol_id;
+        // Don't set the name for any synthetic symbols, the Symbol
+        // object will generate one if needed when the name is accessed
+        // via accessors.
         Symbol eh_symbol(
-            symbol_id,       // Symbol table index.
-            symbol_name,     // Symbol name.
-            eSymbolTypeCode, // Type of this symbol.
-            true,            // Is this globally visible?
-            false,           // Is this symbol debug info?
-            false,           // Is this symbol a trampoline?
-            true,            // Is this symbol artificial?
-            section_sp,      // Section in which this symbol is defined or null.
-            offset,          // Offset in section or symbol value.
-            0,     // Size:          Don't specify the size as an FDE can
-            false, // Size is valid: cover multiple symbols.
-            false, // Contains linker annotations?
-            0);    // Symbol flags.
+            /*symID=*/symbol_id,
+            /*name=*/llvm::StringRef(), // Name will be auto generated.
+            /*type=*/eSymbolTypeCode,
+            /*external=*/true,
+            /*is_debug=*/false,
+            /*is_trampoline=*/false,
+            /*is_artificial=*/true,
+            /*section_sp=*/section_sp,
+            /*offset=*/offset,
+            /*size=*/0, // FDE can span multiple symbols so don't use its size.
+            /*size_is_valid=*/false,
+            /*contains_linker_annotations=*/false,
+            /*flags=*/0);
         new_symbols.push_back(eh_symbol);
       }
     }
diff  --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index e7652cffb1c81..72389e9fd5c67 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -4696,8 +4696,10 @@ size_t ObjectFileMachO::ParseSymtab() {
                 symbol_byte_size = section_end_file_addr - symbol_file_addr;
               }
               sym[sym_idx].SetID(synthetic_sym_id++);
-              sym[sym_idx].GetMangled().SetDemangledName(
-                  GetNextSyntheticSymbolName());
+              // Don't set the name for any synthetic symbols, the Symbol
+              // object will generate one if needed when the name is accessed
+              // via accessors.
+              sym[sym_idx].GetMangled().SetDemangledName(ConstString());
               sym[sym_idx].SetType(eSymbolTypeCode);
               sym[sym_idx].SetIsSynthetic(true);
               sym[sym_idx].GetAddressRef() = symbol_addr;
diff  --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp
index b0fdd50b3c0f1..101af01341a20 100644
--- a/lldb/source/Symbol/ObjectFile.cpp
+++ b/lldb/source/Symbol/ObjectFile.cpp
@@ -616,16 +616,6 @@ ObjectFile::GetSymbolTypeFromName(llvm::StringRef name,
   return symbol_type_hint;
 }
 
-ConstString ObjectFile::GetNextSyntheticSymbolName() {
-  llvm::SmallString<256> name;
-  llvm::raw_svector_ostream os(name);
-  ConstString file_name = GetModule()->GetFileSpec().GetFilename();
-  ++m_synthetic_symbol_idx;
-  os << "___lldb_unnamed_symbol" << m_synthetic_symbol_idx << "$$"
-     << file_name.GetStringRef();
-  return ConstString(os.str());
-}
-
 std::vector<ObjectFile::LoadableData>
 ObjectFile::GetLoadableData(Target &target) {
   std::vector<LoadableData> loadables;
diff  --git a/lldb/source/Symbol/Symbol.cpp b/lldb/source/Symbol/Symbol.cpp
index a25911d1734da..b24372795ad55 100644
--- a/lldb/source/Symbol/Symbol.cpp
+++ b/lldb/source/Symbol/Symbol.cpp
@@ -56,8 +56,8 @@ Symbol::Symbol(uint32_t symID, const Mangled &mangled, SymbolType type,
       m_size_is_synthesized(false),
       m_size_is_valid(size_is_valid || range.GetByteSize() > 0),
       m_demangled_is_synthesized(false),
-      m_contains_linker_annotations(contains_linker_annotations), 
-      m_is_weak(false), m_type(type), m_mangled(mangled), m_addr_range(range), 
+      m_contains_linker_annotations(contains_linker_annotations),
+      m_is_weak(false), m_type(type), m_mangled(mangled), m_addr_range(range),
       m_flags(flags) {}
 
 Symbol::Symbol(const Symbol &rhs)
@@ -119,7 +119,7 @@ bool Symbol::ValueIsAddress() const {
 }
 
 ConstString Symbol::GetDisplayName() const {
-  return m_mangled.GetDisplayDemangledName();
+  return GetMangled().GetDisplayDemangledName();
 }
 
 ConstString Symbol::GetReExportedSymbolName() const {
@@ -202,7 +202,7 @@ void Symbol::GetDescription(Stream *s, lldb::DescriptionLevel level,
       s->Printf(", value = 0x%16.16" PRIx64,
                 m_addr_range.GetBaseAddress().GetOffset());
   }
-  ConstString demangled = m_mangled.GetDemangledName();
+  ConstString demangled = GetMangled().GetDemangledName();
   if (demangled)
     s->Printf(", name=\"%s\"", demangled.AsCString());
   if (m_mangled.GetMangledName())
@@ -218,7 +218,7 @@ void Symbol::Dump(Stream *s, Target *target, uint32_t index,
   // Make sure the size of the symbol is up to date before dumping
   GetByteSize();
 
-  ConstString name = m_mangled.GetName(name_preference);
+  ConstString name = GetMangled().GetName(name_preference);
   if (ValueIsAddress()) {
     if (!m_addr_range.GetBaseAddress().Dump(s, nullptr,
                                             Address::DumpStyleFileAddress))
@@ -330,9 +330,11 @@ uint32_t Symbol::GetPrologueByteSize() {
 }
 
 bool Symbol::Compare(ConstString name, SymbolType type) const {
-  if (type == eSymbolTypeAny || m_type == type)
-    return m_mangled.GetMangledName() == name ||
-           m_mangled.GetDemangledName() == name;
+  if (type == eSymbolTypeAny || m_type == type) {
+    const Mangled &mangled = GetMangled();
+    return mangled.GetMangledName() == name ||
+           mangled.GetDemangledName() == name;
+  }
   return false;
 }
 
@@ -495,10 +497,10 @@ lldb::addr_t Symbol::GetLoadAddress(Target *target) const {
     return LLDB_INVALID_ADDRESS;
 }
 
-ConstString Symbol::GetName() const { return m_mangled.GetName(); }
+ConstString Symbol::GetName() const { return GetMangled().GetName(); }
 
 ConstString Symbol::GetNameNoArguments() const {
-  return m_mangled.GetName(Mangled::ePreferDemangledWithoutArguments);
+  return GetMangled().GetName(Mangled::ePreferDemangledWithoutArguments);
 }
 
 lldb::addr_t Symbol::ResolveCallableAddress(Target &target) const {
@@ -565,3 +567,21 @@ bool Symbol::GetDisassembly(const ExecutionContext &exe_ctx, const char *flavor,
 bool Symbol::ContainsFileAddress(lldb::addr_t file_addr) const {
   return m_addr_range.ContainsFileAddress(file_addr);
 }
+
+void Symbol::SynthesizeNameIfNeeded() const {
+  if (m_is_synthetic && !m_mangled) {
+    // Synthetic symbol names don't mean anything, but they do uniquely
+    // identify individual symbols so we give them a unique name. The name
+    // starts with the synthetic symbol prefix, followed by a unique number.
+    // Typically the UserID of a real symbol is the symbol table index of the
+    // symbol in the object file's symbol table(s), so it will be the same
+    // every time you read in the object file. We want the same persistence for
+    // synthetic symbols so that users can identify them across multiple debug
+    // sessions, to understand crashes in those symbols and to reliably set
+    // breakpoints on them.
+    llvm::SmallString<256> name;
+    llvm::raw_svector_ostream os(name);
+    os << GetSyntheticSymbolPrefix() << GetID();
+    m_mangled.SetDemangledName(ConstString(os.str()));
+  }
+}
diff  --git a/lldb/source/Symbol/Symtab.cpp b/lldb/source/Symbol/Symtab.cpp
index 14360aa69a72c..d859d8e251299 100644
--- a/lldb/source/Symbol/Symtab.cpp
+++ b/lldb/source/Symbol/Symtab.cpp
@@ -301,7 +301,7 @@ void Symtab::InitNameIndexes() {
       // the trampoline symbols to be searchable by name we can remove this and
       // then possibly add a new bool to any of the Symtab functions that
       // lookup symbols by name to indicate if they want trampolines.
-      if (symbol->IsTrampoline())
+      if (symbol->IsTrampoline() || symbol->IsSynthetic())
         continue;
 
       // If the symbol's name string matched a Mangled::ManglingScheme, it is
@@ -628,6 +628,36 @@ void Symtab::SortSymbolIndexesByValue(std::vector<uint32_t> &indexes,
   }
 }
 
+uint32_t Symtab::GetNameIndexes(ConstString symbol_name,
+                                std::vector<uint32_t> &indexes) {
+  auto &name_to_index = GetNameToSymbolIndexMap(lldb::eFunctionNameTypeNone);
+  const uint32_t count = name_to_index.GetValues(symbol_name, indexes);
+  if (count)
+    return count;
+  // Synthetic symbol names are not added to the name indexes, but they start
+  // with a prefix and end with a the symbol UserID. This allows users to find
+  // these symbols without having to add them to the name indexes. These
+  // queries will not happen very often since the names don't mean anything, so
+  // performance is not paramount in this case.
+  llvm::StringRef name = symbol_name.GetStringRef();
+  // String the synthetic prefix if the name starts with it.
+  if (!name.consume_front(Symbol::GetSyntheticSymbolPrefix()))
+    return 0; // Not a synthetic symbol name
+
+  // Extract the user ID from the symbol name
+  user_id_t uid = 0;
+  if (getAsUnsignedInteger(name, /*Radix=*/10, uid))
+    return 0; // Failed to extract the user ID as an integer
+  Symbol *symbol = FindSymbolByID(uid);
+  if (symbol == nullptr)
+    return 0;
+  const uint32_t symbol_idx = GetIndexForSymbol(symbol);
+  if (symbol_idx == UINT32_MAX)
+    return 0;
+  indexes.push_back(symbol_idx);
+  return 1;
+}
+
 uint32_t Symtab::AppendSymbolIndexesWithName(ConstString symbol_name,
                                              std::vector<uint32_t> &indexes) {
   std::lock_guard<std::recursive_mutex> guard(m_mutex);
@@ -637,8 +667,7 @@ uint32_t Symtab::AppendSymbolIndexesWithName(ConstString symbol_name,
     if (!m_name_indexes_computed)
       InitNameIndexes();
 
-    auto &name_to_index = GetNameToSymbolIndexMap(lldb::eFunctionNameTypeNone);
-    return name_to_index.GetValues(symbol_name, indexes);
+    return GetNameIndexes(symbol_name, indexes);
   }
   return 0;
 }
@@ -655,10 +684,9 @@ uint32_t Symtab::AppendSymbolIndexesWithName(ConstString symbol_name,
     if (!m_name_indexes_computed)
       InitNameIndexes();
 
-    auto &name_to_index = GetNameToSymbolIndexMap(lldb::eFunctionNameTypeNone);
     std::vector<uint32_t> all_name_indexes;
     const size_t name_match_count =
-        name_to_index.GetValues(symbol_name, all_name_indexes);
+        GetNameIndexes(symbol_name, all_name_indexes);
     for (size_t i = 0; i < name_match_count; ++i) {
       if (CheckSymbolAtIndex(all_name_indexes[i], symbol_debug_type,
                              symbol_visibility))
        
    
    
More information about the lldb-commits
mailing list