[llvm] 02106ec - [Symbolize] LRU cache binaries in llvm-symbolizer.

Daniel Thornburgh via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 24 16:31:56 PST 2022


Author: Daniel Thornburgh
Date: 2022-02-25T00:31:48Z
New Revision: 02106ec15c2e1e0a94ef66154c48650cfb111126

URL: https://github.com/llvm/llvm-project/commit/02106ec15c2e1e0a94ef66154c48650cfb111126
DIFF: https://github.com/llvm/llvm-project/commit/02106ec15c2e1e0a94ef66154c48650cfb111126.diff

LOG: [Symbolize] LRU cache binaries in llvm-symbolizer.

This change adds a simple LRU cache to the Symbolize class to put a cap
on llvm-symbolizer memory usage. Previously, the Symbolizer's virtual
memory footprint would grow without bound as additional binaries were
referenced.

I'm putting this out there early for an informal review, since there may be
a dramatically different/better way to go about this. I still need to
figure out a good default constant for the memory cap and benchmark the
implementation against a large symbolization workload. Right now I've
pegged max memory usage at zero for testing purposes, which evicts the whole
cache every time.

Unfortunately, it looks like StringRefs in the returned DI objects can
directly refer to the contents of binaries. Accordingly, the cache
pruning must be explicitly requested by the caller, as the caller must
guarantee that none of the returned objects will be used afterwards.

For llvm-symbolizer this a light burden; symbolization occurs
line-by-line, and the returned objects are discarded after each.

Implementation wise, there are a number of nested caches that depend
on one another. I've implemented a simple Evictor callback system to
allow derived caches to register eviction actions to occur when the
underlying binaries are evicted.

Reviewed By: dblaikie

Differential Revision: https://reviews.llvm.org/D119784

Added: 
    

Modified: 
    llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
    llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
    llvm/tools/llvm-symbolizer/Opts.td
    llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
index 552f2d315c50b..2317017283cc7 100644
--- a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
+++ b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h
@@ -14,12 +14,15 @@
 #define LLVM_DEBUGINFO_SYMBOLIZE_SYMBOLIZE_H
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/simple_ilist.h"
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/Symbolize/DIFetcher.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
 #include <cstdint>
+#include <list>
 #include <map>
 #include <memory>
 #include <string>
@@ -43,6 +46,8 @@ using namespace object;
 using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind;
 using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind;
 
+class CachedBinary;
+
 class LLVMSymbolizer {
 public:
   struct Options {
@@ -58,6 +63,9 @@ class LLVMSymbolizer {
     std::string FallbackDebugPath;
     std::string DWPName;
     std::vector<std::string> DebugFileDirectory;
+    size_t MaxCacheSize = sizeof(size_t) == 4
+                              ? 512 * 1024 * 1024 /* 512 MiB */
+                              : 4ULL * 1024 * 1024 * 1024 /* 4 GiB */;
   };
 
   LLVMSymbolizer() = default;
@@ -98,6 +106,11 @@ class LLVMSymbolizer {
                  object::SectionedAddress ModuleOffset);
   void flush();
 
+  // Evict entries from the binary cache until it is under the maximum size
+  // given in the options. Calling this invalidates references in the DI...
+  // objects returned by the methods above.
+  void pruneCache();
+
   static std::string
   DemangleName(const std::string &Name,
                const SymbolizableModule *DbiModuleDescriptor);
@@ -172,6 +185,9 @@ class LLVMSymbolizer {
   Expected<ObjectFile *> getOrCreateObject(const std::string &Path,
                                            const std::string &ArchName);
 
+  /// Update the LRU cache order when a binary is accessed.
+  void recordAccess(CachedBinary &Bin);
+
   std::map<std::string, std::unique_ptr<SymbolizableModule>, std::less<>>
       Modules;
   StringMap<std::string> BuildIDPaths;
@@ -181,7 +197,12 @@ class LLVMSymbolizer {
       ObjectPairForPathArch;
 
   /// Contains parsed binary for each path, or parsing error.
-  std::map<std::string, OwningBinary<Binary>> BinaryForPath;
+  std::map<std::string, CachedBinary> BinaryForPath;
+
+  /// A list of cached binaries in LRU order.
+  simple_ilist<CachedBinary> LRUBinaries;
+  /// Sum of the sizes of the cached binaries.
+  size_t CacheSize = 0;
 
   /// Parsed object file for path/architecture pair, where "path" refers
   /// to Mach-O universal binary.
@@ -193,6 +214,35 @@ class LLVMSymbolizer {
   SmallVector<std::unique_ptr<DIFetcher>> DIFetchers;
 };
 
+// A binary intrusively linked into a LRU cache list. If the binary is empty,
+// then the entry marks that an error occurred, and it is not part of the LRU
+// list.
+class CachedBinary : public ilist_node<CachedBinary> {
+public:
+  CachedBinary() = default;
+  CachedBinary(OwningBinary<Binary> Bin) : Bin(std::move(Bin)) {}
+
+  OwningBinary<Binary> &operator*() { return Bin; }
+  OwningBinary<Binary> *operator->() { return &Bin; }
+
+  // Add an action to be performed when the binary is evicted, before all
+  // previously registered evictors.
+  void pushEvictor(std::function<void()> Evictor);
+
+  // Run all registered evictors in the reverse of the order in which they were
+  // added.
+  void evict() {
+    if (Evictor)
+      Evictor();
+  }
+
+  size_t size() { return Bin.getBinary()->getData().size(); }
+
+private:
+  OwningBinary<Binary> Bin;
+  std::function<void()> Evictor;
+};
+
 } // end namespace symbolize
 } // end namespace llvm
 

diff  --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index dac96c084b276..021db1031ab56 100644
--- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -227,6 +227,8 @@ LLVMSymbolizer::symbolizeFrame(ArrayRef<uint8_t> BuildID,
 
 void LLVMSymbolizer::flush() {
   ObjectForUBPathAndArch.clear();
+  LRUBinaries.clear();
+  CacheSize = 0;
   BinaryForPath.clear();
   ObjectPairForPathArch.clear();
   Modules.clear();
@@ -493,8 +495,10 @@ Expected<LLVMSymbolizer::ObjectPair>
 LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path,
                                       const std::string &ArchName) {
   auto I = ObjectPairForPathArch.find(std::make_pair(Path, ArchName));
-  if (I != ObjectPairForPathArch.end())
+  if (I != ObjectPairForPathArch.end()) {
+    recordAccess(BinaryForPath.find(Path)->second);
     return I->second;
+  }
 
   auto ObjOrErr = getOrCreateObject(Path, ArchName);
   if (!ObjOrErr) {
@@ -516,7 +520,12 @@ LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path,
   if (!DbgObj)
     DbgObj = Obj;
   ObjectPair Res = std::make_pair(Obj, DbgObj);
-  ObjectPairForPathArch.emplace(std::make_pair(Path, ArchName), Res);
+  std::string DbgObjPath = DbgObj->getFileName().str();
+  auto Pair =
+      ObjectPairForPathArch.emplace(std::make_pair(Path, ArchName), Res);
+  BinaryForPath.find(DbgObjPath)->second.pushEvictor([this, I = Pair.first]() {
+    ObjectPairForPathArch.erase(I);
+  });
   return Res;
 }
 
@@ -526,13 +535,19 @@ LLVMSymbolizer::getOrCreateObject(const std::string &Path,
   Binary *Bin;
   auto Pair = BinaryForPath.emplace(Path, OwningBinary<Binary>());
   if (!Pair.second) {
-    Bin = Pair.first->second.getBinary();
+    Bin = Pair.first->second->getBinary();
+    recordAccess(Pair.first->second);
   } else {
     Expected<OwningBinary<Binary>> BinOrErr = createBinary(Path);
     if (!BinOrErr)
       return BinOrErr.takeError();
-    Pair.first->second = std::move(BinOrErr.get());
-    Bin = Pair.first->second.getBinary();
+
+    CachedBinary &CachedBin = Pair.first->second;
+    CachedBin = std::move(BinOrErr.get());
+    CachedBin.pushEvictor([this, I = Pair.first]() { BinaryForPath.erase(I); });
+    LRUBinaries.push_back(CachedBin);
+    CacheSize += CachedBin.size();
+    Bin = CachedBin->getBinary();
   }
 
   if (!Bin)
@@ -551,8 +566,10 @@ LLVMSymbolizer::getOrCreateObject(const std::string &Path,
       return ObjOrErr.takeError();
     }
     ObjectFile *Res = ObjOrErr->get();
-    ObjectForUBPathAndArch.emplace(std::make_pair(Path, ArchName),
-                                   std::move(ObjOrErr.get()));
+    auto Pair = ObjectForUBPathAndArch.emplace(std::make_pair(Path, ArchName),
+                                               std::move(ObjOrErr.get()));
+    BinaryForPath.find(Path)->second.pushEvictor(
+        [this, Iter = Pair.first]() { ObjectForUBPathAndArch.erase(Iter); });
     return Res;
   }
   if (Bin->isObject()) {
@@ -580,10 +597,6 @@ LLVMSymbolizer::createModuleInfo(const ObjectFile *Obj,
 
 Expected<SymbolizableModule *>
 LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
-  auto I = Modules.find(ModuleName);
-  if (I != Modules.end())
-    return I->second.get();
-
   std::string BinaryName = ModuleName;
   std::string ArchName = Opts.DefaultArch;
   size_t ColonPos = ModuleName.find_last_of(':');
@@ -595,6 +608,13 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
       ArchName = ArchStr;
     }
   }
+
+  auto I = Modules.find(ModuleName);
+  if (I != Modules.end()) {
+    recordAccess(BinaryForPath.find(BinaryName)->second);
+    return I->second.get();
+  }
+
   auto ObjectsOrErr = getOrCreateObjectPair(BinaryName, ArchName);
   if (!ObjectsOrErr) {
     // Failed to find valid object file.
@@ -629,7 +649,15 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
     Context = DWARFContext::create(
         *Objects.second, DWARFContext::ProcessDebugRelocations::Process,
         nullptr, Opts.DWPName);
-  return createModuleInfo(Objects.first, std::move(Context), ModuleName);
+  auto ModuleOrErr =
+      createModuleInfo(Objects.first, std::move(Context), ModuleName);
+  if (ModuleOrErr) {
+    auto I = Modules.find(ModuleName);
+    BinaryForPath.find(BinaryName)->second.pushEvictor([this, I]() {
+      Modules.erase(I);
+    });
+  }
+  return ModuleOrErr;
 }
 
 Expected<SymbolizableModule *>
@@ -712,5 +740,35 @@ LLVMSymbolizer::DemangleName(const std::string &Name,
   return Name;
 }
 
+void LLVMSymbolizer::recordAccess(CachedBinary &Bin) {
+  if (Bin->getBinary())
+    LRUBinaries.splice(LRUBinaries.end(), LRUBinaries, Bin.getIterator());
+}
+
+void LLVMSymbolizer::pruneCache() {
+  // Evict the LRU binary until the max cache size is reached or there's <= 1
+  // item in the cache. The MRU binary is always kept to avoid thrashing if it's
+  // larger than the cache size.
+  while (CacheSize > Opts.MaxCacheSize && !LRUBinaries.empty() &&
+         std::next(LRUBinaries.begin()) != LRUBinaries.end()) {
+    CachedBinary &Bin = LRUBinaries.front();
+    CacheSize -= Bin.size();
+    LRUBinaries.pop_front();
+    Bin.evict();
+  }
+}
+
+void CachedBinary::pushEvictor(std::function<void()> NewEvictor) {
+  if (Evictor) {
+    this->Evictor = [OldEvictor = std::move(this->Evictor),
+                     NewEvictor = std::move(NewEvictor)]() {
+      NewEvictor();
+      OldEvictor();
+    };
+  } else {
+    this->Evictor = std::move(NewEvictor);
+  }
+}
+
 } // namespace symbolize
 } // namespace llvm

diff  --git a/llvm/tools/llvm-symbolizer/Opts.td b/llvm/tools/llvm-symbolizer/Opts.td
index b21af7a9fb203..dae1bd611fdd8 100644
--- a/llvm/tools/llvm-symbolizer/Opts.td
+++ b/llvm/tools/llvm-symbolizer/Opts.td
@@ -22,6 +22,7 @@ defm adjust_vma
       MetaVarName<"<offset>">;
 def basenames : Flag<["--"], "basenames">, HelpText<"Strip directory names from paths">;
 defm build_id : Eq<"build-id", "Build ID used to look up the object file">;
+defm cache_size : Eq<"cache-size", "Max size in bytes of the in-memory binary cache.">;
 defm debug_file_directory : Eq<"debug-file-directory", "Path to directory where to look for debug files">, MetaVarName<"<dir>">;
 defm debuginfod : B<"debuginfod", "Use debuginfod to find debug binaries", "Don't use debuginfod to find debug binaries">;
 defm default_arch

diff  --git a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
index f7f8c42843080..d77ba0672eb7a 100644
--- a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp
@@ -191,6 +191,7 @@ void executeCommand(StringRef ModuleName, const T &ModuleSpec, Command Cmd,
         Symbolizer.symbolizeCode(ModuleSpec, Address);
     print({ModuleName, Offset}, ResOrErr, Printer);
   }
+  Symbolizer.pruneCache();
 }
 
 static void symbolizeInput(const opt::InputArgList &Args,
@@ -361,6 +362,8 @@ int main(int argc, char **argv) {
   }
 #endif
   Opts.UseSymbolTable = true;
+  if (Args.hasArg(OPT_cache_size_EQ))
+    parseIntArg(Args, OPT_cache_size_EQ, Opts.MaxCacheSize);
   Config.PrintAddress = Args.hasArg(OPT_addresses);
   Config.PrintFunctions = Opts.PrintFunctions != FunctionNameKind::None;
   Config.Pretty = Args.hasArg(OPT_pretty_print);


        


More information about the llvm-commits mailing list