[Lldb-commits] [lldb] cc5c526 - [lldb] Fix and speedup the `memory find` command (#104193)

Wed Sep 4 02:31:02 PDT 2024

Author: Pavel Labath
Date: 2024-09-04T11:30:58+02:00
New Revision: cc5c526c80a4cacf7ed5b7fbe50072594ec1aeaf

URL: https://github.com/llvm/llvm-project/commit/cc5c526c80a4cacf7ed5b7fbe50072594ec1aeaf
DIFF: https://github.com/llvm/llvm-project/commit/cc5c526c80a4cacf7ed5b7fbe50072594ec1aeaf.diff

LOG: [lldb] Fix and speedup the `memory find` command (#104193)

This patch fixes an issue where the `memory find` command would
effectively stop searching after encountering a memory read error (which
could happen due to unreadable memory), without giving any indication
that it had done so (it would just report that it could not find the
pattern).

To make matters worse, it would not terminate after encountering this
error, but rather proceed to slowly increment the address pointer, which
meant that searching a large region could take a very long time (and
give the appearance that lldb was still actually searching for the
pattern).

The patch fixes the first problem by detecting read errors, skipping
over the unreadable parts of memory (using GetMemoryRegionInfo), and
resuming the search after them. It also reads the memory in bulk
(`max(sizeof(pattern), 0x10000)` bytes at a time) instead of one byte
per read, which speeds up the search significantly (up to 6x for live
processes, 18x for core files).

Added: 
    

Modified: 
    lldb/source/Target/Process.cpp
    lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py

Removed: 
    


################################################################################
diff  --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index ae64f6f261bad7..6c5c5162e24686 100644

--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -114,33 +114,6 @@ class ProcessOptionValueProperties
   }
 };
 
-class ProcessMemoryIterator {
-public:
-  ProcessMemoryIterator(Process &process, lldb::addr_t base)
-      : m_process(process), m_base_addr(base) {}
-
-  bool IsValid() { return m_is_valid; }
-
-  uint8_t operator[](lldb::addr_t offset) {
-    if (!IsValid())
-      return 0;
-
-    uint8_t retval = 0;
-    Status error;
-    if (0 == m_process.ReadMemory(m_base_addr + offset, &retval, 1, error)) {
-      m_is_valid = false;
-      return 0;
-    }
-
-    return retval;
-  }
-
-private:
-  Process &m_process;
-  const lldb::addr_t m_base_addr;
-  bool m_is_valid = true;
-};
-
 static constexpr OptionEnumValueElement g_follow_fork_mode_values[] = {
     {
         eFollowParent,
@@ -3379,21 +3352,49 @@ lldb::addr_t Process::FindInMemory(lldb::addr_t low, lldb::addr_t high,
   if (region_size < size)
     return LLDB_INVALID_ADDRESS;
 
+  // See "Boyer-Moore string search algorithm".
   std::vector<size_t> bad_char_heuristic(256, size);
-  ProcessMemoryIterator iterator(*this, low);
-
   for (size_t idx = 0; idx < size - 1; idx++) {
     decltype(bad_char_heuristic)::size_type bcu_idx = buf[idx];
     bad_char_heuristic[bcu_idx] = size - idx - 1;
   }
-  for (size_t s = 0; s <= (region_size - size);) {
+
+  // Memory we're currently searching through.
+  llvm::SmallVector<uint8_t, 0> mem;
+  // Position of the memory buffer.
+  addr_t mem_pos = low;
+  // Maximum number of bytes read (and buffered). We need to read at least
+  // `size` bytes for a successful match.
+  const size_t max_read_size = std::max<size_t>(size, 0x10000);
+
+  for (addr_t cur_addr = low; cur_addr <= (high - size);) {
+    if (cur_addr + size > mem_pos + mem.size()) {
+      // We need to read more data. We don't attempt to reuse the data we've
+      // already read (up to `size-1` bytes from `cur_addr` to
+      // `mem_pos+mem.size()`).  This is fine for patterns much smaller than
+      // max_read_size. For very
+      // long patterns we may need to do something more elaborate.
+      mem.resize_for_overwrite(max_read_size);
+      Status error;
+      mem.resize(ReadMemory(cur_addr, mem.data(),
+                            std::min(mem.size(), high - cur_addr), error));
+      mem_pos = cur_addr;
+      if (size > mem.size()) {
+        // We didn't read enough data. Skip to the next memory region.
+        MemoryRegionInfo info;
+        error = GetMemoryRegionInfo(mem_pos + mem.size(), info);
+        if (error.Fail())
+          break;
+        cur_addr = info.GetRange().GetRangeEnd();
+        continue;
+      }
+    }
     int64_t j = size - 1;
-    while (j >= 0 && buf[j] == iterator[s + j])
+    while (j >= 0 && buf[j] == mem[cur_addr + j - mem_pos])
       j--;
     if (j < 0)
-      return low + s;
-    else
-      s += bad_char_heuristic[iterator[s + size - 1]];
+      return cur_addr; // We have a match.
+    cur_addr += bad_char_heuristic[mem[cur_addr + size - 1 - mem_pos]];
   }
 
   return LLDB_INVALID_ADDRESS;

diff  --git a/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py b/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py
index 1c2c90d483ea3f..c61ae15b9dda70 100644
--- a/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py
+++ b/lldb/test/API/functionalities/memory/holes/TestMemoryHoles.py
@@ -43,6 +43,9 @@ def _prepare_inferior(self):
         # inside the holes we've deliberately left empty.
         self.memory = self.frame().FindVariable("mem_with_holes").GetValueAsUnsigned()
         self.pagesize = self.frame().FindVariable("pagesize").GetValueAsUnsigned()
+        self.num_pages = (
+            self.target().FindFirstGlobalVariable("num_pages").GetValueAsUnsigned()
+        )
         positions = self.frame().FindVariable("positions")
         self.positions = [
             positions.GetChildAtIndex(i).GetValueAsUnsigned()
@@ -58,3 +61,13 @@ def test_memory_read(self):
         self.assertEqual(len(content), self.pagesize)
         self.assertEqual(content[0:7], b"needle\0")
         self.assertTrue(error.Fail())
+
+    def test_memory_find(self):
+        self._prepare_inferior()
+
+        matches = [f"data found at location: {p:#x}" for p in self.positions]
+        self.expect(
+            f'memory find --count {len(self.positions)+1} --string "needle" '
+            f"{self.memory:#x} {self.memory+self.pagesize*self.num_pages:#x}",
+            substrs=matches + ["no more matches within the range"],
+        )