[libc-commits] [libc] Create a poor-developer's msan for libc wide read functions. (PR #170586)

Thu Dec 4 09:47:52 PST 2025

https://github.com/Sterling-Augustine updated https://github.com/llvm/llvm-project/pull/170586

>From 33b3c44babbb8c888864f1be6b824b624d3370f4 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Wed, 3 Dec 2025 16:33:47 -0800
Subject: [PATCH 1/2] Create a poor-developer's msan for libc wide read
 functions.

Most libcs optimize functions like strlen by reading in chunks larger
than a single character. As part of "the implementation", they can
legally do this as long as they are careful not to read invalid
memory.

However, such tricks prevent those functions from being tested under
the various sanitizers.

This PR creates a test framework that can report when one of these
functions reads or writes in an invalid way without using the
sanitizers.
---
 libc/test/src/strings/CMakeLists.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/libc/test/src/strings/CMakeLists.txt b/libc/test/src/strings/CMakeLists.txt
index 5f70dc024f6ce..0ccd0dc302943 100644
--- a/libc/test/src/strings/CMakeLists.txt
+++ b/libc/test/src/strings/CMakeLists.txt
@@ -110,5 +110,15 @@ add_libc_test(
     libc.src.strings.strncasecmp_l
 )
 
+add_libc_test(
+  wide_read_memory_test  # mprotect-based guard-page test for wide-read string functions.
+  SUITE
+    libc-strings-tests
+  SRCS
+    wide_read_memory_test.cpp
+  DEPENDS
+    libc.src.string.strlen  # NOTE(review): test calls internal::string_length (string_utils.h); confirm target.
+)
+
 add_libc_multi_impl_test(bcmp libc-strings-tests SRCS bcmp_test.cpp)
 add_libc_multi_impl_test(bzero libc-strings-tests SRCS bzero_test.cpp)

>From 498391a2406b4cdda0df9093fe85151c4813ff97 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Thu, 4 Dec 2025 09:47:32 -0800
Subject: [PATCH 2/2] Add missing file.

---
 .../src/strings/wide_read_memory_test.cpp     | 98 +++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 libc/test/src/strings/wide_read_memory_test.cpp

diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp
new file mode 100644
index 0000000000000..e353d49dd864e
--- /dev/null
+++ b/libc/test/src/strings/wide_read_memory_test.cpp
@@ -0,0 +1,98 @@
+// For performance, some vector-based libc functions read data outside of, but
+// adjacent to, the input address. For example, string_length can read both
+// before and after the data in its src parameter. As part of the
+// implementation, it is allowed to do this. However, the code must take care
+// to avoid address errors. The sanitizers can't distinguish between "the
+// implementation" and user-code, and so report an error. Therefore we can't use
+// them to check if functions like these have memory errors.
+//
+// This test uses mprotect to simulate address sanitization. Tests that read too
+// far outside data will segfault.
+//
+// It creates three adjacent pages in memory. The outer two are mprotected
+// unreadable, the middle usable normally. By placing test data at the edges
+// between the middle page and the others, we can test for bad accesses.
+
+#include <cstddef>
+#include <type_traits>
+#include <vector>
+
+#include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "src/string/string_utils.h"
+#include "test/UnitTest/MemoryMatcher.h"
+#include "test/UnitTest/Test.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+
+// Fixture exposing one accessible page bracketed by two PROT_NONE guard pages.
+class LlvmLibcWideAccessMemoryTest : public testing::Test {
+  char *page0_ = nullptr; // Leading guard page; also the base of the mapping.
+  char *page1_ = nullptr; // Accessible page that receives the test data.
+  char *page2_ = nullptr; // Trailing guard page.
+  size_t page_size_ = 0;  // Size of each of the three pages, in bytes.
+
+public:
+  void SetUp() override {
+    page_size_ = static_cast<size_t>(getpagesize());
+    page0_ = static_cast<char *>(mmap(nullptr, page_size_ * 3,
+                                      PROT_READ | PROT_WRITE,
+                                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+    ASSERT_NE(static_cast<void *>(page0_), MAP_FAILED);
+    page1_ = page0_ + page_size_;
+    page2_ = page1_ + page_size_;
+    // Without working guard pages the test could never detect a bad access.
+    ASSERT_EQ(mprotect(page0_, page_size_, PROT_NONE), 0);
+    ASSERT_EQ(mprotect(page2_, page_size_, PROT_NONE), 0);
+  }
+
+  void TearDown() override { munmap(page0_, page_size_ * 3); }
+
+  // So we don't depend on system memcpy, which may itself be under test.
+  void BasicMemCopy(char *dst, const char *src, size_t len) {
+    while (len--)
+      *dst++ = *src++;
+  }
+
+  // Runs "func" on copies of "buf" placed at each of the first
+  // alignof(std::max_align_t) byte offsets in from the start of the accessible
+  // page, then in from its end. Segfaults if the function under test touches a
+  // guard page. Recommend just over 1.5k of data, which guarantees a wind-up,
+  // multiple iterations of the inner loop, and a wind-down, even on systems
+  // with 512-byte arrays. The termination condition, eg, end-of-string or
+  // character being searched for, should be near the end of the data.
+  template <typename TestFunc>
+  void TestMemoryAccess(const std::vector<char> &buf, TestFunc func) {
+    constexpr size_t MAX_OFFSET = alignof(std::max_align_t);
+    assert(buf.size() + MAX_OFFSET <= page_size_);
+    // Run func on data near the start boundary of valid memory.
+    for (size_t offset = 0; offset < MAX_OFFSET; ++offset) {
+      char *test_addr = page1_ + offset;
+      BasicMemCopy(test_addr, buf.data(), buf.size());
+      func(test_addr);
+    }
+    // Run func on data near the end boundary of valid memory. At offset 0 the
+    // data's last byte is the last accessible byte before the guard page.
+    for (size_t offset = 0; offset < MAX_OFFSET; ++offset) {
+      char *test_addr = page2_ - buf.size() - offset;
+      BasicMemCopy(test_addr, buf.data(), buf.size());
+      func(test_addr);
+    }
+  }
+};
+
+TEST_F(LlvmLibcWideAccessMemoryTest, StringLength) {
+  // 1.5 KiB of 'a' followed by the terminating null character.
+  std::vector<char> buf(1536, 'a');
+  buf.push_back('\0');
+  // buf outlives the lambda; capture by reference to avoid copying it.
+  this->TestMemoryAccess(buf, [this, &buf](const char *test_data) {
+    // string_length does not count the null terminator, hence the -1.
+    ASSERT_EQ(internal::string_length(test_data), size_t(buf.size() - 1));
+  });
+}
+
+}