[libc-commits] [libc] Create a poor-developer's msan for libc wide read functions. (PR #170586)

via libc-commits libc-commits at lists.llvm.org
Thu Jan 15 14:14:08 PST 2026


https://github.com/Sterling-Augustine updated https://github.com/llvm/llvm-project/pull/170586

>From 33b3c44babbb8c888864f1be6b824b624d3370f4 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Wed, 3 Dec 2025 16:33:47 -0800
Subject: [PATCH 1/8] Create a poor-developer's msan for libc wide read
 functions.

Most libcs optimize functions like strlen by reading in chunks larger
than a single character. As part of "the implementation", they can
legally do this as long as they are careful not to read invalid
memory.

However, such tricks prevent those functions from being tested under
the various sanitizers.

This PR creates a test framework that can report when one of these
functions reads or writes memory in an invalid way, without using the
sanitizers.
---
 libc/test/src/strings/CMakeLists.txt | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/libc/test/src/strings/CMakeLists.txt b/libc/test/src/strings/CMakeLists.txt
index 5f70dc024f6ce..0ccd0dc302943 100644
--- a/libc/test/src/strings/CMakeLists.txt
+++ b/libc/test/src/strings/CMakeLists.txt
@@ -110,5 +110,15 @@ add_libc_test(
     libc.src.strings.strncasecmp_l
 )
 
+add_libc_test(
+  wide_read_memory_test
+  SUITE
+    libc-strings-tests
+  SRCS
+    wide_read_memory_test.cpp
+  DEPENDS
+    libc.src.string.strlen
+)
+
 add_libc_multi_impl_test(bcmp libc-strings-tests SRCS bcmp_test.cpp)
 add_libc_multi_impl_test(bzero libc-strings-tests SRCS bzero_test.cpp)

>From 498391a2406b4cdda0df9093fe85151c4813ff97 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Thu, 4 Dec 2025 09:47:32 -0800
Subject: [PATCH 2/8] Add missing file.

---
 .../src/strings/wide_read_memory_test.cpp     | 98 +++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 libc/test/src/strings/wide_read_memory_test.cpp

diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp
new file mode 100644
index 0000000000000..e353d49dd864e
--- /dev/null
+++ b/libc/test/src/strings/wide_read_memory_test.cpp
@@ -0,0 +1,98 @@
+// For performance, some vector-based libc functions read data outside of, but
+// adjacent to, the input address. For example, string_length can read both
+// before and after the data in its src parameter. As part of the
+// implementation, it is allowed to do this. However, the code must take care
+// to avoid address errors. The sanitizers can't distinguish between "the
+// implementation" and user-code, and so report an error. Therefore we can't use
+// them to check if functions like thees have memory errors.
+//
+// This test uses mprotect to simulate address sanitization. Tests that read too
+// far outside data will segfault.
+//
+// It creates three adjacent pages in memory. The outer two are mprotected
+// unreadable, the middle usable normally. By placing test data at the edges
+// between the middle page and the others, we can test for bad accesses.
+
+#include <cstddef>
+#include <type_traits>
+#include <vector>
+
+#include <assert.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "src/string/string_utils.h"
+#include "test/UnitTest/MemoryMatcher.h"
+#include "test/UnitTest/Test.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+
+class LlvmLibcWideAccessMemoryTest : public testing::Test {
+  char *page0_;
+  char *page1_;
+  char *page2_;
+  size_t page_size;
+
+ public:
+  void SetUp() override {
+    page_size = getpagesize();
+    page0_ =
+        static_cast<char *>(mmap(nullptr, page_size * 3, PROT_READ | PROT_WRITE,
+                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+    ASSERT_NE(static_cast<void *>(page0_), MAP_FAILED);
+    page1_ = page0_ + page_size;
+    page2_ = page1_ + page_size;
+    mprotect(page0_, page_size, PROT_NONE);
+    mprotect(page2_, page_size, PROT_NONE);
+  }
+
+  void TearDown() override { munmap(page0_, page_size * 3); }
+
+  // So we don't depend on system memcpy, which may itself be under test.
+  void BasicMemCopy(char *dst, const char *src, size_t len) {
+    while (len--)
+      *dst++ = *src++;
+  }
+
+  // Repeatedly runs "func" on copies of the data in "buf", each progressively
+  // closer to the boundary of valid memory. Test will segfault if function
+  // under test examines invalid memory.
+  //
+  // Func should test the function in question just as normal. Recommend making
+  // the amount of data just over 1.5k, which guarantees a wind-up, multiple
+  // iterations of the inner loop, and a wind-down, even on systems with
+  // 512-byte arrays. The termination condition, eg, end-of string or character
+  // being searched for, should be near the end of the data.
+  template <typename TestFunc>
+  void TestMemoryAccess(const std::vector<char>& buf, TestFunc func) {
+    // Run func on data near the start boundary of valid memory.
+    for (unsigned long offset = 0;
+         offset < std::alignment_of<std::max_align_t>::value; ++offset) {
+      char *test_addr = page1_ + offset;
+      BasicMemCopy(test_addr, buf.data(), buf.size());
+      func(test_addr);
+    }
+    // Run func on data near the end boundary of valid memory.
+    for (unsigned long offset = 0;
+         offset < std::alignment_of<std::max_align_t>::value; ++offset) {
+      char *test_addr = page2_  - buf.size() - offset - 1;
+      assert(test_addr + buf.size() < page2_);
+      BasicMemCopy(test_addr, buf.data(), buf.size());
+      func(test_addr);
+    }
+  }
+};
+
+TEST_F(LlvmLibcWideAccessMemoryTest, StringLength) {
+  // 1.5 k long vector of a's.
+  std::vector<char> buf(1536, 'a');
+  // Make sure it is null terminated.
+  buf.push_back('\0');
+  this->TestMemoryAccess(buf, [this, buf](const char* test_data) {
+    // -1 for the null character.
+    ASSERT_EQ(internal::string_length(test_data), size_t(buf.size() - 1));
+  });
+}
+
+}

>From 54152258bdb25ef2da4d5fb5b1b1d26788f1367f Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Thu, 4 Dec 2025 10:35:37 -0800
Subject: [PATCH 3/8] Fix formatting.

---
 libc/test/src/strings/wide_read_memory_test.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp
index e353d49dd864e..a7694dad570ab 100644
--- a/libc/test/src/strings/wide_read_memory_test.cpp
+++ b/libc/test/src/strings/wide_read_memory_test.cpp
@@ -27,14 +27,13 @@
 
 namespace LIBC_NAMESPACE_DECL {
 
-
 class LlvmLibcWideAccessMemoryTest : public testing::Test {
   char *page0_;
   char *page1_;
   char *page2_;
   size_t page_size;
 
- public:
+public:
   void SetUp() override {
     page_size = getpagesize();
     page0_ =
@@ -57,15 +56,15 @@ class LlvmLibcWideAccessMemoryTest : public testing::Test {
 
   // Repeatedly runs "func" on copies of the data in "buf", each progressively
   // closer to the boundary of valid memory. Test will segfault if function
-  // under test examines invalid memory.
+  // under test accesses invalid memory.
   //
   // Func should test the function in question just as normal. Recommend making
   // the amount of data just over 1.5k, which guarantees a wind-up, multiple
   // iterations of the inner loop, and a wind-down, even on systems with
-  // 512-byte arrays. The termination condition, eg, end-of string or character
+  // 512-byte vectors. The termination condition, eg, end-of string or character
   // being searched for, should be near the end of the data.
   template <typename TestFunc>
-  void TestMemoryAccess(const std::vector<char>& buf, TestFunc func) {
+  void TestMemoryAccess(const std::vector<char> &buf, TestFunc func) {
     // Run func on data near the start boundary of valid memory.
     for (unsigned long offset = 0;
          offset < std::alignment_of<std::max_align_t>::value; ++offset) {
@@ -76,7 +75,7 @@ class LlvmLibcWideAccessMemoryTest : public testing::Test {
     // Run func on data near the end boundary of valid memory.
     for (unsigned long offset = 0;
          offset < std::alignment_of<std::max_align_t>::value; ++offset) {
-      char *test_addr = page2_  - buf.size() - offset - 1;
+      char *test_addr = page2_ - buf.size() - offset - 1;
       assert(test_addr + buf.size() < page2_);
       BasicMemCopy(test_addr, buf.data(), buf.size());
       func(test_addr);
@@ -89,10 +88,10 @@ TEST_F(LlvmLibcWideAccessMemoryTest, StringLength) {
   std::vector<char> buf(1536, 'a');
   // Make sure it is null terminated.
   buf.push_back('\0');
-  this->TestMemoryAccess(buf, [this, buf](const char* test_data) {
+  this->TestMemoryAccess(buf, [this, buf](const char *test_data) {
     // -1 for the null character.
     ASSERT_EQ(internal::string_length(test_data), size_t(buf.size() - 1));
   });
 }
 
-}
+} // namespace LIBC_NAMESPACE_DECL

>From 12cfaeff85bb5235d5524572e28a397deb1e24d0 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Tue, 16 Dec 2025 09:52:32 -0800
Subject: [PATCH 4/8] Address comments.

---
 .../src/strings/wide_read_memory_test.cpp     | 30 ++++++++-----------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp
index a7694dad570ab..f772d821417f8 100644
--- a/libc/test/src/strings/wide_read_memory_test.cpp
+++ b/libc/test/src/strings/wide_read_memory_test.cpp
@@ -13,14 +13,14 @@
 // unreadable, the middle usable normally. By placing test data at the edges
 // between the middle page and the others, we can test for bad accesses.
 
+#include <assert.h>
 #include <cstddef>
 #include <type_traits>
-#include <vector>
-
-#include <assert.h>
-#include <sys/mman.h>
-#include <unistd.h>
 
+#include "src/unistd/getpagesize.h"
+#include "src/sys/mman/mmap.h"
+#include "src/sys/mman/munmap.h"
+#include "src/sys/mman/mprotect.h"
 #include "src/string/string_utils.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
@@ -35,24 +35,18 @@ class LlvmLibcWideAccessMemoryTest : public testing::Test {
 
 public:
   void SetUp() override {
-    page_size = getpagesize();
-    page0_ =
-        static_cast<char *>(mmap(nullptr, page_size * 3, PROT_READ | PROT_WRITE,
-                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
+    page_size = LIBC_NAMESPACE::getpagesize();
+    page0_ = static_cast<char *>(
+        LIBC_NAMESPACE::mmap(nullptr, page_size * 3, PROT_READ | PROT_WRITE,
+                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
     ASSERT_NE(static_cast<void *>(page0_), MAP_FAILED);
     page1_ = page0_ + page_size;
     page2_ = page1_ + page_size;
-    mprotect(page0_, page_size, PROT_NONE);
-    mprotect(page2_, page_size, PROT_NONE);
+    LIBC_NAMESPACE::mprotect(page0_, page_size, PROT_NONE);
+    LIBC_NAMESPACE::mprotect(page2_, page_size, PROT_NONE);
   }
 
-  void TearDown() override { munmap(page0_, page_size * 3); }
-
-  // So we don't depend on system memcpy, which may itself be under test.
-  void BasicMemCopy(char *dst, const char *src, size_t len) {
-    while (len--)
-      *dst++ = *src++;
-  }
+  void TearDown() override { LIBC_NAMESPACE::munmap(page0_, page_size * 3); }
 
   // Repeatedly runs "func" on copies of the data in "buf", each progressively
   // closer to the boundary of valid memory. Test will segfault if function

>From 7faff32682f1918b700892c6e6743ebf21c2a870 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Wed, 14 Jan 2026 09:58:47 -0800
Subject: [PATCH 5/8] Fix include order. Use internal versions of functions.

---
 libc/test/src/strings/CMakeLists.txt          |  5 ++++
 .../src/strings/wide_read_memory_test.cpp     | 30 +++++++++++--------
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/libc/test/src/strings/CMakeLists.txt b/libc/test/src/strings/CMakeLists.txt
index 0ccd0dc302943..e3d6ec1f134d8 100644
--- a/libc/test/src/strings/CMakeLists.txt
+++ b/libc/test/src/strings/CMakeLists.txt
@@ -118,6 +118,11 @@ add_libc_test(
     wide_read_memory_test.cpp
   DEPENDS
     libc.src.string.strlen
+    libc.src.sys.mman.mmap
+    libc.src.sys.mman.mprotect
+    libc.src.sys.mman.munmap
+    libc.src.unistd.linux.getpagesize
+    libc.src.__support.CPP.array
 )
 
 add_libc_multi_impl_test(bcmp libc-strings-tests SRCS bcmp_test.cpp)
diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp
index f772d821417f8..06d8289812303 100644
--- a/libc/test/src/strings/wide_read_memory_test.cpp
+++ b/libc/test/src/strings/wide_read_memory_test.cpp
@@ -13,20 +13,23 @@
 // unreadable, the middle usable normally. By placing test data at the edges
 // between the middle page and the others, we can test for bad accesses.
 
-#include <assert.h>
 #include <cstddef>
 #include <type_traits>
 
-#include "src/unistd/getpagesize.h"
+#include "src/__support/CPP/array.h"
+#include "src/string/memory_utils/inline_memset.h"
+#include "src/string/string_utils.h"
 #include "src/sys/mman/mmap.h"
-#include "src/sys/mman/munmap.h"
 #include "src/sys/mman/mprotect.h"
-#include "src/string/string_utils.h"
+#include "src/sys/mman/munmap.h"
+#include "src/unistd/getpagesize.h"
 #include "test/UnitTest/MemoryMatcher.h"
 #include "test/UnitTest/Test.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
+using TwoKilobyteBuffer = cpp::array<char, 2048>;
+
 class LlvmLibcWideAccessMemoryTest : public testing::Test {
   char *page0_;
   char *page1_;
@@ -53,25 +56,25 @@ class LlvmLibcWideAccessMemoryTest : public testing::Test {
   // under test accesses invalid memory.
   //
   // Func should test the function in question just as normal. Recommend making
-  // the amount of data just over 1.5k, which guarantees a wind-up, multiple
+  // the amount of test data at least 1.5k, which guarantees a wind-up, multiple
   // iterations of the inner loop, and a wind-down, even on systems with
   // 512-byte vectors. The termination condition, eg, end-of string or character
   // being searched for, should be near the end of the data.
   template <typename TestFunc>
-  void TestMemoryAccess(const std::vector<char> &buf, TestFunc func) {
+  void TestMemoryAccess(const TwoKilobyteBuffer &buf, TestFunc func) {
     // Run func on data near the start boundary of valid memory.
     for (unsigned long offset = 0;
-         offset < std::alignment_of<std::max_align_t>::value; ++offset) {
+         offset < std::alignment_of<max_align_t>::value; ++offset) {
       char *test_addr = page1_ + offset;
-      BasicMemCopy(test_addr, buf.data(), buf.size());
+      inline_memcpy(test_addr, buf.data(), buf.size());
       func(test_addr);
     }
     // Run func on data near the end boundary of valid memory.
     for (unsigned long offset = 0;
-         offset < std::alignment_of<std::max_align_t>::value; ++offset) {
+         offset < std::alignment_of<max_align_t>::value; ++offset) {
       char *test_addr = page2_ - buf.size() - offset - 1;
-      assert(test_addr + buf.size() < page2_);
-      BasicMemCopy(test_addr, buf.data(), buf.size());
+      ASSERT_LE(test_addr + buf.size(), page2_);
+      inline_memcpy(test_addr, buf.data(), buf.size());
       func(test_addr);
     }
   }
@@ -79,9 +82,10 @@ class LlvmLibcWideAccessMemoryTest : public testing::Test {
 
 TEST_F(LlvmLibcWideAccessMemoryTest, StringLength) {
   // 1.5 k long vector of a's.
-  std::vector<char> buf(1536, 'a');
+  TwoKilobyteBuffer buf;
+  inline_memset(buf.data(), 'a', buf.size());
   // Make sure it is null terminated.
-  buf.push_back('\0');
+  buf[buf.size() - 1] = '\0';
   this->TestMemoryAccess(buf, [this, buf](const char *test_data) {
     // -1 for the null character.
     ASSERT_EQ(internal::string_length(test_data), size_t(buf.size() - 1));

>From 7bcd3dd908b625901c81a18a7e11eeaf06ae40a8 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Wed, 14 Jan 2026 11:21:03 -0800
Subject: [PATCH 6/8] Remove unused include.

---
 libc/test/src/strings/wide_read_memory_test.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp
index 06d8289812303..8f83690c6d023 100644
--- a/libc/test/src/strings/wide_read_memory_test.cpp
+++ b/libc/test/src/strings/wide_read_memory_test.cpp
@@ -13,7 +13,6 @@
 // unreadable, the middle usable normally. By placing test data at the edges
 // between the middle page and the others, we can test for bad accesses.
 
-#include <cstddef>
 #include <type_traits>
 
 #include "src/__support/CPP/array.h"

>From 2fbbbc1ab3ee0565d1ea6af797d8555df08c2020 Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Wed, 14 Jan 2026 11:50:56 -0800
Subject: [PATCH 7/8] Certain alignment aren't supported on older targets. Be
 conservative.

---
 libc/test/src/strings/wide_read_memory_test.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp
index 8f83690c6d023..75052878ea385 100644
--- a/libc/test/src/strings/wide_read_memory_test.cpp
+++ b/libc/test/src/strings/wide_read_memory_test.cpp
@@ -13,8 +13,6 @@
 // unreadable, the middle usable normally. By placing test data at the edges
 // between the middle page and the others, we can test for bad accesses.
 
-#include <type_traits>
-
 #include "src/__support/CPP/array.h"
 #include "src/string/memory_utils/inline_memset.h"
 #include "src/string/string_utils.h"
@@ -28,6 +26,9 @@
 namespace LIBC_NAMESPACE_DECL {
 
 using TwoKilobyteBuffer = cpp::array<char, 2048>;
+// This could be smaller on a target-basis, but that adds complexity and the
+// extra testing is fine.
+static constexpr unsigned long kLargestTestVectorSize = 512;
 
 class LlvmLibcWideAccessMemoryTest : public testing::Test {
   char *page0_;
@@ -62,15 +63,13 @@ class LlvmLibcWideAccessMemoryTest : public testing::Test {
   template <typename TestFunc>
   void TestMemoryAccess(const TwoKilobyteBuffer &buf, TestFunc func) {
     // Run func on data near the start boundary of valid memory.
-    for (unsigned long offset = 0;
-         offset < std::alignment_of<max_align_t>::value; ++offset) {
+    for (unsigned long offset = 0; offset < kLargestTestVectorSize; ++offset) {
       char *test_addr = page1_ + offset;
       inline_memcpy(test_addr, buf.data(), buf.size());
       func(test_addr);
     }
     // Run func on data near the end boundary of valid memory.
-    for (unsigned long offset = 0;
-         offset < std::alignment_of<max_align_t>::value; ++offset) {
+    for (unsigned long offset = 0; offset < kLargestTestVectorSize; ++offset) {
       char *test_addr = page2_ - buf.size() - offset - 1;
       ASSERT_LE(test_addr + buf.size(), page2_);
       inline_memcpy(test_addr, buf.data(), buf.size());

>From b7b2f981a84b72820fee555325a57837f96e979f Mon Sep 17 00:00:00 2001
From: Sterling Augustine <saugustine at google.com>
Date: Thu, 15 Jan 2026 13:36:56 -0800
Subject: [PATCH 8/8] Address comments

---
 libc/test/src/strings/CMakeLists.txt            |  2 +-
 libc/test/src/strings/wide_read_memory_test.cpp | 14 +++++++++++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/libc/test/src/strings/CMakeLists.txt b/libc/test/src/strings/CMakeLists.txt
index e3d6ec1f134d8..6e1befca16f37 100644
--- a/libc/test/src/strings/CMakeLists.txt
+++ b/libc/test/src/strings/CMakeLists.txt
@@ -117,7 +117,7 @@ add_libc_test(
   SRCS
     wide_read_memory_test.cpp
   DEPENDS
-    libc.src.string.strlen
+    libc.src.string.string_utils
     libc.src.sys.mman.mmap
     libc.src.sys.mman.mprotect
     libc.src.sys.mman.munmap
diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp
index 75052878ea385..cc4a2dcbd9dde 100644
--- a/libc/test/src/strings/wide_read_memory_test.cpp
+++ b/libc/test/src/strings/wide_read_memory_test.cpp
@@ -1,10 +1,18 @@
+//===-- Memory bounds check test for wide-read functions ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 // For performance, some vector-based libc functions read data outside of, but
 // adjacent to, the input address. For example, string_length can read both
 // before and after the data in its src parameter. As part of the
-// implementation, it is allowed to do this. However, the code must take care
-// to avoid address errors. The sanitizers can't distinguish between "the
+// implementation, it is allowed to do this. However, the code must take care to
+// avoid address errors. The sanitizers can't distinguish between "the
 // implementation" and user-code, and so report an error. Therefore we can't use
-// them to check if functions like thees have memory errors.
+// them to check if functions like these have memory errors.
 //
 // This test uses mprotect to simulate address sanitization. Tests that read too
 // far outside data will segfault.



More information about the libc-commits mailing list