[libc-commits] [libc] [llvm] [libc] fixed signed char issues in strsep()/strtok()/strtok_r(). (PR #156705)

via libc-commits libc-commits at lists.llvm.org
Wed Sep 3 09:16:46 PDT 2025


https://github.com/enh-google created https://github.com/llvm/llvm-project/pull/156705

Also add the missing tests for all the related functions (even the ones that were already right), and add the missing bazel build rules.

From 125bbdb62884228ae657aae89f754803fbdcfcf7 Mon Sep 17 00:00:00 2001
From: Elliott Hughes <enh at google.com>
Date: Wed, 3 Sep 2025 16:13:46 +0000
Subject: [PATCH] [libc] fixed signed char issues in
 strsep()/strtok()/strtok_r().

Also add the missing tests for all the related functions (even the ones
that were already right), and add the missing bazel build rules.
---
 libc/src/string/string_utils.h                   | 14 +++++++-------
 libc/test/src/string/strcspn_test.cpp            |  4 ++++
 libc/test/src/string/strpbrk_test.cpp            |  4 ++++
 libc/test/src/string/strsep_test.cpp             |  8 ++++++++
 libc/test/src/string/strspn_test.cpp             |  4 ++++
 libc/test/src/string/strtok_r_test.cpp           |  8 ++++++++
 libc/test/src/string/strtok_test.cpp             |  7 +++++++
 .../bazel/llvm-project-overlay/libc/BUILD.bazel  | 10 ++++++++++
 .../libc/test/src/string/BUILD.bazel             | 16 ++++++++++++++++
 9 files changed, 68 insertions(+), 7 deletions(-)

diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index 26e9adde0d66e..10803488b6cf5 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -212,28 +212,28 @@ LIBC_INLINE char *string_token(char *__restrict src,
   static_assert(CHAR_BIT == 8, "bitset of 256 assumes char is 8 bits");
   cpp::bitset<256> delims;
   for (; *delimiter_string != '\0'; ++delimiter_string)
-    delims.set(static_cast<size_t>(*delimiter_string));
+    delims.set(*reinterpret_cast<const unsigned char *>(delimiter_string));
 
-  char *tok_start = src;
+  unsigned char *tok_start = reinterpret_cast<unsigned char *>(src);
   if constexpr (SkipDelim)
-    while (*tok_start != '\0' && delims.test(static_cast<size_t>(*tok_start)))
+    while (*tok_start != '\0' && delims.test(*tok_start))
       ++tok_start;
   if (*tok_start == '\0' && SkipDelim) {
     *context = nullptr;
     return nullptr;
   }
 
-  char *tok_end = tok_start;
-  while (*tok_end != '\0' && !delims.test(static_cast<size_t>(*tok_end)))
+  unsigned char *tok_end = tok_start;
+  while (*tok_end != '\0' && !delims.test(*tok_end))
     ++tok_end;
 
   if (*tok_end == '\0') {
     *context = nullptr;
   } else {
     *tok_end = '\0';
-    *context = tok_end + 1;
+    *context = reinterpret_cast<char *>(tok_end + 1);
   }
-  return tok_start;
+  return reinterpret_cast<char *>(tok_start);
 }
 
 LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
diff --git a/libc/test/src/string/strcspn_test.cpp b/libc/test/src/string/strcspn_test.cpp
index d83b3cf4fdfe8..ec98f72e37113 100644
--- a/libc/test/src/string/strcspn_test.cpp
+++ b/libc/test/src/string/strcspn_test.cpp
@@ -48,3 +48,7 @@ TEST(LlvmLibcStrCSpnTest, DuplicatedCharactersNotPartOfComplementarySpan) {
   EXPECT_EQ(LIBC_NAMESPACE::strcspn("aaaa", "aa"), size_t{0});
   EXPECT_EQ(LIBC_NAMESPACE::strcspn("aaaa", "baa"), size_t{0});
 }
+
+TEST(LlvmLibcStrCSpnTest, TopBitSet) {
+  EXPECT_EQ(LIBC_NAMESPACE::strcspn("hello\x80world", "\x80"), size_t{5});
+}
diff --git a/libc/test/src/string/strpbrk_test.cpp b/libc/test/src/string/strpbrk_test.cpp
index fbe14da12ac10..cc802460d10be 100644
--- a/libc/test/src/string/strpbrk_test.cpp
+++ b/libc/test/src/string/strpbrk_test.cpp
@@ -60,3 +60,7 @@ TEST(LlvmLibcStrPBrkTest, FindsFirstOfRepeated) {
 TEST(LlvmLibcStrPBrkTest, FindsFirstInBreakset) {
   EXPECT_STREQ(LIBC_NAMESPACE::strpbrk("12345", "34"), "345");
 }
+
+TEST(LlvmLibcStrPBrkTest, TopBitSet) {
+  EXPECT_STREQ(LIBC_NAMESPACE::strpbrk("hello\x80world", "\x80 "), "\x80world");
+}
diff --git a/libc/test/src/string/strsep_test.cpp b/libc/test/src/string/strsep_test.cpp
index e2a5d52bbeddb..553edd99604ef 100644
--- a/libc/test/src/string/strsep_test.cpp
+++ b/libc/test/src/string/strsep_test.cpp
@@ -61,6 +61,14 @@ TEST(LlvmLibcStrsepTest, SubsequentSearchesReturnNull) {
   ASSERT_EQ(LIBC_NAMESPACE::strsep(&string, ":"), nullptr);
 }
 
+TEST(LlvmLibcStrsepTest, TopBitSet) {
+  char top_bit_set_str[] = "hello\x80world";
+  char *p = top_bit_set_str;
+  ASSERT_STREQ(LIBC_NAMESPACE::strsep(&p, "\x80"), "hello");
+  ASSERT_STREQ(LIBC_NAMESPACE::strsep(&p, "\x80"), "world");
+  ASSERT_EQ(LIBC_NAMESPACE::strsep(&p, "\x80"), nullptr);
+}
+
 #if defined(LIBC_ADD_NULL_CHECKS)
 
 TEST(LlvmLibcStrsepTest, CrashOnNullPtr) {
diff --git a/libc/test/src/string/strspn_test.cpp b/libc/test/src/string/strspn_test.cpp
index 82f9b2aef0dfd..813612f09fc16 100644
--- a/libc/test/src/string/strspn_test.cpp
+++ b/libc/test/src/string/strspn_test.cpp
@@ -85,6 +85,10 @@ TEST(LlvmLibcStrSpnTest, DuplicatedCharactersToBeSearchedForShouldStillMatch) {
   EXPECT_EQ(LIBC_NAMESPACE::strspn("aaaa", "aa"), size_t{4});
 }
 
+TEST(LlvmLibcStrSpnTest, TopBitSet) {
+  EXPECT_EQ(LIBC_NAMESPACE::strspn("hello\x80world", "helo\x80rld"), size_t{6});
+}
+
 #if defined(LIBC_ADD_NULL_CHECKS)
 
 TEST(LlvmLibcStrSpnTest, CrashOnNullPtr) {
diff --git a/libc/test/src/string/strtok_r_test.cpp b/libc/test/src/string/strtok_r_test.cpp
index a19390d0b0c2d..8c4d3c362f778 100644
--- a/libc/test/src/string/strtok_r_test.cpp
+++ b/libc/test/src/string/strtok_r_test.cpp
@@ -131,3 +131,11 @@ TEST(LlvmLibcStrTokReentrantTest, SubsequentSearchesReturnNull) {
   ASSERT_EQ(LIBC_NAMESPACE::strtok_r(nullptr, ":", &reserve), nullptr);
   ASSERT_EQ(LIBC_NAMESPACE::strtok_r(nullptr, ":", &reserve), nullptr);
 }
+
+TEST(LlvmLibcStrTokReentrantTest, TopBitSet) {
+  char top_bit_set_str[] = "hello\x80world";
+  char *p;
+  ASSERT_STREQ(LIBC_NAMESPACE::strtok_r(top_bit_set_str, "\x80", &p), "hello");
+  ASSERT_STREQ(LIBC_NAMESPACE::strtok_r(nullptr, "\x80", &p), "world");
+  ASSERT_EQ(LIBC_NAMESPACE::strtok_r(nullptr, "\x80", &p), nullptr);
+}
diff --git a/libc/test/src/string/strtok_test.cpp b/libc/test/src/string/strtok_test.cpp
index 76efeddda6f4a..3c097fdee0713 100644
--- a/libc/test/src/string/strtok_test.cpp
+++ b/libc/test/src/string/strtok_test.cpp
@@ -83,3 +83,10 @@ TEST(LlvmLibcStrTokTest, SubsequentSearchesReturnNull) {
   ASSERT_EQ(LIBC_NAMESPACE::strtok(nullptr, ":"), nullptr);
   ASSERT_EQ(LIBC_NAMESPACE::strtok(nullptr, ":"), nullptr);
 }
+
+TEST(LlvmLibcStrTokTest, TopBitSet) {
+  char top_bit_set_str[] = "hello\x80world";
+  ASSERT_STREQ(LIBC_NAMESPACE::strtok(top_bit_set_str, "\x80"), "hello");
+  ASSERT_STREQ(LIBC_NAMESPACE::strtok(nullptr, "\x80"), "world");
+  ASSERT_EQ(LIBC_NAMESPACE::strtok(nullptr, "\x80"), nullptr);
+}
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index b2cd3fdd468af..acfd0d96a28bf 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -5251,6 +5251,16 @@ libc_function(
     ],
 )
 
+libc_function(
+    name = "strtok_r",
+    srcs = ["src/string/strtok_r.cpp"],
+    hdrs = ["src/string/strtok_r.h"],
+    deps = [
+        ":__support_common",
+        ":string_utils",
+    ],
+)
+
 ################################ fcntl targets #################################
 
 libc_function(
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel
index d90992417a721..1a95dece8bf20 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/string/BUILD.bazel
@@ -59,6 +59,14 @@ libc_test(
     ],
 )
 
+libc_test(
+    name = "strpbrk_test",
+    srcs = ["strpbrk_test.cpp"],
+    deps = [
+        "//libc:strpbrk",
+    ],
+)
+
 libc_test(
     name = "strsep_test",
     srcs = ["strsep_test.cpp"],
@@ -127,6 +135,14 @@ libc_test(
     ],
 )
 
+libc_test(
+    name = "strtok_r_test",
+    srcs = ["strtok_r_test.cpp"],
+    deps = [
+        "//libc:strtok_r",
+    ],
+)
+
 libc_test_library(
     name = "memory_check_utils",
     hdrs = ["memory_utils/memory_check_utils.h"],



More information about the libc-commits mailing list