[libc-commits] [libc] [libc] fix strsep()/strtok()/strtok_r() "subsequent searches" behavior. (PR #153652)
via libc-commits
libc-commits at lists.llvm.org
Thu Aug 14 13:20:13 PDT 2025
https://github.com/enh-google updated https://github.com/llvm/llvm-project/pull/153652
>From 4fc980176d804aaf5b07c8a00af7b7bb5284df31 Mon Sep 17 00:00:00 2001
From: enh-google <enh at google.com>
Date: Thu, 31 Jul 2025 15:58:46 -0400
Subject: [PATCH 1/3] [libc] Fix wcstok() "subsequent searches" behavior.
You're allowed to keep calling wcstok() after the first nullptr, and should keep getting nullptr.
---
libc/src/wchar/wcstok.cpp | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/libc/src/wchar/wcstok.cpp b/libc/src/wchar/wcstok.cpp
index ed4f0aad08ea5..85513a6ecfb93 100644
--- a/libc/src/wchar/wcstok.cpp
+++ b/libc/src/wchar/wcstok.cpp
@@ -27,17 +27,22 @@ LLVM_LIBC_FUNCTION(wchar_t *, wcstok,
wchar_t *tok_start = str;
while (*tok_start != L'\0' && internal::wcschr(delims, *tok_start))
++tok_start;
+ if (*tok_start == L'\0') {
+ *context = nullptr;
+ return nullptr;
+ }
wchar_t *tok_end = tok_start;
while (*tok_end != L'\0' && !internal::wcschr(delims, *tok_end))
++tok_end;
- if (*tok_end != L'\0') {
+ if (*tok_end == L'\0') {
+ *context = nullptr;
+ } else {
*tok_end = L'\0';
- ++tok_end;
+ *context = tok_end + 1;
}
- *context = tok_end;
- return *tok_start == L'\0' ? nullptr : tok_start;
+ return tok_start;
}
} // namespace LIBC_NAMESPACE_DECL
>From 10d198b4e9550f9d87805e017227417c218f7810 Mon Sep 17 00:00:00 2001
From: enh-google <enh at google.com>
Date: Thu, 31 Jul 2025 16:31:56 -0400
Subject: [PATCH 2/3] [libc] test wcstok() "subsequent searches" behavior.
---
libc/test/src/wchar/wcstok_test.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/libc/test/src/wchar/wcstok_test.cpp b/libc/test/src/wchar/wcstok_test.cpp
index 7106e9f2fab5e..3bb1014aff3ab 100644
--- a/libc/test/src/wchar/wcstok_test.cpp
+++ b/libc/test/src/wchar/wcstok_test.cpp
@@ -19,6 +19,8 @@ TEST(LlvmLibcWCSTokReentrantTest, NoTokenFound) {
// Another call to ensure that 'reserve' is not in a bad state.
ASSERT_EQ(LIBC_NAMESPACE::wcstok(empty, L"", &reserve), nullptr);
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"", &reserve), nullptr);
+ // Subsequent searches still return nullptr.
+ ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"", &reserve), nullptr);
}
{ // Empty source and single character delimiter string.
wchar_t empty[] = L"";
@@ -27,6 +29,8 @@ TEST(LlvmLibcWCSTokReentrantTest, NoTokenFound) {
// Another call to ensure that 'reserve' is not in a bad state.
ASSERT_EQ(LIBC_NAMESPACE::wcstok(empty, L"_", &reserve), nullptr);
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"_", &reserve), nullptr);
+ // Subsequent searches still return nullptr.
+ ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"_", &reserve), nullptr);
}
{ // Same character source and delimiter string.
wchar_t single[] = L"_";
@@ -35,6 +39,8 @@ TEST(LlvmLibcWCSTokReentrantTest, NoTokenFound) {
// Another call to ensure that 'reserve' is not in a bad state.
ASSERT_EQ(LIBC_NAMESPACE::wcstok(single, L"_", &reserve), nullptr);
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"_", &reserve), nullptr);
+ // Subsequent searches still return nullptr.
+ ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L"_", &reserve), nullptr);
}
{ // Multiple character source and single character delimiter string.
wchar_t multiple[] = L"1,2";
@@ -51,6 +57,8 @@ TEST(LlvmLibcWCSTokReentrantTest, NoTokenFound) {
ASSERT_TRUE(tok[2] == L'2');
ASSERT_TRUE(tok[3] == L'\0');
ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L":", &reserve), nullptr);
+ // Subsequent searches still return nullptr.
+ ASSERT_EQ(LIBC_NAMESPACE::wcstok(nullptr, L":", &reserve), nullptr);
}
}
>From 9a9dff3216fa411ca4fcbc7769230673653702b5 Mon Sep 17 00:00:00 2001
From: enh-google <enh at google.com>
Date: Thu, 14 Aug 2025 15:28:56 -0400
Subject: [PATCH 3/3] Fix strsep()/strtok()/strtok_r() "subsequent searches"
behavior.
---
libc/src/string/string_utils.h | 41 ++++++++++++++------------
libc/test/src/string/strsep_test.cpp | 8 +++++
libc/test/src/string/strtok_r_test.cpp | 9 ++++++
libc/test/src/string/strtok_test.cpp | 7 +++++
4 files changed, 46 insertions(+), 19 deletions(-)
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index 80e5783c7890b..20d3b223c822d 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -172,7 +172,7 @@ LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
return static_cast<size_t>(src - initial);
}
-// Given the similarities between strtok and strtok_r, we can implement both
+// Given the similarities between strsep/strtok/strtok_r, we can implement them
// using a utility function. On the first call, 'src' is scanned for the
// first character not found in 'delimiter_string'. Once found, it scans until
// the first character in the 'delimiter_string' or the null terminator is
@@ -184,33 +184,36 @@ LIBC_INLINE size_t complementary_span(const char *src, const char *segment) {
template <bool SkipDelim = true>
LIBC_INLINE char *string_token(char *__restrict src,
const char *__restrict delimiter_string,
- char **__restrict saveptr) {
- // Return nullptr immediately if both src AND saveptr are nullptr
- if (LIBC_UNLIKELY(src == nullptr && ((src = *saveptr) == nullptr)))
+ char **__restrict context) {
+ // Return nullptr immediately if both src AND context are nullptr
+ if (LIBC_UNLIKELY(src == nullptr && ((src = *context) == nullptr)))
return nullptr;
static_assert(CHAR_BIT == 8, "bitset of 256 assumes char is 8 bits");
- cpp::bitset<256> delimiter_set;
+ cpp::bitset<256> delims;
for (; *delimiter_string != '\0'; ++delimiter_string)
- delimiter_set.set(static_cast<size_t>(*delimiter_string));
+ delims.set(static_cast<size_t>(*delimiter_string));
+ char *tok_start = src;
if constexpr (SkipDelim)
- for (; *src != '\0' && delimiter_set.test(static_cast<size_t>(*src)); ++src)
- ;
- if (*src == '\0') {
- *saveptr = src;
+ while (*tok_start != '\0' && delims.test(static_cast<size_t>(*tok_start)))
+ ++tok_start;
+ if (*tok_start == '\0' && SkipDelim) {
+ *context = nullptr;
return nullptr;
}
- char *token = src;
- for (; *src != '\0'; ++src) {
- if (delimiter_set.test(static_cast<size_t>(*src))) {
- *src = '\0';
- ++src;
- break;
- }
+
+ char *tok_end = tok_start;
+ while (*tok_end != '\0' && !delims.test(static_cast<size_t>(*tok_end)))
+ ++tok_end;
+
+ if (*tok_end == '\0') {
+ *context = nullptr;
+ } else {
+ *tok_end = '\0';
+ *context = tok_end + 1;
}
- *saveptr = src;
- return token;
+ return tok_start;
}
LIBC_INLINE size_t strlcpy(char *__restrict dst, const char *__restrict src,
diff --git a/libc/test/src/string/strsep_test.cpp b/libc/test/src/string/strsep_test.cpp
index 06318dea4cb68..f902fd1b6d5c2 100644
--- a/libc/test/src/string/strsep_test.cpp
+++ b/libc/test/src/string/strsep_test.cpp
@@ -53,6 +53,14 @@ TEST(LlvmLibcStrsepTest, DelimitersShouldNotBeIncludedInToken) {
}
}
+TEST(LlvmLibcStrsepTest, SubsequentSearchesReturnNull) {
+ char s[] = "a";
+ char *string = s;
+ ASSERT_STREQ(LIBC_NAMESPACE::strsep(&string, ":"), "a");
+ ASSERT_EQ(LIBC_NAMESPACE::strsep(&string, ":"), nullptr);
+ ASSERT_EQ(LIBC_NAMESPACE::strsep(&string, ":"), nullptr);
+}
+
#if defined(LIBC_ADD_NULL_CHECKS)
TEST(LlvmLibcStrsepTest, CrashOnNullPtr) {
diff --git a/libc/test/src/string/strtok_r_test.cpp b/libc/test/src/string/strtok_r_test.cpp
index fdc27bae23c97..a19390d0b0c2d 100644
--- a/libc/test/src/string/strtok_r_test.cpp
+++ b/libc/test/src/string/strtok_r_test.cpp
@@ -122,3 +122,12 @@ TEST(LlvmLibcStrTokReentrantTest, DelimitersShouldNotBeIncludedInToken) {
token = LIBC_NAMESPACE::strtok_r(nullptr, "_:,_", &reserve);
ASSERT_STREQ(token, nullptr);
}
+
+TEST(LlvmLibcStrTokReentrantTest, SubsequentSearchesReturnNull) {
+ char src[] = "a";
+ char *reserve = nullptr;
+ char *token = LIBC_NAMESPACE::strtok_r(src, ":", &reserve);
+ ASSERT_STREQ(token, "a");
+ ASSERT_EQ(LIBC_NAMESPACE::strtok_r(nullptr, ":", &reserve), nullptr);
+ ASSERT_EQ(LIBC_NAMESPACE::strtok_r(nullptr, ":", &reserve), nullptr);
+}
diff --git a/libc/test/src/string/strtok_test.cpp b/libc/test/src/string/strtok_test.cpp
index b82065309e00c..76efeddda6f4a 100644
--- a/libc/test/src/string/strtok_test.cpp
+++ b/libc/test/src/string/strtok_test.cpp
@@ -76,3 +76,10 @@ TEST(LlvmLibcStrTokTest, DelimitersShouldNotBeIncludedInToken) {
token = LIBC_NAMESPACE::strtok(nullptr, "_:,_");
ASSERT_STREQ(token, nullptr);
}
+
+TEST(LlvmLibcStrTokTest, SubsequentSearchesReturnNull) {
+ char src[] = "a";
+ ASSERT_STREQ("a", LIBC_NAMESPACE::strtok(src, ":"));
+ ASSERT_EQ(LIBC_NAMESPACE::strtok(nullptr, ":"), nullptr);
+ ASSERT_EQ(LIBC_NAMESPACE::strtok(nullptr, ":"), nullptr);
+}
More information about the libc-commits
mailing list