[libcxx-commits] [libcxx] [libc++][regex] Correctly adjust match prefix for zero-length matches. (PR #94550)
Konstantin Varlamov via libcxx-commits
libcxx-commits at lists.llvm.org
Thu Jun 6 16:56:35 PDT 2024
https://github.com/var-const updated https://github.com/llvm/llvm-project/pull/94550
>From 0a4aee48c8c924197f0da1d3dd4a17684285ec09 Mon Sep 17 00:00:00 2001
From: Konstantin Varlamov <varconst at apple.com>
Date: Wed, 5 Jun 2024 17:38:33 -0700
Subject: [PATCH 1/2] [libc++][regex] Correctly adjust match prefix for
zero-length matches.
---
libcxx/include/regex | 21 +-
.../re.regiter/re.regiter.incr/post.pass.cpp | 237 +++++++++++-------
2 files changed, 161 insertions(+), 97 deletions(-)
diff --git a/libcxx/include/regex b/libcxx/include/regex
index b3869d36de1df..46968357d3974 100644
--- a/libcxx/include/regex
+++ b/libcxx/include/regex
@@ -4700,6 +4700,9 @@ private:
template <class, class>
friend class __lookahead;
+
+ template <class, class, class>
+ friend class regex_iterator;
};
template <class _BidirectionalIterator, class _Allocator>
@@ -5410,7 +5413,9 @@ template <class _BidirectionalIterator, class _CharT, class _Traits>
regex_iterator<_BidirectionalIterator, _CharT, _Traits>&
regex_iterator<_BidirectionalIterator, _CharT, _Traits>::operator++() {
__flags_ |= regex_constants::__no_update_pos;
- _BidirectionalIterator __start = __match_[0].second;
+ _BidirectionalIterator __start = __match_[0].second;
+ _BidirectionalIterator __prefix_start = __start;
+
if (__match_[0].first == __match_[0].second) {
if (__start == __end_) {
__match_ = value_type();
@@ -5424,9 +5429,21 @@ regex_iterator<_BidirectionalIterator, _CharT, _Traits>::operator++() {
else
++__start;
}
+
__flags_ |= regex_constants::match_prev_avail;
- if (!std::regex_search(__start, __end_, __match_, *__pregex_, __flags_))
+ if (!std::regex_search(__start, __end_, __match_, *__pregex_, __flags_)) {
__match_ = value_type();
+
+ } else {
+ // The Standard mandates that if `regex_search` returns true ([re.regiter.incr]), "`match.prefix().first` shall be
+ // equal to the previous value of `match[0].second`... It is unspecified how the implementation makes these
+ // adjustments." The adjustment is necessary if we incremented `__start` above (the branch that deals with
+ // zero-length matches).
+ auto& __prefix = __match_.__prefix_;
+ __prefix.first = __prefix_start;
+ __prefix.matched = __prefix.first != __prefix.second;
+ }
+
return *this;
}
diff --git a/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp b/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
index 9332158f0de95..c60e5b131f64f 100644
--- a/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
+++ b/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
@@ -17,102 +17,149 @@
#include <iterator>
#include "test_macros.h"
-int main(int, char**)
-{
- {
- std::regex phone_numbers("\\d{3}-\\d{4}");
- const char phone_book[] = "555-1234, 555-2345, 555-3456";
- std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
- std::cregex_iterator i2 = i;
- assert(i != std::cregex_iterator());
- assert(i2!= std::cregex_iterator());
- assert((*i).size() == 1);
- assert((*i).position() == 0);
- assert((*i).str() == "555-1234");
- assert((*i2).size() == 1);
- assert((*i2).position() == 0);
- assert((*i2).str() == "555-1234");
- i++;
- assert(i != std::cregex_iterator());
- assert(i2!= std::cregex_iterator());
- assert((*i).size() == 1);
- assert((*i).position() == 10);
- assert((*i).str() == "555-2345");
- assert((*i2).size() == 1);
- assert((*i2).position() == 0);
- assert((*i2).str() == "555-1234");
- i++;
- assert(i != std::cregex_iterator());
- assert(i2!= std::cregex_iterator());
- assert((*i).size() == 1);
- assert((*i).position() == 20);
- assert((*i).str() == "555-3456");
- assert((*i2).size() == 1);
- assert((*i2).position() == 0);
- assert((*i2).str() == "555-1234");
- i++;
- assert(i == std::cregex_iterator());
- assert(i2!= std::cregex_iterator());
- assert((*i2).size() == 1);
- assert((*i2).position() == 0);
- assert((*i2).str() == "555-1234");
- }
- {
- std::regex phone_numbers("\\d{3}-\\d{4}");
- const char phone_book[] = "555-1234, 555-2345, 555-3456";
- std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
- std::cregex_iterator i2 = i;
- assert(i != std::cregex_iterator());
- assert(i2!= std::cregex_iterator());
- assert((*i).size() == 1);
- assert((*i).position() == 0);
- assert((*i).str() == "555-1234");
- assert((*i2).size() == 1);
- assert((*i2).position() == 0);
- assert((*i2).str() == "555-1234");
- ++i;
- assert(i != std::cregex_iterator());
- assert(i2!= std::cregex_iterator());
- assert((*i).size() == 1);
- assert((*i).position() == 10);
- assert((*i).str() == "555-2345");
- assert((*i2).size() == 1);
- assert((*i2).position() == 0);
- assert((*i2).str() == "555-1234");
- ++i;
- assert(i != std::cregex_iterator());
- assert(i2!= std::cregex_iterator());
- assert((*i).size() == 1);
- assert((*i).position() == 20);
- assert((*i).str() == "555-3456");
- assert((*i2).size() == 1);
- assert((*i2).position() == 0);
- assert((*i2).str() == "555-1234");
- ++i;
- assert(i == std::cregex_iterator());
- assert(i2!= std::cregex_iterator());
- assert((*i2).size() == 1);
- assert((*i2).position() == 0);
- assert((*i2).str() == "555-1234");
- }
- { // https://llvm.org/PR33681
- std::regex rex(".*");
- const char foo[] = "foo";
+int main(int, char**) {
+ {
+ std::regex phone_numbers("\\d{3}-\\d{4}");
+ const char phone_book[] = "555-1234, 555-2345, 555-3456";
+ std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
+ std::cregex_iterator i2 = i;
+ assert(i != std::cregex_iterator());
+ assert(i2 != std::cregex_iterator());
+ assert((*i).size() == 1);
+ assert((*i).position() == 0);
+ assert((*i).str() == "555-1234");
+ assert((*i2).size() == 1);
+ assert((*i2).position() == 0);
+ assert((*i2).str() == "555-1234");
+ i++;
+ assert(i != std::cregex_iterator());
+ assert(i2 != std::cregex_iterator());
+ assert((*i).size() == 1);
+ assert((*i).position() == 10);
+ assert((*i).str() == "555-2345");
+ assert((*i2).size() == 1);
+ assert((*i2).position() == 0);
+ assert((*i2).str() == "555-1234");
+ i++;
+ assert(i != std::cregex_iterator());
+ assert(i2 != std::cregex_iterator());
+ assert((*i).size() == 1);
+ assert((*i).position() == 20);
+ assert((*i).str() == "555-3456");
+ assert((*i2).size() == 1);
+ assert((*i2).position() == 0);
+ assert((*i2).str() == "555-1234");
+ i++;
+ assert(i == std::cregex_iterator());
+ assert(i2 != std::cregex_iterator());
+ assert((*i2).size() == 1);
+ assert((*i2).position() == 0);
+ assert((*i2).str() == "555-1234");
+ }
+ {
+ std::regex phone_numbers("\\d{3}-\\d{4}");
+ const char phone_book[] = "555-1234, 555-2345, 555-3456";
+ std::cregex_iterator i(std::begin(phone_book), std::end(phone_book), phone_numbers);
+ std::cregex_iterator i2 = i;
+ assert(i != std::cregex_iterator());
+ assert(i2 != std::cregex_iterator());
+ assert((*i).size() == 1);
+ assert((*i).position() == 0);
+ assert((*i).str() == "555-1234");
+ assert((*i2).size() == 1);
+ assert((*i2).position() == 0);
+ assert((*i2).str() == "555-1234");
+ ++i;
+ assert(i != std::cregex_iterator());
+ assert(i2 != std::cregex_iterator());
+ assert((*i).size() == 1);
+ assert((*i).position() == 10);
+ assert((*i).str() == "555-2345");
+ assert((*i2).size() == 1);
+ assert((*i2).position() == 0);
+ assert((*i2).str() == "555-1234");
+ ++i;
+ assert(i != std::cregex_iterator());
+ assert(i2 != std::cregex_iterator());
+ assert((*i).size() == 1);
+ assert((*i).position() == 20);
+ assert((*i).str() == "555-3456");
+ assert((*i2).size() == 1);
+ assert((*i2).position() == 0);
+ assert((*i2).str() == "555-1234");
+ ++i;
+ assert(i == std::cregex_iterator());
+ assert(i2 != std::cregex_iterator());
+ assert((*i2).size() == 1);
+ assert((*i2).position() == 0);
+ assert((*i2).str() == "555-1234");
+ }
+ { // https://llvm.org/PR33681
+ std::regex rex(".*");
+ const char foo[] = "foo";
// The -1 is because we don't want the implicit null from the array.
- std::cregex_iterator i(std::begin(foo), std::end(foo) - 1, rex);
- std::cregex_iterator e;
- assert(i != e);
- assert((*i).size() == 1);
- assert((*i).str() == "foo");
-
- ++i;
- assert(i != e);
- assert((*i).size() == 1);
- assert((*i).str() == "");
-
- ++i;
- assert(i == e);
- }
+ std::cregex_iterator i(std::begin(foo), std::end(foo) - 1, rex);
+ std::cregex_iterator e;
+ assert(i != e);
+ assert((*i).size() == 1);
+ assert((*i).str() == "foo");
+
+ ++i;
+ assert(i != e);
+ assert((*i).size() == 1);
+ assert((*i).str() == "");
+
+ ++i;
+ assert(i == e);
+ }
+
+ {
+ // Check that we correctly adjust the match prefix when dealing with zero-length matches -- this is explicitly
+ // required by the Standard ([re.regiter.incr]: "In all cases in which the call to `regex_search` returns true,
+ // `match.prefix().first` shall be equal to the previous value of `match[0].second`"). For a pattern that matches
+ // empty sequences, there is an implicit zero-length match between every character in a string -- make sure the
+ // prefix of each of these matches (except the first one) is the preceding character.
+
+ auto validate = [](const std::regex& empty_matching_pattern) {
+ const char source[] = "abc";
+
+ std::cregex_iterator i(source, source + 3, empty_matching_pattern);
+ assert(!i->prefix().matched);
+ assert(i->prefix().length() == 0);
+ assert(i->prefix().first == source);
+ assert(i->prefix().second == source);
+
+ ++i;
+ assert(i->prefix().matched);
+ assert(i->prefix().length() == 1);
+ assert(i->prefix().first == source);
+ assert(i->prefix().second == source + 1);
+ assert(i->prefix().str() == "a");
+
+ ++i;
+ assert(i->prefix().matched);
+ assert(i->prefix().length() == 1);
+ assert(i->prefix().first == source + 1);
+ assert(i->prefix().second == source + 2);
+ assert(i->prefix().str() == "b");
+
+ ++i;
+ assert(i->prefix().matched);
+ assert(i->prefix().length() == 1);
+ assert(i->prefix().first == source + 2);
+ assert(i->prefix().second == source + 3);
+ assert(i->prefix().str() == "c");
+
+ ++i;
+ assert(i == std::cregex_iterator());
+ };
+
+ // An empty pattern produces zero-length matches.
+ validate(std::regex(""));
+ // Any character repeated zero or more times can produce zero-length matches.
+ validate(std::regex("X*"));
+ validate(std::regex("X{0,3}"));
+ }
return 0;
}
>From 71395b1866af055dd243c809ebbeb8774b809b47 Mon Sep 17 00:00:00 2001
From: Konstantin Varlamov <varconst at apple.com>
Date: Thu, 6 Jun 2024 16:56:11 -0700
Subject: [PATCH 2/2] Feedback
---
.../zero_length_matches.pass.cpp | 37 +++++++
.../re.regiter/re.regiter.incr/post.pass.cpp | 96 ++++++++++---------
2 files changed, 86 insertions(+), 47 deletions(-)
create mode 100644 libcxx/test/std/re/re.alg/re.alg.replace/zero_length_matches.pass.cpp
diff --git a/libcxx/test/std/re/re.alg/re.alg.replace/zero_length_matches.pass.cpp b/libcxx/test/std/re/re.alg/re.alg.replace/zero_length_matches.pass.cpp
new file mode 100644
index 0000000000000..2f1b6280deb78
--- /dev/null
+++ b/libcxx/test/std/re/re.alg/re.alg.replace/zero_length_matches.pass.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <regex>
+
+// Test that replacing zero-length matches works correctly.
+
+#include <cassert>
+#include <regex>
+#include <string>
+#include "test_macros.h"
+
+int main(int, char**) {
+ // Various patterns that produce zero-length matches.
+ assert(std::regex_replace("abc", std::regex(""), "!") == "!a!b!c!");
+ assert(std::regex_replace("abc", std::regex("X*"), "!") == "!a!b!c!");
+ assert(std::regex_replace("abc", std::regex("X{0,3}"), "!") == "!a!b!c!");
+
+ // Replacement string has several characters.
+ assert(std::regex_replace("abc", std::regex(""), "[!]") == "[!]a[!]b[!]c[!]");
+
+ // Empty replacement string.
+ assert(std::regex_replace("abc", std::regex(""), "") == "abc");
+
+ // Empty input.
+ assert(std::regex_replace("", std::regex(""), "!") == "!");
+
+ // Not all matches are zero-length.
+ assert(std::regex_replace("abCabCa", std::regex("C*"), "!") == "!a!b!!a!b!!a!");
+
+ return 0;
+}
diff --git a/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp b/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
index c60e5b131f64f..aefe86f152980 100644
--- a/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
+++ b/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
@@ -17,6 +17,54 @@
#include <iterator>
#include "test_macros.h"
+void validate_prefixes(const std::regex& empty_matching_pattern) {
+ const char source[] = "abc";
+
+ std::cregex_iterator i(source, source + 3, empty_matching_pattern);
+ assert(!i->prefix().matched);
+ assert(i->prefix().length() == 0);
+ assert(i->prefix().first == source);
+ assert(i->prefix().second == source);
+
+ ++i;
+ assert(i->prefix().matched);
+ assert(i->prefix().length() == 1);
+ assert(i->prefix().first == source);
+ assert(i->prefix().second == source + 1);
+ assert(i->prefix().str() == "a");
+
+ ++i;
+ assert(i->prefix().matched);
+ assert(i->prefix().length() == 1);
+ assert(i->prefix().first == source + 1);
+ assert(i->prefix().second == source + 2);
+ assert(i->prefix().str() == "b");
+
+ ++i;
+ assert(i->prefix().matched);
+ assert(i->prefix().length() == 1);
+ assert(i->prefix().first == source + 2);
+ assert(i->prefix().second == source + 3);
+ assert(i->prefix().str() == "c");
+
+ ++i;
+ assert(i == std::cregex_iterator());
+}
+
+void test_prefix_adjustment() {
+ // Check that we correctly adjust the match prefix when dealing with zero-length matches -- this is explicitly
+ // required by the Standard ([re.regiter.incr]: "In all cases in which the call to `regex_search` returns true,
+ // `match.prefix().first` shall be equal to the previous value of `match[0].second`"). For a pattern that matches
+ // empty sequences, there is an implicit zero-length match between every character in a string -- make sure the
+ // prefix of each of these matches (except the first one) is the preceding character.
+
+ // An empty pattern produces zero-length matches.
+ validate_prefixes(std::regex(""));
+ // Any character repeated zero or more times can produce zero-length matches.
+ validate_prefixes(std::regex("X*"));
+ validate_prefixes(std::regex("X{0,3}"));
+}
+
int main(int, char**) {
{
std::regex phone_numbers("\\d{3}-\\d{4}");
@@ -113,53 +161,7 @@ int main(int, char**) {
assert(i == e);
}
- {
- // Check that we correctly adjust the match prefix when dealing with zero-length matches -- this is explicitly
- // required by the Standard ([re.regiter.incr]: "In all cases in which the call to `regex_search` returns true,
- // `match.prefix().first` shall be equal to the previous value of `match[0].second`"). For a pattern that matches
- // empty sequences, there is an implicit zero-length match between every character in a string -- make sure the
- // prefix of each of these matches (except the first one) is the preceding character.
-
- auto validate = [](const std::regex& empty_matching_pattern) {
- const char source[] = "abc";
-
- std::cregex_iterator i(source, source + 3, empty_matching_pattern);
- assert(!i->prefix().matched);
- assert(i->prefix().length() == 0);
- assert(i->prefix().first == source);
- assert(i->prefix().second == source);
-
- ++i;
- assert(i->prefix().matched);
- assert(i->prefix().length() == 1);
- assert(i->prefix().first == source);
- assert(i->prefix().second == source + 1);
- assert(i->prefix().str() == "a");
-
- ++i;
- assert(i->prefix().matched);
- assert(i->prefix().length() == 1);
- assert(i->prefix().first == source + 1);
- assert(i->prefix().second == source + 2);
- assert(i->prefix().str() == "b");
-
- ++i;
- assert(i->prefix().matched);
- assert(i->prefix().length() == 1);
- assert(i->prefix().first == source + 2);
- assert(i->prefix().second == source + 3);
- assert(i->prefix().str() == "c");
-
- ++i;
- assert(i == std::cregex_iterator());
- };
-
- // An empty pattern produces zero-length matches.
- validate(std::regex(""));
- // Any character repeated zero or more times can produce zero-length matches.
- validate(std::regex("X*"));
- validate(std::regex("X{0,3}"));
- }
+ test_prefix_adjustment();
return 0;
}
More information about the libcxx-commits
mailing list