[libcxx-commits] [libcxx] [libc++][regex] Correctly adjust match prefix for zero-length matches. (PR #94550)

via libcxx-commits libcxx-commits at lists.llvm.org
Wed Jun 5 17:39:46 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-libcxx

Author: Konstantin Varlamov (var-const)

<details>
<summary>Changes</summary>



---
Full diff: https://github.com/llvm/llvm-project/pull/94550.diff


2 Files Affected:

- (modified) libcxx/include/regex (+20-2) 
- (modified) libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp (+48) 


``````````diff
diff --git a/libcxx/include/regex b/libcxx/include/regex
index b3869d36de1df..8b2637ead328a 100644
--- a/libcxx/include/regex
+++ b/libcxx/include/regex
@@ -792,6 +792,7 @@ typedef regex_token_iterator<wstring::const_iterator> wsregex_token_iterator;
 #include <__algorithm/find.h>
 #include <__algorithm/search.h>
 #include <__assert>
+#include <__availability>
 #include <__config>
 #include <__iterator/back_insert_iterator.h>
 #include <__iterator/default_sentinel.h>
@@ -4700,6 +4701,9 @@ private:
 
   template <class, class>
   friend class __lookahead;
+
+  template <class, class, class>
+  friend class regex_iterator;
 };
 
 template <class _BidirectionalIterator, class _Allocator>
@@ -5410,7 +5414,9 @@ template <class _BidirectionalIterator, class _CharT, class _Traits>
 regex_iterator<_BidirectionalIterator, _CharT, _Traits>&
 regex_iterator<_BidirectionalIterator, _CharT, _Traits>::operator++() {
   __flags_ |= regex_constants::__no_update_pos;
-  _BidirectionalIterator __start = __match_[0].second;
+  _BidirectionalIterator __start        = __match_[0].second;
+  _BidirectionalIterator __prefix_start = __start;
+
   if (__match_[0].first == __match_[0].second) {
     if (__start == __end_) {
       __match_ = value_type();
@@ -5424,9 +5430,21 @@ regex_iterator<_BidirectionalIterator, _CharT, _Traits>::operator++() {
     else
       ++__start;
   }
+
   __flags_ |= regex_constants::match_prev_avail;
-  if (!std::regex_search(__start, __end_, __match_, *__pregex_, __flags_))
+  if (!std::regex_search(__start, __end_, __match_, *__pregex_, __flags_)) {
     __match_ = value_type();
+
+  } else {
+    // The Standard mandates that if `regex_search` returns true ([re.regiter.incr]), "`match.prefix().first` shall be
+    // equal to the previous value of `match[0].second`... It is unspecified how the implementation makes these
+    // adjustments." The adjustment is necessary if we incremented `__start` above (the branch that deals with
+    // zero-length matches).
+    auto& __prefix = __match_.__prefix_;
+    __prefix.first = __prefix_start;
+    __prefix.matched = __prefix.first != __prefix.second;
+  }
+
   return *this;
 }
 
diff --git a/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp b/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
index 9332158f0de95..596a71c70a484 100644
--- a/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
+++ b/libcxx/test/std/re/re.iter/re.regiter/re.regiter.incr/post.pass.cpp
@@ -114,5 +114,53 @@ int main(int, char**)
         assert(i == e);
     }
 
+  {
+    // Check that we correctly adjust the match prefix when dealing with zero-length matches -- this is explicitly
+    // required by the Standard ([re.regiter.incr]: "In all cases in which the call to `regex_search` returns true,
+    // `match.prefix().first` shall be equal to the previous value of `match[0].second`"). For a pattern that matches
+    // empty sequences, there is an implicit zero-length match at every position in a string (before, between, and
+    // after its characters) -- make sure the prefix of each match (except the first one) is the preceding character.
+
+    auto validate = [](const std::regex& empty_matching_pattern) {
+      const char source[] = "abc";
+
+      std::cregex_iterator i(source, source + 3, empty_matching_pattern);
+      assert(!i->prefix().matched);
+      assert(i->prefix().length() == 0);
+      assert(i->prefix().first == source);
+      assert(i->prefix().second == source);
+
+      ++i;
+      assert(i->prefix().matched);
+      assert(i->prefix().length() == 1);
+      assert(i->prefix().first == source);
+      assert(i->prefix().second == source + 1);
+      assert(i->prefix().str() == "a");
+
+      ++i;
+      assert(i->prefix().matched);
+      assert(i->prefix().length() == 1);
+      assert(i->prefix().first == source + 1);
+      assert(i->prefix().second == source + 2);
+      assert(i->prefix().str() == "b");
+
+      ++i;
+      assert(i->prefix().matched);
+      assert(i->prefix().length() == 1);
+      assert(i->prefix().first == source + 2);
+      assert(i->prefix().second == source + 3);
+      assert(i->prefix().str() == "c");
+
+      ++i;
+      assert(i == std::cregex_iterator());
+    };
+
+    // An empty pattern produces zero-length matches.
+    validate(std::regex(""));
+    // Any character repeated zero or more times can produce zero-length matches.
+    validate(std::regex("X*"));
+    validate(std::regex("X{0,3}"));
+  }
+
   return 0;
 }

``````````

</details>


https://github.com/llvm/llvm-project/pull/94550


More information about the libcxx-commits mailing list