[libcxx-commits] [libcxx] [libc++] Fix endianness for algorithm mismatch (PR #93082)

Zibi Sarbinowski via libcxx-commits libcxx-commits at lists.llvm.org
Mon Jun 10 06:36:36 PDT 2024


https://github.com/zibi2 updated https://github.com/llvm/llvm-project/pull/93082

>From e366cb1d384d08387bb2cb106c5991ec57a0fb55 Mon Sep 17 00:00:00 2001
From: Zbigniew Sarbinowski <zibi at ca.ibm.com>
Date: Wed, 22 May 2024 18:11:27 +0000
Subject: [PATCH 1/8] Fix endianness for algorithm mismatch

---
 libcxx/include/__algorithm/mismatch.h | 47 ++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index 632bec02406a4..a8219f0817a6c 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -56,6 +56,39 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
 
 #if _LIBCPP_VECTORIZE_ALGORITHMS
 
+template <class _Tp,
+          __enable_if_t<is_integral<_Tp>::value, int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 8>
+__reverse_vector(__simd_vector<_Tp, 8>& __cmp_res) {
+#if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<_Tp> == 8, "The __native_vector_size has to be 8");
+  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
+#endif
+  return __cmp_res;
+}
+
+template <class _Tp,
+          __enable_if_t<is_integral<_Tp>::value, int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 16>
+__reverse_vector(__simd_vector<_Tp, 16> __cmp_res) {
+#if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<_Tp> == 16, "The __native_vector_size has to be 16");
+  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+#endif
+  return __cmp_res;
+}
+
+template <class _Tp,
+          __enable_if_t<is_integral<_Tp>::value, int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 32>
+__reverse_vector(__simd_vector<_Tp, 32> __cmp_res) {
+#if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<_Tp> == 32, "The __native_vector_size has to be 32");
+  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+#endif
+  return __cmp_res;
+}
+
 template <class _Iter>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter>
 __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
@@ -77,7 +110,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       }
 
       for (size_t __i = 0; __i != __unroll_count; ++__i) {
-        if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
+        auto __cmp_res = __lhs[__i] == __rhs[__i];
+        __cmp_res = __reverse_vector<_Tp>(__cmp_res);
+        if (!std::__all_of(__cmp_res)) {
           auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
           return {__first1 + __offset, __first2 + __offset};
         }
@@ -89,8 +124,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
 
     // check the remaining 0-3 vectors
     while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
-      if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
-          !std::__all_of(__cmp_res)) {
+      auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
+      __cmp_res = __reverse_vector<_Tp>(__cmp_res);
+      if (!std::__all_of(__cmp_res)) {
         auto __offset = std::__find_first_not_set(__cmp_res);
         return {__first1 + __offset, __first2 + __offset};
       }
@@ -106,8 +142,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
     if (static_cast<size_t>(__first1 - __orig_first1) >= __vec_size) {
       __first1 = __last1 - __vec_size;
       __first2 = __last2 - __vec_size;
-      auto __offset =
-          std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
+      auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
+      __cmp_res = __reverse_vector<_Tp>(__cmp_res);
+      auto __offset = std::__find_first_not_set(__cmp_res);
       return {__first1 + __offset, __first2 + __offset};
     } // else loop over the elements individually
   }

>From 05395e0b07312d5a7594bdc1db46cf695001242c Mon Sep 17 00:00:00 2001
From: Zbigniew Sarbinowski <zibi at ca.ibm.com>
Date: Wed, 22 May 2024 18:21:04 +0000
Subject: [PATCH 2/8] Update based on the latest changes

---
 libcxx/include/__algorithm/mismatch.h | 83 ++++++++++++++++++---------
 1 file changed, 57 insertions(+), 26 deletions(-)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index a8219f0817a6c..7b4e7da35cf7f 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -56,36 +56,67 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
 
 #if _LIBCPP_VECTORIZE_ALGORITHMS
 
-template <class _Tp,
-          __enable_if_t<is_integral<_Tp>::value, int> = 0>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 8>
-__reverse_vector(__simd_vector<_Tp, 8>& __cmp_res) {
-#if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_Tp> == 8, "The __native_vector_size has to be 8");
+template <class _Value_type>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 8>
+__reverse_vector(__simd_vector<_Value_type, 8>& __cmp_res) {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<_Value_type> == 8, "The __native_vector_size has to be 8");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
-#endif
+#  endif
   return __cmp_res;
 }
 
-template <class _Tp,
-          __enable_if_t<is_integral<_Tp>::value, int> = 0>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 16>
-__reverse_vector(__simd_vector<_Tp, 16> __cmp_res) {
-#if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_Tp> == 16, "The __native_vector_size has to be 16");
+template <class _Value_type>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 16>
+__reverse_vector(__simd_vector<_Value_type, 16> __cmp_res) {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<_Value_type> == 16, "The __native_vector_size has to be 16");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-#endif
+#  endif
   return __cmp_res;
 }
 
-template <class _Tp,
-          __enable_if_t<is_integral<_Tp>::value, int> = 0>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Tp, 32>
-__reverse_vector(__simd_vector<_Tp, 32> __cmp_res) {
-#if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_Tp> == 32, "The __native_vector_size has to be 32");
-  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-#endif
+template <class _Value_type>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 32>
+__reverse_vector(__simd_vector<_Value_type, 32> __cmp_res) {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<_Value_type> == 32, "The __native_vector_size has to be 32");
+  __cmp_res = __builtin_shufflevector(
+      __cmp_res,
+      __cmp_res,
+      31,
+      30,
+      29,
+      28,
+      27,
+      26,
+      25,
+      24,
+      23,
+      22,
+      21,
+      20,
+      19,
+      18,
+      17,
+      16,
+      15,
+      14,
+      13,
+      12,
+      11,
+      10,
+      9,
+      8,
+      7,
+      6,
+      5,
+      4,
+      3,
+      2,
+      1,
+      0);
+#  endif
   return __cmp_res;
 }
 
@@ -111,7 +142,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
 
       for (size_t __i = 0; __i != __unroll_count; ++__i) {
         auto __cmp_res = __lhs[__i] == __rhs[__i];
-        __cmp_res = __reverse_vector<_Tp>(__cmp_res);
+        __cmp_res      = __reverse_vector<__value_type>(__cmp_res);
         if (!std::__all_of(__cmp_res)) {
           auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
           return {__first1 + __offset, __first2 + __offset};
@@ -125,7 +156,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
     // check the remaining 0-3 vectors
     while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
       auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
-      __cmp_res = __reverse_vector<_Tp>(__cmp_res);
+      __cmp_res      = __reverse_vector<__value_type>(__cmp_res);
       if (!std::__all_of(__cmp_res)) {
         auto __offset = std::__find_first_not_set(__cmp_res);
         return {__first1 + __offset, __first2 + __offset};
@@ -143,8 +174,8 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       __first1 = __last1 - __vec_size;
       __first2 = __last2 - __vec_size;
       auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
-      __cmp_res = __reverse_vector<_Tp>(__cmp_res);
-      auto __offset = std::__find_first_not_set(__cmp_res);
+      __cmp_res      = __reverse_vector<__value_type>(__cmp_res);
+      auto __offset  = std::__find_first_not_set(__cmp_res);
       return {__first1 + __offset, __first2 + __offset};
     } // else loop over the elements individually
   }

>From b4ce87242c5eb820fe6fde076ecbaedeb78b427e Mon Sep 17 00:00:00 2001
From: Zbigniew Sarbinowski <zibi at ca.ibm.com>
Date: Wed, 22 May 2024 22:45:20 +0000
Subject: [PATCH 3/8] Add more __reverse_vector overloads

---
 libcxx/include/__algorithm/mismatch.h | 70 +++++++++++++++++++++------
 1 file changed, 55 insertions(+), 15 deletions(-)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index 7b4e7da35cf7f..8c4b4a0e031ed 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -56,31 +56,71 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
 
 #if _LIBCPP_VECTORIZE_ALGORITHMS
 
-template <class _Value_type>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 8>
-__reverse_vector(__simd_vector<_Value_type, 8>& __cmp_res) {
+template <class _ValueType>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<long, 2>
+__reverse_vector(__simd_vector<long, 2>& __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_Value_type> == 8, "The __native_vector_size has to be 8");
+  static_assert(__native_vector_size<long> == 2, "The __native_vector_size has to be 2");
+  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0);
+#  endif
+  return __cmp_res;
+}
+
+template <class _ValueType>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<long, 4>
+__reverse_vector(__simd_vector<long, 4>& __cmp_res) {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<long> == 4, "The __native_vector_size has to be 4");
+  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0);
+#  endif
+  return __cmp_res;
+}
+
+template <class _ValueType>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<int, 8>
+__reverse_vector(__simd_vector<int, 8>& __cmp_res) {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<int> == 8, "The __native_vector_size has to be 8");
+  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
+#  endif
+  return __cmp_res;
+}
+
+template <class _ValueType>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<int, 4>
+__reverse_vector(__simd_vector<int, 4>& __cmp_res) {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<int> == 4, "The __native_vector_size has to be 4");
+  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0);
+#  endif
+  return __cmp_res;
+}
+
+template <class _ValueType>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 8>
+__reverse_vector(__simd_vector<_ValueType, 8>& __cmp_res) {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  static_assert(__native_vector_size<_ValueType> == 8, "The __native_vector_size has to be 8");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
 #  endif
   return __cmp_res;
 }
 
-template <class _Value_type>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 16>
-__reverse_vector(__simd_vector<_Value_type, 16> __cmp_res) {
+template <class _ValueType>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 16>
+__reverse_vector(__simd_vector<_ValueType, 16> __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_Value_type> == 16, "The __native_vector_size has to be 16");
+  static_assert(__native_vector_size<_ValueType> == 16, "The __native_vector_size has to be 16");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 #  endif
   return __cmp_res;
 }
 
-template <class _Value_type>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_Value_type, 32>
-__reverse_vector(__simd_vector<_Value_type, 32> __cmp_res) {
+template <class _ValueType>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 32>
+__reverse_vector(__simd_vector<_ValueType, 32> __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_Value_type> == 32, "The __native_vector_size has to be 32");
+  static_assert(__native_vector_size<_ValueType> == 32, "The __native_vector_size has to be 32");
   __cmp_res = __builtin_shufflevector(
       __cmp_res,
       __cmp_res,
@@ -142,7 +182,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
 
       for (size_t __i = 0; __i != __unroll_count; ++__i) {
         auto __cmp_res = __lhs[__i] == __rhs[__i];
-        __cmp_res      = __reverse_vector<__value_type>(__cmp_res);
+        __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
         if (!std::__all_of(__cmp_res)) {
           auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
           return {__first1 + __offset, __first2 + __offset};
@@ -156,7 +196,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
     // check the remaining 0-3 vectors
     while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
       auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
-      __cmp_res      = __reverse_vector<__value_type>(__cmp_res);
+      __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
       if (!std::__all_of(__cmp_res)) {
         auto __offset = std::__find_first_not_set(__cmp_res);
         return {__first1 + __offset, __first2 + __offset};
@@ -174,7 +214,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       __first1 = __last1 - __vec_size;
       __first2 = __last2 - __vec_size;
       auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
-      __cmp_res      = __reverse_vector<__value_type>(__cmp_res);
+      __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
       auto __offset  = std::__find_first_not_set(__cmp_res);
       return {__first1 + __offset, __first2 + __offset};
     } // else loop over the elements individually

>From 40cab2461b7efa5861fc01272baa3b196fb91c98 Mon Sep 17 00:00:00 2001
From: Zbigniew Sarbinowski <zibi at ca.ibm.com>
Date: Fri, 24 May 2024 19:16:57 +0000
Subject: [PATCH 4/8] Try to fix windows CI

---
 libcxx/include/__algorithm/mismatch.h | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index 8c4b4a0e031ed..8519ef76140ed 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -56,11 +56,19 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
 
 #if _LIBCPP_VECTORIZE_ALGORITHMS
 
+template <class _ValueType>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<long long, 2>
+__reverse_vector(__simd_vector<long long, 2>& __cmp_res) {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0);
+#  endif
+  return __cmp_res;
+}
+
 template <class _ValueType>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<long, 2>
 __reverse_vector(__simd_vector<long, 2>& __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<long> == 2, "The __native_vector_size has to be 2");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0);
 #  endif
   return __cmp_res;
@@ -70,7 +78,6 @@ template <class _ValueType>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<long, 4>
 __reverse_vector(__simd_vector<long, 4>& __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<long> == 4, "The __native_vector_size has to be 4");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0);
 #  endif
   return __cmp_res;
@@ -80,7 +87,6 @@ template <class _ValueType>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<int, 8>
 __reverse_vector(__simd_vector<int, 8>& __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<int> == 8, "The __native_vector_size has to be 8");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
 #  endif
   return __cmp_res;
@@ -90,7 +96,6 @@ template <class _ValueType>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<int, 4>
 __reverse_vector(__simd_vector<int, 4>& __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<int> == 4, "The __native_vector_size has to be 4");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0);
 #  endif
   return __cmp_res;
@@ -100,7 +105,6 @@ template <class _ValueType>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 8>
 __reverse_vector(__simd_vector<_ValueType, 8>& __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_ValueType> == 8, "The __native_vector_size has to be 8");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
 #  endif
   return __cmp_res;
@@ -110,7 +114,6 @@ template <class _ValueType>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 16>
 __reverse_vector(__simd_vector<_ValueType, 16> __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_ValueType> == 16, "The __native_vector_size has to be 16");
   __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 #  endif
   return __cmp_res;
@@ -120,7 +123,6 @@ template <class _ValueType>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 32>
 __reverse_vector(__simd_vector<_ValueType, 32> __cmp_res) {
 #  if defined(_LIBCPP_BIG_ENDIAN)
-  static_assert(__native_vector_size<_ValueType> == 32, "The __native_vector_size has to be 32");
   __cmp_res = __builtin_shufflevector(
       __cmp_res,
       __cmp_res,

>From 42f64c8d8ebee7b314d28bdf724a2ed2b2ed68b0 Mon Sep 17 00:00:00 2001
From: Zbigniew Sarbinowski <zibi at ca.ibm.com>
Date: Mon, 3 Jun 2024 18:30:36 +0000
Subject: [PATCH 5/8] Based on the suggestion, apply the variadic template
 technique to reduce code

---
 libcxx/include/__algorithm/mismatch.h | 139 ++++++--------------------
 1 file changed, 33 insertions(+), 106 deletions(-)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index 8519ef76140ed..bde3010b0c457 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -55,113 +55,34 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
 }
 
 #if _LIBCPP_VECTORIZE_ALGORITHMS
-
-template <class _ValueType>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<long long, 2>
-__reverse_vector(__simd_vector<long long, 2>& __cmp_res) {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0);
-#  endif
-  return __cmp_res;
-}
-
-template <class _ValueType>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<long, 2>
-__reverse_vector(__simd_vector<long, 2>& __cmp_res) {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 1, 0);
-#  endif
-  return __cmp_res;
-}
-
-template <class _ValueType>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<long, 4>
-__reverse_vector(__simd_vector<long, 4>& __cmp_res) {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0);
-#  endif
-  return __cmp_res;
-}
-
-template <class _ValueType>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<int, 8>
-__reverse_vector(__simd_vector<int, 8>& __cmp_res) {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
-#  endif
-  return __cmp_res;
-}
-
-template <class _ValueType>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<int, 4>
-__reverse_vector(__simd_vector<int, 4>& __cmp_res) {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 3, 2, 1, 0);
-#  endif
-  return __cmp_res;
+template <class _ValueType, size_t _Np>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<long long, _Np>
+__reverse_vector(__simd_vector<long long, _Np> __cmp_res) {
+  return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
+    return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
+  }(make_index_sequence<_Np>{});
+}
+template <class _ValueType, size_t _Np>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<long, _Np>
+__reverse_vector(__simd_vector<long, _Np> __cmp_res) {
+  return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
+    return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
+  }(make_index_sequence<_Np>{});
+}
+template <class _ValueType, size_t _Np>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<int, _Np>
+__reverse_vector(__simd_vector<int, _Np> __cmp_res) {
+  return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
+    return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
+  }(make_index_sequence<_Np>{});
+}
+template <class _ValueType, size_t _Np>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<_ValueType, _Np>
+__reverse_vector(__simd_vector<_ValueType, _Np> __cmp_res) {
+  return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
+    return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
+  }(make_index_sequence<_Np>{});
 }
-
-template <class _ValueType>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 8>
-__reverse_vector(__simd_vector<_ValueType, 8>& __cmp_res) {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 7, 6, 5, 4, 3, 2, 1, 0);
-#  endif
-  return __cmp_res;
-}
-
-template <class _ValueType>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 16>
-__reverse_vector(__simd_vector<_ValueType, 16> __cmp_res) {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-  __cmp_res = __builtin_shufflevector(__cmp_res, __cmp_res, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-#  endif
-  return __cmp_res;
-}
-
-template <class _ValueType>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __simd_vector<_ValueType, 32>
-__reverse_vector(__simd_vector<_ValueType, 32> __cmp_res) {
-#  if defined(_LIBCPP_BIG_ENDIAN)
-  __cmp_res = __builtin_shufflevector(
-      __cmp_res,
-      __cmp_res,
-      31,
-      30,
-      29,
-      28,
-      27,
-      26,
-      25,
-      24,
-      23,
-      22,
-      21,
-      20,
-      19,
-      18,
-      17,
-      16,
-      15,
-      14,
-      13,
-      12,
-      11,
-      10,
-      9,
-      8,
-      7,
-      6,
-      5,
-      4,
-      3,
-      2,
-      1,
-      0);
-#  endif
-  return __cmp_res;
-}
-
 template <class _Iter>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter>
 __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
@@ -184,7 +105,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
 
       for (size_t __i = 0; __i != __unroll_count; ++__i) {
         auto __cmp_res = __lhs[__i] == __rhs[__i];
+#  if defined(_LIBCPP_BIG_ENDIAN)
         __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
+#  endif
         if (!std::__all_of(__cmp_res)) {
           auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
           return {__first1 + __offset, __first2 + __offset};
@@ -198,7 +121,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
     // check the remaining 0-3 vectors
     while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
       auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
+#  if defined(_LIBCPP_BIG_ENDIAN)
       __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
+#  endif
       if (!std::__all_of(__cmp_res)) {
         auto __offset = std::__find_first_not_set(__cmp_res);
         return {__first1 + __offset, __first2 + __offset};
@@ -216,7 +141,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       __first1 = __last1 - __vec_size;
       __first2 = __last2 - __vec_size;
       auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
+#  if defined(_LIBCPP_BIG_ENDIAN)
       __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
+#  endif
       auto __offset  = std::__find_first_not_set(__cmp_res);
       return {__first1 + __offset, __first2 + __offset};
     } // else loop over the elements individually

>From 8cddb8352696412011d1a513cb86b01af925044f Mon Sep 17 00:00:00 2001
From: Zbigniew Sarbinowski <zibi at ca.ibm.com>
Date: Mon, 3 Jun 2024 18:43:05 +0000
Subject: [PATCH 6/8] fix formatting

---
 libcxx/include/__algorithm/mismatch.h | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index bde3010b0c457..bdd3314ed1ec5 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -63,15 +63,13 @@ __reverse_vector(__simd_vector<long long, _Np> __cmp_res) {
   }(make_index_sequence<_Np>{});
 }
 template <class _ValueType, size_t _Np>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<long, _Np>
-__reverse_vector(__simd_vector<long, _Np> __cmp_res) {
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<long, _Np> __reverse_vector(__simd_vector<long, _Np> __cmp_res) {
   return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
     return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
   }(make_index_sequence<_Np>{});
 }
 template <class _ValueType, size_t _Np>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<int, _Np>
-__reverse_vector(__simd_vector<int, _Np> __cmp_res) {
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<int, _Np> __reverse_vector(__simd_vector<int, _Np> __cmp_res) {
   return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
     return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
   }(make_index_sequence<_Np>{});
@@ -106,7 +104,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       for (size_t __i = 0; __i != __unroll_count; ++__i) {
         auto __cmp_res = __lhs[__i] == __rhs[__i];
 #  if defined(_LIBCPP_BIG_ENDIAN)
-        __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
+        __cmp_res = std::__reverse_vector<__value_type>(__cmp_res);
 #  endif
         if (!std::__all_of(__cmp_res)) {
           auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
@@ -122,7 +120,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
     while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
       auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
 #  if defined(_LIBCPP_BIG_ENDIAN)
-      __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
+      __cmp_res = std::__reverse_vector<__value_type>(__cmp_res);
 #  endif
       if (!std::__all_of(__cmp_res)) {
         auto __offset = std::__find_first_not_set(__cmp_res);
@@ -142,9 +140,9 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       __first2 = __last2 - __vec_size;
       auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
 #  if defined(_LIBCPP_BIG_ENDIAN)
-      __cmp_res      = std::__reverse_vector<__value_type>(__cmp_res);
+      __cmp_res = std::__reverse_vector<__value_type>(__cmp_res);
 #  endif
-      auto __offset  = std::__find_first_not_set(__cmp_res);
+      auto __offset = std::__find_first_not_set(__cmp_res);
       return {__first1 + __offset, __first2 + __offset};
     } // else loop over the elements individually
   }

>From 9bf257d9563cbe7b1d9e1b6a349e81d249baa150 Mon Sep 17 00:00:00 2001
From: Zbigniew Sarbinowski <zibi at ca.ibm.com>
Date: Mon, 3 Jun 2024 19:19:50 +0000
Subject: [PATCH 7/8] attempt to fix CI

---
 libcxx/include/__algorithm/mismatch.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index bdd3314ed1ec5..3ff0f59caec2b 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -22,6 +22,7 @@
 #include <__type_traits/is_constant_evaluated.h>
 #include <__type_traits/is_equality_comparable.h>
 #include <__type_traits/is_integral.h>
+#include <__utility/integer_sequence.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 #include <__utility/unreachable.h>

>From 5eba555bf6aaf042a9b3c7783bcce3efd9c04985 Mon Sep 17 00:00:00 2001
From: Zbigniew Sarbinowski <zibi at ca.ibm.com>
Date: Mon, 10 Jun 2024 13:32:52 +0000
Subject: [PATCH 8/8] Make __find_first_set endianness aware

---
 libcxx/include/__algorithm/mismatch.h   | 48 ++++---------------------
 libcxx/include/__algorithm/simd_utils.h | 11 +++++-
 2 files changed, 16 insertions(+), 43 deletions(-)

diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index 3ff0f59caec2b..632bec02406a4 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -22,7 +22,6 @@
 #include <__type_traits/is_constant_evaluated.h>
 #include <__type_traits/is_equality_comparable.h>
 #include <__type_traits/is_integral.h>
-#include <__utility/integer_sequence.h>
 #include <__utility/move.h>
 #include <__utility/pair.h>
 #include <__utility/unreachable.h>
@@ -56,32 +55,7 @@ __mismatch(_Iter1 __first1, _Sent1 __last1, _Iter2 __first2, _Pred& __pred, _Pro
 }
 
 #if _LIBCPP_VECTORIZE_ALGORITHMS
-template <class _ValueType, size_t _Np>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<long long, _Np>
-__reverse_vector(__simd_vector<long long, _Np> __cmp_res) {
-  return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
-    return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
-  }(make_index_sequence<_Np>{});
-}
-template <class _ValueType, size_t _Np>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<long, _Np> __reverse_vector(__simd_vector<long, _Np> __cmp_res) {
-  return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
-    return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
-  }(make_index_sequence<_Np>{});
-}
-template <class _ValueType, size_t _Np>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<int, _Np> __reverse_vector(__simd_vector<int, _Np> __cmp_res) {
-  return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
-    return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
-  }(make_index_sequence<_Np>{});
-}
-template <class _ValueType, size_t _Np>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI __simd_vector<_ValueType, _Np>
-__reverse_vector(__simd_vector<_ValueType, _Np> __cmp_res) {
-  return [&]<size_t... _Indices>(index_sequence<_Indices...>) {
-    return __builtin_shufflevector(__cmp_res, __cmp_res, (_Np - _Indices - 1)...);
-  }(make_index_sequence<_Np>{});
-}
+
 template <class _Iter>
 _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_Iter, _Iter>
 __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
@@ -103,11 +77,7 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
       }
 
       for (size_t __i = 0; __i != __unroll_count; ++__i) {
-        auto __cmp_res = __lhs[__i] == __rhs[__i];
-#  if defined(_LIBCPP_BIG_ENDIAN)
-        __cmp_res = std::__reverse_vector<__value_type>(__cmp_res);
-#  endif
-        if (!std::__all_of(__cmp_res)) {
+        if (auto __cmp_res = __lhs[__i] == __rhs[__i]; !std::__all_of(__cmp_res)) {
           auto __offset = __i * __vec_size + std::__find_first_not_set(__cmp_res);
           return {__first1 + __offset, __first2 + __offset};
         }
@@ -119,11 +89,8 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
 
     // check the remaining 0-3 vectors
     while (static_cast<size_t>(__last1 - __first1) >= __vec_size) {
-      auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
-#  if defined(_LIBCPP_BIG_ENDIAN)
-      __cmp_res = std::__reverse_vector<__value_type>(__cmp_res);
-#  endif
-      if (!std::__all_of(__cmp_res)) {
+      if (auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
+          !std::__all_of(__cmp_res)) {
         auto __offset = std::__find_first_not_set(__cmp_res);
         return {__first1 + __offset, __first2 + __offset};
       }
@@ -139,11 +106,8 @@ __mismatch_vectorized(_Iter __first1, _Iter __last1, _Iter __first2) {
     if (static_cast<size_t>(__first1 - __orig_first1) >= __vec_size) {
       __first1 = __last1 - __vec_size;
       __first2 = __last2 - __vec_size;
-      auto __cmp_res = std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2);
-#  if defined(_LIBCPP_BIG_ENDIAN)
-      __cmp_res = std::__reverse_vector<__value_type>(__cmp_res);
-#  endif
-      auto __offset = std::__find_first_not_set(__cmp_res);
+      auto __offset =
+          std::__find_first_not_set(std::__load_vector<__vec>(__first1) == std::__load_vector<__vec>(__first2));
       return {__first1 + __offset, __first2 + __offset};
     } // else loop over the elements individually
   }
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index aa4336a2214c8..fd1d3d439092b 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -11,7 +11,11 @@
 
 #include <__algorithm/min.h>
 #include <__bit/bit_cast.h>
-#include <__bit/countr.h>
+#if defined(_LIBCPP_BIG_ENDIAN)
+#  include <__bit/countl.h>
+#else
+#  include <__bit/countr.h>
+#endif
 #include <__config>
 #include <__type_traits/is_arithmetic.h>
 #include <__type_traits/is_same.h>
@@ -126,8 +130,13 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_T
 
   // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876
   auto __impl = [&]<class _MaskT>(_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+    return std::min<size_t>(
+        _Np, std::__countl_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
+#  else
     return std::min<size_t>(
         _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
+#  endif
   };
 
   if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) {



More information about the libcxx-commits mailing list