[libcxx-commits] [libcxx] [libc++] Vectorize mismatch (PR #73255)

Sat Dec 23 05:05:37 PST 2023

https://github.com/philnik777 updated https://github.com/llvm/llvm-project/pull/73255

>From 6a9f6de198bcc9f7da1be0e14c2bd448cf9c8831 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Sat, 23 Dec 2023 12:07:10 +0100
Subject: [PATCH 1/2] [libc++][NFC] Refactor <experimental/simd> a bit to
 simplify dependencies

---
 libcxx/include/CMakeLists.txt                 |  2 -
 libcxx/include/experimental/__simd/abi_tag.h  | 55 -------------------
 .../include/experimental/__simd/aligned_tag.h | 13 ++++-
 .../include/experimental/__simd/declaration.h | 52 +++++++++++++++++-
 .../__simd/internal_declaration.h             | 41 --------------
 libcxx/include/experimental/__simd/scalar.h   |  2 +-
 libcxx/include/experimental/__simd/simd.h     |  2 -
 .../include/experimental/__simd/simd_mask.h   |  2 -
 libcxx/include/experimental/__simd/traits.h   | 15 +----
 libcxx/include/experimental/__simd/vec_ext.h  |  2 +-
 libcxx/include/experimental/simd              |  1 -
 libcxx/include/module.modulemap.in            |  2 -
 12 files changed, 66 insertions(+), 123 deletions(-)
 delete mode 100644 libcxx/include/experimental/__simd/abi_tag.h
 delete mode 100644 libcxx/include/experimental/__simd/internal_declaration.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 746d5812fba048..0fe3ab44d2466e 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -914,10 +914,8 @@ set(files
   expected
   experimental/__config
   experimental/__memory
-  experimental/__simd/abi_tag.h
   experimental/__simd/aligned_tag.h
   experimental/__simd/declaration.h
-  experimental/__simd/internal_declaration.h
   experimental/__simd/reference.h
   experimental/__simd/scalar.h
   experimental/__simd/simd.h
diff --git a/libcxx/include/experimental/__simd/abi_tag.h b/libcxx/include/experimental/__simd/abi_tag.h
deleted file mode 100644
index cec5be65ce5c21..00000000000000
--- a/libcxx/include/experimental/__simd/abi_tag.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H
-#define _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H
-
-#include <cstddef>
-#include <experimental/__config>
-#include <experimental/__simd/internal_declaration.h>
-
-#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
-
-_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
-inline namespace parallelism_v2 {
-namespace simd_abi {
-
-using scalar = __scalar;
-
-// TODO: make this platform dependent
-template <int _Np>
-using fixed_size = __vec_ext<_Np>;
-
-template <class _Tp>
-inline constexpr int max_fixed_size = 32;
-
-// TODO: make this platform dependent
-template <class _Tp>
-using compatible = __vec_ext<16 / sizeof(_Tp)>;
-
-// TODO: make this platform dependent
-template <class _Tp>
-using native = __vec_ext<_LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>;
-
-// TODO: make this platform dependent
-template <class _Tp, size_t _Np, class... _Abis>
-struct deduce {
-  using type = fixed_size<_Np>;
-};
-
-// TODO: make this platform dependent
-template <class _Tp, size_t _Np, class... _Abis>
-using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type;
-
-} // namespace simd_abi
-} // namespace parallelism_v2
-_LIBCPP_END_NAMESPACE_EXPERIMENTAL
-
-#endif // _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
-#endif // _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H
diff --git a/libcxx/include/experimental/__simd/aligned_tag.h b/libcxx/include/experimental/__simd/aligned_tag.h
index d216a21c073f3a..edbb3b24931f5a 100644
--- a/libcxx/include/experimental/__simd/aligned_tag.h
+++ b/libcxx/include/experimental/__simd/aligned_tag.h
@@ -10,10 +10,10 @@
 #ifndef _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H
 #define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H
 
-#include <__bit/bit_ceil.h>
 #include <__memory/assume_aligned.h>
 #include <cstddef>
 #include <experimental/__config>
+#include <experimental/__simd/traits.h>
 
 #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
 
@@ -30,9 +30,12 @@ struct element_aligned_tag {
   }
 };
 
+template <>
+inline constexpr bool is_simd_flag_type_v<element_aligned_tag> = true;
+
 struct vector_aligned_tag {
   template <class _Tp, class _Up = typename _Tp::value_type>
-  static constexpr size_t __alignment = std::__bit_ceil(sizeof(_Up) * _Tp::size());
+  static constexpr size_t __alignment = memory_alignment_v<_Tp, _Up>;
 
   template <class _Tp, class _Up>
   static _LIBCPP_HIDE_FROM_ABI constexpr _Up* __apply(_Up* __ptr) {
@@ -40,6 +43,9 @@ struct vector_aligned_tag {
   }
 };
 
+template <>
+inline constexpr bool is_simd_flag_type_v<vector_aligned_tag> = true;
+
 template <size_t _Np>
 struct overaligned_tag {
   template <class _Tp, class _Up = typename _Tp::value_type>
@@ -51,6 +57,9 @@ struct overaligned_tag {
   }
 };
 
+template <size_t _Np>
+inline constexpr bool is_simd_flag_type_v<overaligned_tag<_Np>> = true;
+
 inline constexpr element_aligned_tag element_aligned{};
 
 inline constexpr vector_aligned_tag vector_aligned{};
diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h
index 065faeaec3841f..7b45d035c27121 100644
--- a/libcxx/include/experimental/__simd/declaration.h
+++ b/libcxx/include/experimental/__simd/declaration.h
@@ -10,13 +10,63 @@
 #ifndef _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H
 #define _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H
 
+#include <cstddef>
 #include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
 
 #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
 
 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
 inline namespace parallelism_v2 {
+namespace simd_abi {
+template <int>
+struct __vec_ext;
+struct __scalar;
+
+using scalar = __scalar;
+
+// TODO: make this platform dependent
+template <int _Np>
+using fixed_size = __vec_ext<_Np>;
+
+template <class _Tp>
+inline constexpr int max_fixed_size = 32;
+
+// TODO: make this platform dependent
+template <class _Tp>
+using compatible = __vec_ext<16 / sizeof(_Tp)>;
+
+// TODO: make this platform dependent
+template <class _Tp>
+using native = __vec_ext<_LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>;
+
+// TODO: make this platform dependent
+template <class _Tp, size_t _Np, class... _Abis>
+struct deduce {
+  using type = fixed_size<_Np>;
+};
+
+// TODO: make this platform dependent
+template <class _Tp, size_t _Np, class... _Abis>
+using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type;
+
+} // namespace simd_abi
+
+template <class _Tp, class _Abi>
+struct __simd_storage;
+
+template <class _Tp, class _Abi>
+struct __mask_storage;
+
+template <class _Tp, class _Abi>
+struct __simd_operations;
+
+template <class _Tp, class _Abi>
+struct __mask_operations;
+
+struct element_aligned_tag;
+struct vector_aligned_tag;
+template <size_t>
+struct overaligned_tag;
 
 template <class _Tp, class _Abi = simd_abi::compatible<_Tp>>
 class simd;
diff --git a/libcxx/include/experimental/__simd/internal_declaration.h b/libcxx/include/experimental/__simd/internal_declaration.h
deleted file mode 100644
index 9ad1ad1ae3192f..00000000000000
--- a/libcxx/include/experimental/__simd/internal_declaration.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// -*- C++ -*-
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H
-#define _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H
-
-#include <experimental/__config>
-
-#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
-
-_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
-inline namespace parallelism_v2 {
-namespace simd_abi {
-template <int>
-struct __vec_ext;
-struct __scalar;
-} // namespace simd_abi
-
-template <class _Tp, class _Abi>
-struct __simd_storage;
-
-template <class _Tp, class _Abi>
-struct __mask_storage;
-
-template <class _Tp, class _Abi>
-struct __simd_operations;
-
-template <class _Tp, class _Abi>
-struct __mask_operations;
-
-} // namespace parallelism_v2
-_LIBCPP_END_NAMESPACE_EXPERIMENTAL
-
-#endif // _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
-#endif // _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H
diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h
index 53fa1c29f374ca..5eeff4c1e82a38 100644
--- a/libcxx/include/experimental/__simd/scalar.h
+++ b/libcxx/include/experimental/__simd/scalar.h
@@ -12,7 +12,7 @@
 
 #include <cstddef>
 #include <experimental/__config>
-#include <experimental/__simd/internal_declaration.h>
+#include <experimental/__simd/declaration.h>
 #include <experimental/__simd/traits.h>
 
 #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index ffb328eb345b1a..c345811fee7fc7 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -15,9 +15,7 @@
 #include <__utility/forward.h>
 #include <cstddef>
 #include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
 #include <experimental/__simd/declaration.h>
-#include <experimental/__simd/internal_declaration.h>
 #include <experimental/__simd/reference.h>
 #include <experimental/__simd/traits.h>
 #include <experimental/__simd/utility.h>
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index 325b8409e3b6d2..db03843b46e3ad 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -13,9 +13,7 @@
 #include <__type_traits/is_same.h>
 #include <cstddef>
 #include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
 #include <experimental/__simd/declaration.h>
-#include <experimental/__simd/internal_declaration.h>
 #include <experimental/__simd/reference.h>
 #include <experimental/__simd/traits.h>
 
diff --git a/libcxx/include/experimental/__simd/traits.h b/libcxx/include/experimental/__simd/traits.h
index 9b4abe9d0c232e..ec25b4bfa7f95e 100644
--- a/libcxx/include/experimental/__simd/traits.h
+++ b/libcxx/include/experimental/__simd/traits.h
@@ -10,14 +10,12 @@
 #ifndef _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H
 #define _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H
 
+#include <__bit/bit_ceil.h>
 #include <__type_traits/integral_constant.h>
 #include <__type_traits/is_same.h>
 #include <cstddef>
 #include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
-#include <experimental/__simd/aligned_tag.h>
 #include <experimental/__simd/declaration.h>
-#include <experimental/__simd/internal_declaration.h>
 #include <experimental/__simd/utility.h>
 
 #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
@@ -47,15 +45,6 @@ struct is_simd_mask : bool_constant<is_simd_mask_v<_Tp>> {};
 template <class _Tp>
 inline constexpr bool is_simd_flag_type_v = false;
 
-template <>
-inline constexpr bool is_simd_flag_type_v<element_aligned_tag> = true;
-
-template <>
-inline constexpr bool is_simd_flag_type_v<vector_aligned_tag> = true;
-
-template <size_t _Np>
-inline constexpr bool is_simd_flag_type_v<overaligned_tag<_Np>> = true;
-
 template <class _Tp>
 struct is_simd_flag_type : bool_constant<is_simd_flag_type_v<_Tp>> {};
 
@@ -71,7 +60,7 @@ inline constexpr size_t simd_size_v = simd_size<_Tp, _Abi>::value;
 template <class _Tp,
           class _Up = typename _Tp::value_type,
           bool      = (is_simd_v<_Tp> && __is_vectorizable_v<_Up>) || (is_simd_mask_v<_Tp> && is_same_v<_Up, bool>)>
-struct memory_alignment : integral_constant<size_t, vector_aligned_tag::__alignment<_Tp, _Up>> {};
+struct memory_alignment : integral_constant<size_t, std::__bit_ceil(sizeof(_Up) * _Tp::size())> {};
 
 template <class _Tp, class _Up>
 struct memory_alignment<_Tp, _Up, false> {};
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index 56a0b888104bfa..07ba032f493b1e 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -15,7 +15,7 @@
 #include <__utility/integer_sequence.h>
 #include <cstddef>
 #include <experimental/__config>
-#include <experimental/__simd/internal_declaration.h>
+#include <experimental/__simd/declaration.h>
 #include <experimental/__simd/traits.h>
 #include <experimental/__simd/utility.h>
 
diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd
index 56858832857c17..adca9faa47bb06 100644
--- a/libcxx/include/experimental/simd
+++ b/libcxx/include/experimental/simd
@@ -78,7 +78,6 @@ inline namespace parallelism_v2 {
 #endif
 
 #include <experimental/__config>
-#include <experimental/__simd/abi_tag.h>
 #include <experimental/__simd/aligned_tag.h>
 #include <experimental/__simd/declaration.h>
 #include <experimental/__simd/scalar.h>
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index a37e96205cf2e0..d10670d4faaffc 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -530,10 +530,8 @@ module std_experimental [system] {
     export *
   }
   module simd {
-    module abi_tag              { private header "experimental/__simd/abi_tag.h" }
     module aligned_tag          { private header "experimental/__simd/aligned_tag.h" }
     module declaration          { private header "experimental/__simd/declaration.h" }
-    module internal_declaration { private header "experimental/__simd/internal_declaration.h" }
     module reference            { private header "experimental/__simd/reference.h" }
     module scalar               { private header "experimental/__simd/scalar.h" }
     module simd                 { private header "experimental/__simd/simd.h" }

>From 5251bb216104da9cf7cb69e850613e874163d4b5 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Sun, 8 Oct 2023 12:45:40 +0200
Subject: [PATCH 2/2] [libc++] Vectorize mismatch

---
 .../benchmarks/algorithms/mismatch.bench.cpp  |  31 ++
 libcxx/include/CMakeLists.txt                 |   1 +
 libcxx/include/__algorithm/mismatch.h         | 158 +++++++++-
 libcxx/include/__algorithm/vectorization.h    |  78 +++++
 libcxx/include/__bit/has_single_bit.h         |  15 +-
 libcxx/include/__utility/align_down.h         |  31 ++
 libcxx/include/experimental/__simd/avx512.h   |  93 ++++++
 .../include/experimental/__simd/declaration.h |  13 +
 .../experimental/__simd/feature_traits.h      | 298 ++++++++++++++++++
 libcxx/include/experimental/__simd/simd.h     |  73 +++++
 .../include/experimental/__simd/simd_mask.h   |  26 ++
 libcxx/include/experimental/__simd/vec_ext.h  |  33 ++
 libcxx/src/memory_resource.cpp                |   3 +-
 .../mismatch/mismatch.pass.cpp                |  77 ++---
 14 files changed, 863 insertions(+), 67 deletions(-)
 create mode 100644 libcxx/benchmarks/algorithms/mismatch.bench.cpp
 create mode 100644 libcxx/include/__algorithm/vectorization.h
 create mode 100644 libcxx/include/__utility/align_down.h
 create mode 100644 libcxx/include/experimental/__simd/avx512.h
 create mode 100644 libcxx/include/experimental/__simd/feature_traits.h

diff --git a/libcxx/benchmarks/algorithms/mismatch.bench.cpp b/libcxx/benchmarks/algorithms/mismatch.bench.cpp
new file mode 100644
index 00000000000000..3cce0c108ee1ec
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/mismatch.bench.cpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <random>
+
+template <class T>
+static void bm_find(benchmark::State& state) {
+  // Two equal buffers of '1's; every iteration plants one differing element at a random index.
+  std::mt19937_64 engine(std::random_device{}());
+  std::vector<T> lhs(state.range(), '1');
+  std::vector<T> rhs(state.range(), '1');
+  for (auto _ : state) {
+    const auto pos = engine() % lhs.size();
+    lhs[pos]       = '2';
+    benchmark::DoNotOptimize(lhs);
+    benchmark::DoNotOptimize(std::mismatch(lhs.begin(), lhs.end(), rhs.begin()));
+    lhs[pos] = '1'; // restore, so both buffers are equal again before the next iteration
+  }
+}
+BENCHMARK(bm_find<char>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_find<short>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_find<int>)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 0fe3ab44d2466e..791d1386c0e141 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -229,6 +229,7 @@ set(files
   __algorithm/unwrap_iter.h
   __algorithm/unwrap_range.h
   __algorithm/upper_bound.h
+  __algorithm/vectorization.h
   __assert
   __atomic/aliases.h
   __atomic/atomic.h
diff --git a/libcxx/include/__algorithm/mismatch.h b/libcxx/include/__algorithm/mismatch.h
index d345b6048a7e9b..20e59e805d7f03 100644
--- a/libcxx/include/__algorithm/mismatch.h
+++ b/libcxx/include/__algorithm/mismatch.h
@@ -11,9 +11,19 @@
 #define _LIBCPP___ALGORITHM_MISMATCH_H
 
 #include <__algorithm/comp.h>
+#include <__algorithm/unwrap_iter.h>
+#include <__algorithm/vectorization.h>
 #include <__config>
+#include <__functional/identity.h>
 #include <__iterator/iterator_traits.h>
+#include <__type_traits/invoke.h>
+#include <__type_traits/is_equality_comparable.h>
+#include <__utility/align_down.h>
+#include <__utility/move.h>
 #include <__utility/pair.h>
+#include <experimental/__simd/feature_traits.h>
+#include <experimental/__simd/simd.h>
+#include <experimental/__simd/simd_mask.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -21,13 +31,151 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _InIter1, class _Sent1, class _InIter2, class _Pred, class _Proj1, class _Proj2>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter1, _InIter2>
+__mismatch_loop(_InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
+  // Element-wise scan: step both iterators together until __pred rejects the projected pair or __last1 is hit.
+  for (; __first1 != __last1; ++__first1, (void)++__first2) {
+    if (!std::__invoke(__pred, std::__invoke(__proj1, *__first1), std::__invoke(__proj2, *__first2)))
+      break;
+  }
+  // Both iterators now designate the first rejected pair (or the end of the first range).
+  return {std::move(__first1), std::move(__first2)};
+}
+
+#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
+template <class _Tp>
+struct __mismatch_vector_impl { // building blocks (prologue / unrolled loop / epilogue) of the vectorized mismatch
+  template <bool _VectorizeFloatingPoint>
+  static constexpr bool __can_vectorize =
+      (__libcpp_is_trivially_equality_comparable<_Tp, _Tp>::value && __fits_in_vector<_Tp> &&
+       alignof(_Tp) >= alignof(__get_arithmetic_type<_Tp>)) ||
+      (_VectorizeFloatingPoint && is_floating_point_v<_Tp>);
+  // Vector type over the arithmetic stand-in for _Tp, its mask helpers, and the loop unroll factor.
+  using __vec         = __arithmetic_vec<_Tp>;
+  using __mask_traits = experimental::__mask_traits<typename __vec::value_type, typename __vec::abi_type>;
+  static constexpr size_t __unroll_count = 4;
+  // Outcome of one stage: the positions reached and whether the overall search is already decided.
+  struct __result {
+    _Tp* __iter1;
+    _Tp* __iter2;
+    bool __matched;
+  };
+  // Masked compare of the leading, partially covered vector. Not called at the moment: its call site is commented out.
+  _LIBCPP_HIDE_FROM_ABI static __result __prologue(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
+    if constexpr (__mask_traits::__has_maskload) {
+      auto __first_aligned = std::__align_down(__vec::size(), __first1); // NOTE(review): lane count used as alignment - confirm
+      auto __offset        = __first1 - __first_aligned;
+      auto __checked_size  = __vec::size() - __offset;
+      if (__checked_size < __last1 - __first1) // NOTE(review): bails out when the range is LONGER than the chunk - confirm
+        return {__first1, __first2, false};
+      auto __second_aligned = __first2 - __offset;
+      auto __mask           = __mask_traits::__mask_with_last_enabled(__checked_size);
+      __vec __lhs =
+          __mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first_aligned), __mask);
+      __vec __rhs =
+          __mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__second_aligned), __mask);
+      auto __res = __mask_traits::__mask_cmp_eq(__mask, __lhs, __rhs);
+      // An enabled lane whose bit is missing from __res is treated as a mismatching element.
+      if ((__res.__get_data().__mask_ & __mask.__get_data().__mask_) != __mask.__get_data().__mask_) {
+        auto __match_offset = experimental::find_first_set(decltype(__mask){
+            experimental::__from_storage, {decltype(__res.__get_data().__mask_)(~__res.__get_data().__mask_)}});
+        return {__first_aligned + __match_offset, __second_aligned + __match_offset, true};
+      }
+      return {__first_aligned + __vec::size(), __second_aligned + __vec::size(), false};
+    } else {
+      return {__first1, __first2, false};
+    }
+  }
+  // Compares __unroll_count whole vectors per iteration; stops once fewer elements than that remain.
+  _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __result __loop(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
+    while (__last1 - __first1 >= __unroll_count * __vec::size()) {
+      __vec __lhs[__unroll_count];
+      __vec __rhs[__unroll_count];
+
+      for (size_t __i = 0; __i != __unroll_count; ++__i) {
+        __lhs[__i] = std::__load_as_arithmetic(__first1 + __i * __vec::size());
+        __rhs[__i] = std::__load_as_arithmetic(__first2 + __i * __vec::size());
+      }
+
+      for (size_t __i = 0; __i != __unroll_count; ++__i) {
+        if (auto __res = __lhs[__i] == __rhs[__i]; !experimental::all_of(__res)) {
+          auto __offset = __i * __vec::size() + experimental::find_first_set(__res);
+          return {__first1 + __offset, __first2 + __offset, true};
+        }
+      }
+
+      __first1 += __unroll_count * __vec::size();
+      __first2 += __unroll_count * __vec::size();
+    }
+    return {__first1, __first2, __first1 == __last1};
+  }
+  // Handles the leftover elements: one masked vector compare when mask loads exist, otherwise the scalar loop.
+  _LIBCPP_HIDE_FROM_ABI static pair<_Tp*, _Tp*> __epilogue(_Tp* __first1, _Tp* __last1, _Tp* __first2) {
+    if constexpr (__mask_traits::__has_maskload) {
+      auto __size = __last1 - __first1; // NOTE(review): may exceed __vec::size() right after __loop - confirm the mask helper copes
+      auto __mask = __mask_traits::__mask_with_first_enabled(__size);
+      __vec __lhs =
+          __mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first1), __mask);
+      __vec __rhs =
+          __mask_traits::__maskload_unaligned(reinterpret_cast<typename __vec::value_type*>(__first2), __mask);
+      auto __res = __mask_traits::__mask_cmp_eq(__mask, __lhs, __rhs);
+      auto __mismatches = decltype(__res.__get_data().__mask_)(~__res.__get_data().__mask_ & __mask.__get_data().__mask_); // enabled lanes that differ
+      if (__mismatches != 0) {
+        auto __offset = experimental::find_first_set(decltype(__mask){experimental::__from_storage, {__mismatches}});
+        return {__first1 + __offset, __first2 + __offset};
+      }
+      return {__first1 + __size, __first2 + __size};
+    } else {
+      return std::__mismatch_loop(__first1, __last1, __first2, __equal_to(), __identity(), __identity());
+    }
+  }
+};
+#endif // _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
+
+template <class _InIter1, class _Sent1, class _InIter2, class _Pred, class _Proj1, class _Proj2>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter1, _InIter2>
+__mismatch(_InIter1 __first1, _Sent1 __last1, _InIter2 __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
+  return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2); // generic overload: plain element-wise loop
+}
+
+#if _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS
+template <
+    class _Tp,
+    class _Pred,
+    class _Proj1,
+    class _Proj2,
+    enable_if_t<
+        __desugars_to<__equal_tag, _Pred, _Tp, _Tp>::value && __is_identity<_Proj1>::value &&
+            __is_identity<_Proj2>::value &&
+            __mismatch_vector_impl<_Tp>::template __can_vectorize<_LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS>,
+        int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI inline constexpr pair<_Tp*, _Tp*>
+__mismatch(_Tp* __first1, _Tp* __last1, _Tp* __first2, _Pred __pred, _Proj1 __proj1, _Proj2 __proj2) {
+  if (__libcpp_is_constant_evaluated())
+    return std::__mismatch_loop(__first1, __last1, __first2, __pred, __proj1, __proj2);
+  // Run-time path for raw pointers: compare whole vectors first, then finish the remaining tail.
+  using __impl = __mismatch_vector_impl<_Tp>;
+
+  // auto [__piter1, __piter2, __pmatch] = __impl::__prologue(__first1, __last1, __first2);
+  // if (__pmatch)
+  //   return {__piter1, __piter2};
+  // __matched is set when the loop found a mismatch or consumed the whole range; the iterators are final then.
+  auto [__iter1, __iter2, __matched] = __impl::__loop(__first1, __last1, __first2);
+  if (__matched)
+    return {__iter1, __iter2};
+  // Continue from where the vector loop stopped; restarting at __first1 would re-examine the whole range.
+  return __impl::__epilogue(__iter1, __last1, __iter2);
+}
+#endif // _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS
+
 template <class _InputIterator1, class _InputIterator2, class _BinaryPredicate>
-_LIBCPP_NODISCARD_EXT inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InputIterator1, _InputIterator2>
 mismatch(_InputIterator1 __first1, _InputIterator1 __last1, _InputIterator2 __first2, _BinaryPredicate __pred) {
-  for (; __first1 != __last1; ++__first1, (void)++__first2)
-    if (!__pred(*__first1, *__first2))
-      break;
-  return pair<_InputIterator1, _InputIterator2>(__first1, __first2);
+  __identity __proj;
+  auto __res = std::__mismatch(
+      std::__unwrap_iter(__first1), std::__unwrap_iter(__last1), std::__unwrap_iter(__first2), __pred, __proj, __proj);
+  return std::make_pair(std::__rewrap_iter(__first1, __res.first), std::__rewrap_iter(__first2, __res.second));
 }
 
 template <class _InputIterator1, class _InputIterator2>
diff --git a/libcxx/include/__algorithm/vectorization.h b/libcxx/include/__algorithm/vectorization.h
new file mode 100644
index 00000000000000..f2b139ff4a1ffe
--- /dev/null
+++ b/libcxx/include/__algorithm/vectorization.h
@@ -0,0 +1,78 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_VECTORIZATION_H
+#define _LIBCPP___ALGORITHM_VECTORIZATION_H
+
+#include <__config>
+#include <__type_traits/is_floating_point.h>
+#include <__utility/integer_sequence.h>
+#include <experimental/__simd/simd.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
+#  define _LIBCPP_CAN_VECTORIZE_ALGORIHTMS 1
+#else
+#  define _LIBCPP_CAN_VECTORIZE_ALGORIHTMS 0
+#endif
+
+#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS && !defined(__OPTIMIZE_SIZE__)
+#  define _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS 1
+#else
+#  define _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS 0
+#endif
+
+#if _LIBCPP_VECTORIZE_CLASSIC_ALGORITHMS && defined(__FAST_MATH__)
+#  define _LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS 1
+#else
+#  define _LIBCPP_VECTORIZE_FLOATING_POINT_CLASSIC_ALGORITHMS 0
+#endif
+
+#if _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _Tp>
+inline constexpr bool __fits_in_vector = // true when sizeof(_Tp) is 1, 2, 4 or 8 bytes
+    sizeof(_Tp) == 1 || sizeof(_Tp) == 2 || sizeof(_Tp) == 4 || sizeof(_Tp) == 8;
+
+template <class _Tp>
+_LIBCPP_HIDE_FROM_ABI constexpr auto __get_arithmetic_type_impl() { // only its return type is used (via decltype)
+  if constexpr (is_floating_point_v<_Tp>)
+    return _Tp{};
+  else if constexpr (constexpr auto __sz = sizeof(_Tp); __sz == 1)
+    return uint8_t{};
+  else if constexpr (__sz == 2)
+    return uint16_t{};
+  else if constexpr (__sz == 4)
+    return uint32_t{};
+  else if constexpr (__sz == 8)
+    return uint64_t{};
+  else
+    static_assert(sizeof(_Tp) == 0, "unexpected sizeof type"); // type-dependent: a literal `false` is ill-formed pre-C++23
+}
+
+template <class _Tp>
+using __get_arithmetic_type = decltype(__get_arithmetic_type_impl<_Tp>());
+
+template <class _Tp>
+using __arithmetic_vec = experimental::native_simd<__get_arithmetic_type<_Tp>>;
+
+template <class _Tp>
+_LIBCPP_HIDE_FROM_ABI __arithmetic_vec<_Tp> __load_as_arithmetic(_Tp* __values) { // builds a vector from __values viewed as the arithmetic stand-in type
+  return {reinterpret_cast<__get_arithmetic_type<_Tp>*>(__values), 0}; // NOTE(review): `0` is presumably the load-flags argument - confirm
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_CAN_VECTORIZE_ALGORIHTMS
+
+#endif // _LIBCPP___ALGORITHM_VECTORIZATION_H
diff --git a/libcxx/include/__bit/has_single_bit.h b/libcxx/include/__bit/has_single_bit.h
index a4e178060a73a3..bc75158206829c 100644
--- a/libcxx/include/__bit/has_single_bit.h
+++ b/libcxx/include/__bit/has_single_bit.h
@@ -19,19 +19,24 @@
 _LIBCPP_PUSH_MACROS
 #include <__undef_macros>
 
-#if _LIBCPP_STD_VER >= 20
-
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template <__libcpp_unsigned_integer _Tp>
-_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr bool has_single_bit(_Tp __t) noexcept {
+template <class _Tp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI constexpr bool __has_single_bit(_Tp __t) noexcept {
   return __t != 0 && (((__t & (__t - 1)) == 0));
 }
 
-_LIBCPP_END_NAMESPACE_STD
+#if _LIBCPP_STD_VER >= 20
+
+template <__libcpp_unsigned_integer _Tp>
+_LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr bool has_single_bit(_Tp __t) noexcept {
+  return std::__has_single_bit(__t);
+}
 
 #endif // _LIBCPP_STD_VER >= 20
 
+_LIBCPP_END_NAMESPACE_STD
+
 _LIBCPP_POP_MACROS
 
 #endif // _LIBCPP___BIT_HAS_SINGLE_BIT_H
diff --git a/libcxx/include/__utility/align_down.h b/libcxx/include/__utility/align_down.h
new file mode 100644
index 00000000000000..d0bbfbb568ee20
--- /dev/null
+++ b/libcxx/include/__utility/align_down.h
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___UTILITY_ALIGN_DOWN_H
+#define _LIBCPP___UTILITY_ALIGN_DOWN_H
+
+#include <__config>
+#include <cstddef>
+#include <cstdint>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _Tp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI inline _Tp* __align_down(size_t __align, _Tp* __ptr) {
+  _LIBCPP_ASSERT_UNCATEGORIZED(__align >= alignof(_Tp), "Alignment has to be at least as large as the required alignment");
+  const size_t __low_bits = __align - 1; // address bits to clear (presumes __align is a power of two - not checked here)
+  return reinterpret_cast<_Tp*>(reinterpret_cast<uintptr_t>(__ptr) & ~__low_bits);
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___UTILITY_ALIGN_DOWN_H
diff --git a/libcxx/include/experimental/__simd/avx512.h b/libcxx/include/experimental/__simd/avx512.h
new file mode 100644
index 00000000000000..f8875b8aa8b206
--- /dev/null
+++ b/libcxx/include/experimental/__simd/avx512.h
@@ -0,0 +1,93 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_EXPERIMENTAL___SIMD_AVX512_H
+#define _LIBCPP_EXPERIMENTAL___SIMD_AVX512_H
+
+#include <__bit/bit_ceil.h>
+#include <experimental/__config>
+#include <experimental/__simd/declaration.h>
+#include <experimental/__simd/vec_ext.h>
+
+#if __has_include(<immintrin.h>)
+#  include <immintrin.h>
+#endif
+
+#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) && defined(__AVX512F__)
+
+_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
+inline namespace parallelism_v2 {
+namespace simd_abi {
+template <int _Np>
+struct __avx512 { // ABI tag; carries only the element count
+  static constexpr size_t __simd_size = _Np; // number of elements
+};
+
+template <class _Tp>
+inline constexpr bool __is_avx512_v = false; // primary template: any type that is not an __avx512 tag
+
+template <int _Np>
+inline constexpr bool __is_avx512_v<__avx512<_Np>> = true; // matches every __avx512<_Np>
+} // namespace simd_abi
+
+template <int _Np>
+inline constexpr bool is_abi_tag_v<simd_abi::__avx512<_Np>> = _Np > 0 && _Np <= 64; // 64 bits is the widest mask type used in this header (__mmask64)
+
+template <class _Tp, int _Np>
+struct __simd_storage<_Tp, simd_abi::__avx512<_Np>> : __simd_storage<_Tp, simd_abi::__vec_ext<_Np>> {}; // element storage is inherited from the __vec_ext ABI
+
+template <class _Tp, int _Np>
+struct __mask_storage<_Tp, simd_abi::__avx512<_Np>> { // one bit per element: bit i holds the value of element i
+  _LIBCPP_HIDE_FROM_ABI static constexpr auto __get_mask_t() { // selects the narrowest __mmaskN with at least _Np bits
+    if constexpr (_Np <= 8)
+      return __mmask8{};
+    else if constexpr (_Np <= 16)
+      return __mmask16{};
+    else if constexpr (_Np <= 32)
+      return __mmask32{};
+    else if constexpr (_Np <= 64)
+      return __mmask64{};
+    else
+      static_assert(_Np == -1, "Unexpected size");
+  }
+  decltype(__get_mask_t()) __mask_;
+  // All shifts are done in the mask's own type: shifting an int 1 is undefined/wrong for indices >= 31 (__mmask64).
+  _LIBCPP_HIDE_FROM_ABI bool __get(size_t __index) const noexcept { return (__mask_ >> __index) & 1; }
+  _LIBCPP_HIDE_FROM_ABI void __set(size_t __index, bool __value) noexcept {
+    if (__value)
+      __mask_ = static_cast<decltype(__mask_)>(__mask_ | (static_cast<decltype(__mask_)>(1) << __index));
+    else
+      __mask_ = static_cast<decltype(__mask_)>(__mask_ & ~(static_cast<decltype(__mask_)>(1) << __index));
+  }
+};
+
+template <class _Tp, int _Np>
+struct __simd_operations<_Tp, simd_abi::__avx512<_Np>> : __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> {}; // element operations are inherited from the __vec_ext ABI
+
+template <class _Tp, int _Np>
+struct __mask_operations<_Tp, simd_abi::__avx512<_Np>> {
+  using _MaskStorage = __mask_storage<_Tp, simd_abi::__avx512<_Np>>;
+  // Returns a mask whose low _Np bits are all set (__v == true) or all clear (__v == false).
+  _LIBCPP_HIDE_FROM_ABI static _MaskStorage __broadcast(bool __v) noexcept {
+    if (__v)
+      return {static_cast<decltype(_MaskStorage::__mask_)>(~0ull >> (64 - _Np))}; // _Np is in [1, 64], so the shift is in [0, 63]
+    else
+      return {0};
+  }
+  // True when every one of the _Np element bits is set; bits above _Np are expected to be zero.
+  _LIBCPP_HIDE_FROM_ABI static bool all_of(_MaskStorage __mask) noexcept {
+    return __mask.__mask_ == __broadcast(true).__mask_;
+  }
+};
+} // namespace parallelism_v2
+
+_LIBCPP_END_NAMESPACE_EXPERIMENTAL
+
+#endif // _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) && defined(__AVX512F__)
+
+#endif // _LIBCPP_EXPERIMENTAL___SIMD_AVX512_H
diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h
index 7b45d035c27121..aa87d75738044d 100644
--- a/libcxx/include/experimental/__simd/declaration.h
+++ b/libcxx/include/experimental/__simd/declaration.h
@@ -18,6 +18,11 @@
 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
 inline namespace parallelism_v2 {
 namespace simd_abi {
+#ifdef __AVX512F__
+template <int>
+struct __avx512; // forward declaration only; the definition lives in <experimental/__simd/avx512.h>
+#endif
+
 template <int>
 struct __vec_ext;
 struct __scalar;
@@ -36,8 +41,13 @@ template <class _Tp>
 using compatible = __vec_ext<16 / sizeof(_Tp)>;
 
 // TODO: make this platform dependent
+#ifdef __AVX512F__
+template <class _Tp>
+using native = __avx512<64 / sizeof(_Tp)>; // as many _Tp elements as fit in 64 bytes
+#else
 template <class _Tp>
 using native = __vec_ext<_LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>;
+#endif
 
 // TODO: make this platform dependent
 template <class _Tp, size_t _Np, class... _Abis>
@@ -51,6 +61,9 @@ using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type;
 
 } // namespace simd_abi
 
+struct __from_storage_t {}; // empty tag type; used to select constructors that take a ready-made storage object
+inline constexpr __from_storage_t __from_storage;
+
 template <class _Tp, class _Abi>
 struct __simd_storage;
 
diff --git a/libcxx/include/experimental/__simd/feature_traits.h b/libcxx/include/experimental/__simd/feature_traits.h
new file mode 100644
index 00000000000000..6d96bf3856bde1
--- /dev/null
+++ b/libcxx/include/experimental/__simd/feature_traits.h
@@ -0,0 +1,298 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_EXPERIMENTAL___SIMD_FEATURE_TRAITS_H
+#define _LIBCPP_EXPERIMENTAL___SIMD_FEATURE_TRAITS_H
+
+#include <__bit/has_single_bit.h>
+#include <__config>
+#include <__memory/assume_aligned.h>
+#include <experimental/__simd/declaration.h>
+#include <experimental/__simd/vec_ext.h>
+
+#ifdef __AVX512F__
+#  include <immintrin.h>
+#endif
+
+// The intrinsics cannot be portably qualified. This isn't super problematic, since we're only dealing with builtin
+// types anyways.
+// NOLINTBEGIN(libcpp-robust-against-adl)
+
+#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
+
+_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
+inline namespace parallelism_v2 {
+
+template <class _Tp, class _Abi, class = void>
+struct __mask_traits { // primary template: reports no masked load/store support
+  static constexpr bool __has_maskload  = false;
+  static constexpr bool __has_maskstore = false;
+};
+
+template <uint64_t __base_pattern>
+_LIBCPP_HIDE_FROM_ABI uint64_t __set_least_significant_bits(size_t __count) noexcept {
+  // Keep the lowest __count bits of __base_pattern. A shift by 64 is undefined, so __count >= 64 keeps every bit.
+  uint64_t __low_bits = __count >= 64 ? ~uint64_t(0) : (uint64_t(1) << __count) - 1;
+  return __base_pattern & __low_bits;
+}
+
+template <uint64_t __base_pattern>
+_LIBCPP_HIDE_FROM_ABI uint64_t __set_most_significant_bits(size_t __count) noexcept {
+  // Keep the highest __count bits of __base_pattern, which is expected to be a contiguous run of low bits.
+  constexpr size_t __width = __builtin_popcountll(__base_pattern); // width of the pattern in bits
+  size_t __shift           = __count < __width ? __width - __count : 0; // over-long requests keep the whole pattern
+  return __count == 0 ? 0 : (__base_pattern << __shift) & __base_pattern; // a zero count must not shift by __width
+}
+
+#  ifdef __AVX512F__
+
+template <class _Tp, int _Np> // _Np is an int to match simd_abi::__avx512<int>; with size_t it cannot be deduced
+struct __mask_traits<_Tp, simd_abi::__avx512<_Np>, enable_if_t<is_integral_v<_Tp>>> { // integral element types only
+private:
+  static constexpr size_t __element_count = _Np;
+  static constexpr size_t __element_size  = sizeof(_Tp);
+
+  using __simd_t = simd<_Tp, simd_abi::__avx512<_Np>>;
+  using __mask_t = simd_mask<_Tp, simd_abi::__avx512<_Np>>;
+
+  using __storage_t [[__gnu__::__vector_size__(_Np * sizeof(_Tp))]] = _Tp;
+
+public:
+#    ifdef __AVX512VL__ // NOTE(review): the 8- and 16-bit intrinsics below presumably also need AVX512BW -- confirm
+  static constexpr bool __has_maskload  = std::__has_single_bit(__element_count);
+  static constexpr bool __has_maskstore = __has_maskload;
+  // Masked unaligned load; elements whose mask bit is clear are zeroed. Unsupported sizes return a default simd.
+  static _LIBCPP_HIDE_FROM_ABI __simd_t __maskload_unaligned(const _Tp* __ptr, __mask_t __mask_wrapped) {
+    if constexpr (!__has_maskload) {
+      return {};
+    } else {
+      __storage_t __data = [&] {
+        auto __mask = __mask_wrapped.__get_data().__mask_;
+
+        if constexpr (__element_size == 1) {
+          if constexpr (__element_count == 16) {
+            return _mm_maskz_loadu_epi8(__mask, __ptr);
+          } else if constexpr (__element_count == 32) {
+            return _mm256_maskz_loadu_epi8(__mask, __ptr);
+          } else if constexpr (__element_count == 64) {
+            return _mm512_maskz_loadu_epi8(__mask, __ptr);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 2) {
+          if constexpr (__element_count == 8) {
+            return _mm_maskz_loadu_epi16(__mask, __ptr);
+          } else if constexpr (__element_count == 16) {
+            return _mm256_maskz_loadu_epi16(__mask, __ptr);
+          } else if constexpr (__element_count == 32) {
+            return _mm512_maskz_loadu_epi16(__mask, __ptr);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 4) {
+          if constexpr (__element_count == 4) {
+            return _mm_maskz_loadu_epi32(__mask, __ptr);
+          } else if constexpr (__element_count == 8) {
+            return _mm256_maskz_loadu_epi32(__mask, __ptr);
+          } else if constexpr (__element_count == 16) {
+            return _mm512_maskz_loadu_epi32(__mask, __ptr);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 8) {
+          if constexpr (__element_count == 2) {
+            return _mm_maskz_loadu_epi64(__mask, __ptr);
+          } else if constexpr (__element_count == 4) {
+            return _mm256_maskz_loadu_epi64(__mask, __ptr);
+          } else if constexpr (__element_count == 8) {
+            return _mm512_maskz_loadu_epi64(__mask, __ptr);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else {
+          static_assert(_Np == 3, "Unexpected size");
+        }
+      }();
+      return {__from_storage, { __data }};
+    }
+  }
+  // Masked store of __data_wrapped to __ptr_raw; the pointer is assumed to be aligned to the full vector size.
+  static _LIBCPP_HIDE_FROM_ABI void __maskstore(_Tp* __ptr_raw, __simd_t __data_wrapped, __mask_t __mask_wrapped) {
+    if constexpr (!__has_maskstore) {
+      return;
+    } else {
+      [&] {
+        auto __mask = __mask_wrapped.__get_data().__mask_; // raw __mmaskN, as the intrinsics expect
+        auto __data = __data_wrapped.__get_data().__data;  // raw vector, as the intrinsics expect
+        auto __ptr  = std::__assume_aligned<sizeof(__storage_t)>(__ptr_raw);
+
+        if constexpr (__element_size == 1) {
+          if constexpr (__element_count == 16) {
+            return _mm_mask_storeu_epi8(__ptr, __mask, __data);
+          } else if constexpr (__element_count == 32) {
+            return _mm256_mask_storeu_epi8(__ptr, __mask, __data);
+          } else if constexpr (__element_count == 64) {
+            return _mm512_mask_storeu_epi8(__ptr, __mask, __data);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 2) {
+          if constexpr (__element_count == 8) {
+            return _mm_mask_storeu_epi16(__ptr, __mask, __data);
+          } else if constexpr (__element_count == 16) {
+            return _mm256_mask_storeu_epi16(__ptr, __mask, __data);
+          } else if constexpr (__element_count == 32) {
+            return _mm512_mask_storeu_epi16(__ptr, __mask, __data);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 4) {
+          if constexpr (__element_count == 4) {
+            return _mm_mask_store_epi32(__ptr, __mask, __data);
+          } else if constexpr (__element_count == 8) {
+            return _mm256_mask_store_epi32(__ptr, __mask, __data);
+          } else if constexpr (__element_count == 16) {
+            return _mm512_mask_store_epi32(__ptr, __mask, __data);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 8) {
+          if constexpr (__element_count == 2) {
+            return _mm_mask_store_epi64(__ptr, __mask, __data);
+          } else if constexpr (__element_count == 4) {
+            return _mm256_mask_store_epi64(__ptr, __mask, __data);
+          } else if constexpr (__element_count == 8) {
+            return _mm512_mask_store_epi64(__ptr, __mask, __data);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        }
+      }();
+    }
+  }
+  // Mask with the first __n elements enabled. Element i is controlled by bit i, so these are the low __n bits.
+  static _LIBCPP_HIDE_FROM_ABI __mask_t __mask_with_first_enabled(size_t __n) noexcept {
+    if constexpr (__element_count == 2) {
+      auto __bitmask = experimental::__set_least_significant_bits<0x0000000000000003>(__n);
+      return {__from_storage, { static_cast<__mmask8>(__bitmask) }};
+    } else if constexpr (__element_count == 4) {
+      auto __bitmask = experimental::__set_least_significant_bits<0x000000000000000F>(__n);
+      return {__from_storage, { static_cast<__mmask8>(__bitmask) }};
+    } else if constexpr (__element_count == 8) {
+      auto __bitmask = experimental::__set_least_significant_bits<0x00000000000000FF>(__n);
+      return {__from_storage, { static_cast<__mmask8>(__bitmask) }};
+    } else if constexpr (__element_count == 16) {
+      auto __bitmask = experimental::__set_least_significant_bits<0x000000000000FFFF>(__n);
+      return {__from_storage, { static_cast<__mmask16>(__bitmask) }};
+    } else if constexpr (__element_count == 32) {
+      auto __bitmask = experimental::__set_least_significant_bits<0x00000000FFFFFFFF>(__n);
+      return {__from_storage, { static_cast<__mmask32>(__bitmask) }};
+    } else if constexpr (__element_count == 64) {
+      auto __bitmask = experimental::__set_least_significant_bits<0xFFFFFFFFFFFFFFFF>(__n);
+      return {__from_storage, { static_cast<__mmask64>(__bitmask) }};
+    }
+  }
+  // Mask with the last __n elements enabled, i.e. the highest __n bits of the element-count-wide pattern.
+  static _LIBCPP_HIDE_FROM_ABI __mask_t __mask_with_last_enabled(size_t __n) noexcept {
+    if constexpr (__element_count == 2) {
+      auto __bitmask = experimental::__set_most_significant_bits<0x0000000000000003>(__n);
+      return {__from_storage, { static_cast<__mmask8>(__bitmask) }};
+    } else if constexpr (__element_count == 4) {
+      auto __bitmask = experimental::__set_most_significant_bits<0x000000000000000F>(__n);
+      return {__from_storage, { static_cast<__mmask8>(__bitmask) }};
+    } else if constexpr (__element_count == 8) {
+      auto __bitmask = experimental::__set_most_significant_bits<0x00000000000000FF>(__n);
+      return {__from_storage, { static_cast<__mmask8>(__bitmask) }};
+    } else if constexpr (__element_count == 16) {
+      auto __bitmask = experimental::__set_most_significant_bits<0x000000000000FFFF>(__n);
+      return {__from_storage, { static_cast<__mmask16>(__bitmask) }};
+    } else if constexpr (__element_count == 32) {
+      auto __bitmask = experimental::__set_most_significant_bits<0x00000000FFFFFFFF>(__n);
+      return {__from_storage, { static_cast<__mmask32>(__bitmask) }};
+    } else if constexpr (__element_count == 64) {
+      auto __bitmask = experimental::__set_most_significant_bits<0xFFFFFFFFFFFFFFFF>(__n);
+      return {__from_storage, { static_cast<__mmask64>(__bitmask) }};
+    }
+  }
+  // Compares the two operands with __comparator (an _MM_CMPINT_* value), restricted to the elements enabled in the mask.
+  template <int __comparator>
+  static _LIBCPP_HIDE_FROM_ABI __mask_t
+  __mask_cmp_mask(__mask_t __mask_wrapped, __simd_t __lhs_wrapped, __simd_t __rhs_wrapped) {
+    if constexpr (!__has_maskstore) {
+      return {}; // unsupported size: a default-constructed mask, mirroring __maskload_unaligned
+    } else {
+      auto __ret = [&] {
+        auto __mask = __mask_wrapped.__get_data().__mask_;
+        auto __lhs  = __lhs_wrapped.__get_data().__data;
+        auto __rhs  = __rhs_wrapped.__get_data().__data;
+
+        if constexpr (__element_size == 1) {
+          if constexpr (__element_count == 16) {
+            return _mm_mask_cmp_epi8_mask(__mask, __lhs, __rhs, __comparator);
+          } else if constexpr (__element_count == 32) {
+            return _mm256_mask_cmp_epi8_mask(__mask, __lhs, __rhs, __comparator);
+          } else if constexpr (__element_count == 64) {
+            return _mm512_mask_cmp_epi8_mask(__mask, __lhs, __rhs, __comparator);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 2) {
+          if constexpr (__element_count == 8) {
+            return _mm_mask_cmp_epi16_mask(__mask, __lhs, __rhs, __comparator);
+          } else if constexpr (__element_count == 16) {
+            return _mm256_mask_cmp_epi16_mask(__mask, __lhs, __rhs, __comparator);
+          } else if constexpr (__element_count == 32) {
+            return _mm512_mask_cmp_epi16_mask(__mask, __lhs, __rhs, __comparator);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 4) {
+          if constexpr (__element_count == 4) {
+            return _mm_mask_cmp_epi32_mask(__mask, __lhs, __rhs, __comparator);
+          } else if constexpr (__element_count == 8) {
+            return _mm256_mask_cmp_epi32_mask(__mask, __lhs, __rhs, __comparator);
+          } else if constexpr (__element_count == 16) {
+            return _mm512_mask_cmp_epi32_mask(__mask, __lhs, __rhs, __comparator);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        } else if constexpr (__element_size == 8) {
+          if constexpr (__element_count == 2) {
+            return _mm_mask_cmp_epi64_mask(__mask, __lhs, __rhs, __comparator);
+          } else if constexpr (__element_count == 4) {
+            return _mm256_mask_cmp_epi64_mask(__mask, __lhs, __rhs, __comparator);
+          } else if constexpr (__element_count == 8) {
+            return _mm512_mask_cmp_epi64_mask(__mask, __lhs, __rhs, __comparator);
+          } else {
+            static_assert(_Np == 3, "Unexpected size");
+          }
+        }
+      }();
+      return {__from_storage, {__ret}};
+    }
+  }
+
+  static _LIBCPP_HIDE_FROM_ABI __mask_t __mask_cmp_eq(__mask_t __mask, __simd_t __lhs, __simd_t __rhs) noexcept {
+    return __mask_cmp_mask<_MM_CMPINT_EQ>(__mask, __lhs, __rhs);
+  }
+#    else
+  static constexpr bool __has_maskload  = false;
+  static constexpr bool __has_maskstore = false;
+#    endif
+};
+
+#  endif // __AVX512F__
+
+} // namespace parallelism_v2
+_LIBCPP_END_NAMESPACE_EXPERIMENTAL
+
+#endif
+
+// NOLINTEND(libcpp-robust-against-adl)
+
+#endif // _LIBCPP_EXPERIMENTAL___SIMD_FEATURE_TRAITS_H
diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h
index c345811fee7fc7..adc40048954c98 100644
--- a/libcxx/include/experimental/__simd/simd.h
+++ b/libcxx/include/experimental/__simd/simd.h
@@ -15,6 +15,7 @@
 #include <__utility/forward.h>
 #include <cstddef>
 #include <experimental/__config>
+#include <experimental/__simd/avx512.h>
 #include <experimental/__simd/declaration.h>
 #include <experimental/__simd/reference.h>
 #include <experimental/__simd/traits.h>
@@ -44,6 +45,9 @@ class simd {
 
   _LIBCPP_HIDE_FROM_ABI simd() noexcept = default;
 
+  template <class _Up, class _Flags> // load constructor; the flags argument is accepted but not inspected
+  _LIBCPP_HIDE_FROM_ABI simd(const _Up* __data, _Flags) noexcept : __s_(_Impl::__load(__data)) {}
+
   // broadcast constructor
   template <class _Up, enable_if_t<__can_broadcast_v<value_type, __remove_cvref_t<_Up>>, int> = 0>
   _LIBCPP_HIDE_FROM_ABI simd(_Up&& __v) noexcept : __s_(_Impl::__broadcast(static_cast<value_type>(__v))) {}
@@ -64,9 +68,78 @@ class simd {
   explicit _LIBCPP_HIDE_FROM_ABI simd(_Generator&& __g) noexcept
       : __s_(_Impl::__generate(std::forward<_Generator>(__g))) {}
 
+  _LIBCPP_HIDE_FROM_ABI simd(__from_storage_t, _Storage __data) noexcept : __s_(__data) {} // adopts a ready-made storage object
+
   // scalar access [simd.subscr]
   _LIBCPP_HIDE_FROM_ABI reference operator[](size_t __i) noexcept { return reference(__s_, __i); }
   _LIBCPP_HIDE_FROM_ABI value_type operator[](size_t __i) const noexcept { return __s_.__get(__i); }
+
+  _LIBCPP_HIDE_FROM_ABI _Storage __get_data() const noexcept { return __s_; } // returns a copy of the underlying storage
+
+#  ifdef __AVX512F__
+  template <int __comparator> // __comparator is forwarded unchanged as the compare predicate of the intrinsic
+  static _LIBCPP_HIDE_FROM_ABI auto __cmp(_Storage __lhs_wrapped, _Storage __rhs_wrapped) {
+    auto __left                 = __lhs_wrapped.__data;
+    auto __right                = __rhs_wrapped.__data;
+    constexpr auto __elem_bytes = sizeof(_Tp);
+    constexpr auto __elem_count = size();
+    if constexpr (__elem_bytes == 1) {
+      if constexpr (__elem_count == 16) {
+        return _mm_cmp_epi8_mask(__left, __right, __comparator);
+      } else if constexpr (__elem_count == 32) {
+        return _mm256_cmp_epi8_mask(__left, __right, __comparator);
+      } else if constexpr (__elem_count == 64) {
+        return _mm512_cmp_epi8_mask(__left, __right, __comparator);
+      } else {
+        static_assert(__elem_count == 0, "Unexpected size");
+      }
+    } else if constexpr (__elem_bytes == 2) {
+      if constexpr (__elem_count == 8) {
+        return _mm_cmp_epi16_mask(__left, __right, __comparator);
+      } else if constexpr (__elem_count == 16) {
+        return _mm256_cmp_epi16_mask(__left, __right, __comparator);
+      } else if constexpr (__elem_count == 32) {
+        return _mm512_cmp_epi16_mask(__left, __right, __comparator);
+      } else {
+        static_assert(__elem_count == 0, "Unexpected size");
+      }
+    } else if constexpr (__elem_bytes == 4) {
+      if constexpr (__elem_count == 4) {
+        return _mm_cmp_epi32_mask(__left, __right, __comparator);
+      } else if constexpr (__elem_count == 8) {
+        return _mm256_cmp_epi32_mask(__left, __right, __comparator);
+      } else if constexpr (__elem_count == 16) {
+        return _mm512_cmp_epi32_mask(__left, __right, __comparator);
+      } else {
+        static_assert(__elem_count == 0, "Unexpected size");
+      }
+    } else if constexpr (__elem_bytes == 8) {
+      if constexpr (__elem_count == 2) {
+        return _mm_cmp_epi64_mask(__left, __right, __comparator);
+      } else if constexpr (__elem_count == 4) {
+        return _mm256_cmp_epi64_mask(__left, __right, __comparator);
+      } else if constexpr (__elem_count == 8) {
+        return _mm512_cmp_epi64_mask(__left, __right, __comparator);
+      } else {
+        static_assert(__elem_count == 0, "Unexpected size");
+      }
+    }
+  }
+#  endif
+
+  friend _LIBCPP_HIDE_FROM_ABI mask_type operator==(const simd& __lhs, const simd& __rhs) noexcept {
+#  ifdef __AVX512F__
+    if constexpr (simd_abi::__is_avx512_v<_Abi>) { // AVX-512 ABI: a single compare intrinsic produces the bitmask
+      return {__from_storage, {__cmp<_MM_CMPINT_EQ>(__lhs.__s_, __rhs.__s_)}};
+    } else
+#  endif
+    { // generic fallback: compare element by element
+      mask_type __result;
+      for (size_t __i = 0; __i != size(); ++__i) // size_t index: same type as the operator[] parameter
+        __result[__i] = __lhs[__i] == __rhs[__i];
+      return __result;
+    }
+  }
 };
 
 template <class _Tp, class _Abi>
diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h
index db03843b46e3ad..3a3dab7cbb7b5b 100644
--- a/libcxx/include/experimental/__simd/simd_mask.h
+++ b/libcxx/include/experimental/__simd/simd_mask.h
@@ -11,6 +11,7 @@
 #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H
 
 #include <__type_traits/is_same.h>
+#include <__utility/unreachable.h>
 #include <cstddef>
 #include <experimental/__config>
 #include <experimental/__simd/declaration.h>
@@ -41,6 +42,8 @@ class simd_mask {
 
   _LIBCPP_HIDE_FROM_ABI simd_mask() noexcept = default;
 
+  _LIBCPP_HIDE_FROM_ABI simd_mask(__from_storage_t, _Storage __data) noexcept : __s_(__data) {} // adopts a ready-made storage object
+
   // broadcast constructor
   _LIBCPP_HIDE_FROM_ABI explicit simd_mask(value_type __v) noexcept : __s_(_Impl::__broadcast(__v)) {}
 
@@ -55,6 +58,8 @@ class simd_mask {
   // scalar access [simd.mask.subscr]
   _LIBCPP_HIDE_FROM_ABI reference operator[](size_t __i) noexcept { return reference(__s_, __i); }
   _LIBCPP_HIDE_FROM_ABI value_type operator[](size_t __i) const noexcept { return __s_.__get(__i); }
+
+  _LIBCPP_HIDE_FROM_ABI _Storage __get_data() const noexcept { return __s_; } // returns a copy of the underlying storage
 };
 
 template <class _Tp, class _Abi>
@@ -66,6 +71,27 @@ using native_simd_mask = simd_mask<_Tp, simd_abi::native<_Tp>>;
 template <class _Tp, int _Np>
 using fixed_size_simd_mask = simd_mask<_Tp, simd_abi::fixed_size<_Np>>;
 
+template <class _Tp, class _Abi>
+_LIBCPP_HIDE_FROM_ABI bool all_of(const simd_mask<_Tp, _Abi>& __mask) noexcept {
+  return __mask_operations<_Tp, _Abi>::all_of(__mask.__get_data()); // dispatches to the ABI-specific implementation
+}
+
+template <class _Tp, class _Abi>
+_LIBCPP_HIDE_FROM_ABI int find_first_set(const simd_mask<_Tp, _Abi>& __mask) noexcept {
+#  ifdef __AVX512F__
+  if constexpr (simd_abi::__is_avx512_v<_Abi>) {
+    return __builtin_ctzll(__mask.__get_data().__mask_); // element i is bit i, so the first set element is the lowest set bit
+  } else
+#  endif
+  {
+    for (size_t __i = 0; __i != __mask.size(); ++__i) {
+      if (__mask[__i])
+        return static_cast<int>(__i);
+    }
+    std::__libcpp_unreachable(); // reached only if no element is set, which callers must not pass
+  }
+}
+
 } // namespace parallelism_v2
 _LIBCPP_END_NAMESPACE_EXPERIMENTAL
 
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h
index 07ba032f493b1e..79dbfdf5982461 100644
--- a/libcxx/include/experimental/__simd/vec_ext.h
+++ b/libcxx/include/experimental/__simd/vec_ext.h
@@ -19,6 +19,10 @@
 #include <experimental/__simd/traits.h>
 #include <experimental/__simd/utility.h>
 
+#if __has_include(<immintrin.h>)
+#  include <immintrin.h>
+#endif
+
 #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL)
 
 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
@@ -73,6 +77,14 @@ struct __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> {
   static _LIBCPP_HIDE_FROM_ABI _SimdStorage __generate(_Generator&& __g) noexcept {
     return __generate_init(std::forward<_Generator>(__g), std::make_index_sequence<_Np>());
   }
+
+  template <class _Up>
+  static _LIBCPP_HIDE_FROM_ABI _SimdStorage __load(const _Up* __data) noexcept {
+    _SimdStorage __loaded;
+    for (size_t __lane = 0; __lane != _Np; ++__lane)
+      __loaded.__set(__lane, __data[__lane]); // lane k receives __data[k]
+    return __loaded;
+  }
 };
 
 template <class _Tp, int _Np>
@@ -87,6 +99,27 @@ struct __mask_operations<_Tp, simd_abi::__vec_ext<_Np>> {
     }
     return __result;
   }
+
+  static _LIBCPP_HIDE_FROM_ABI bool all_of(_MaskStorage __mask) noexcept {
+    [[maybe_unused]] constexpr auto __vec_size = sizeof(_Tp) * _Np; // total vector size in bytes; unused when neither fast path is compiled in
+#  ifdef __AVX2__
+    if constexpr (__vec_size == 32) {
+      return _mm256_movemask_epi8((__m256i)__mask.__data) == 0xffffffffU; // all 32 per-byte bits set; presumably true lanes are stored as all-ones -- TODO confirm
+    } else
+#  endif
+#  ifdef __SSE2__
+    if constexpr (__vec_size == 16) {
+      return _mm_movemask_epi8((__m128i)__mask.__data) == 0xffffU; // all 16 per-byte bits set
+    } else
+#  endif
+    { // generic fallback; also the body of whichever dangling `else` above was compiled in
+      for (int __i = 0; __i != _Np; ++__i) {
+        if (!__mask.__get(__i))
+          return false;
+      }
+      return true;
+    }
+  }
 };
 
 } // namespace parallelism_v2
diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp
index afd1b892086da8..7aea374a5a8c1d 100644
--- a/libcxx/src/memory_resource.cpp
+++ b/libcxx/src/memory_resource.cpp
@@ -8,6 +8,7 @@
 
 #include <memory>
 #include <memory_resource>
+#include <__utility/align_down.h>
 
 #ifndef _LIBCPP_HAS_NO_ATOMIC_HEADER
 #  include <atomic>
@@ -416,7 +417,7 @@ static void* align_down(size_t align, size_t size, void*& ptr, size_t& space) {
     return nullptr;
 
   char* p1      = static_cast<char*>(ptr);
-  char* new_ptr = reinterpret_cast<char*>(reinterpret_cast<uintptr_t>(p1 - size) & ~(align - 1));
+  char* new_ptr = static_cast<char*>(std::__align_down(align, p1 - size));
 
   if (new_ptr < (p1 - space))
     return nullptr;
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
index cc588c095ccfb2..e5f481bc6cfae0 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/mismatch/mismatch.pass.cpp
@@ -23,71 +23,36 @@
 #include "test_macros.h"
 #include "test_iterators.h"
 
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
-    int ia[] = {1, 3, 6, 7};
-    int ib[] = {1, 3};
-    int ic[] = {1, 3, 5, 7};
-    typedef cpp17_input_iterator<int*>         II;
-    typedef bidirectional_iterator<int*> BI;
+TEST_CONSTEXPR_CXX20 bool test() {
+  int ia[]          = {0, 1, 2, 2, 0, 1, 2, 3};
+  const unsigned sa = sizeof(ia) / sizeof(ia[0]);
+  int ib[]          = {0, 1, 2, 3, 0, 1, 2, 3};
+  const unsigned sb = sizeof(ib) / sizeof(ib[0]);
+  ((void)sb); // unused in C++11
 
-    auto p1 = std::mismatch(std::begin(ia), std::end(ia), std::begin(ic));
-    if (p1.first != ia+2 || p1.second != ic+2)
-        return false;
+  typedef cpp17_input_iterator<const int*> II;
+  typedef random_access_iterator<const int*> RAI;
 
-    auto p2 = std::mismatch(std::begin(ia), std::end(ia), std::begin(ic), std::end(ic));
-    if (p2.first != ia+2 || p2.second != ic+2)
-        return false;
+  assert(std::mismatch(II(ia), II(ia + sa), II(ib)) == (std::pair<II, II>(II(ia + 3), II(ib + 3))));
 
-    auto p3 = std::mismatch(std::begin(ib), std::end(ib), std::begin(ic));
-    if (p3.first != ib+2 || p3.second != ic+2)
-        return false;
-
-    auto p4 = std::mismatch(std::begin(ib), std::end(ib), std::begin(ic), std::end(ic));
-    if (p4.first != ib+2 || p4.second != ic+2)
-        return false;
-
-    auto p5 = std::mismatch(II(std::begin(ib)), II(std::end(ib)), II(std::begin(ic)));
-    if (p5.first != II(ib+2) || p5.second != II(ic+2))
-        return false;
-    auto p6 = std::mismatch(BI(std::begin(ib)), BI(std::end(ib)), BI(std::begin(ic)), BI(std::end(ic)));
-    if (p6.first != BI(ib+2) || p6.second != BI(ic+2))
-        return false;
-
-    return true;
-    }
-#endif
-
-int main(int, char**)
-{
-    int ia[] = {0, 1, 2, 2, 0, 1, 2, 3};
-    const unsigned sa = sizeof(ia)/sizeof(ia[0]);
-    int ib[] = {0, 1, 2, 3, 0, 1, 2, 3};
-    const unsigned sb = sizeof(ib)/sizeof(ib[0]); ((void)sb); // unused in C++11
-
-    typedef cpp17_input_iterator<const int*> II;
-    typedef random_access_iterator<const int*>  RAI;
-
-    assert(std::mismatch(II(ia), II(ia + sa), II(ib))
-            == (std::pair<II, II>(II(ia+3), II(ib+3))));
-
-    assert(std::mismatch(RAI(ia), RAI(ia + sa), RAI(ib))
-            == (std::pair<RAI, RAI>(RAI(ia+3), RAI(ib+3))));
+  assert(std::mismatch(RAI(ia), RAI(ia + sa), RAI(ib)) == (std::pair<RAI, RAI>(RAI(ia + 3), RAI(ib + 3))));
 
 #if TEST_STD_VER > 11 // We have the four iteration version
-    assert(std::mismatch(II(ia), II(ia + sa), II(ib), II(ib+sb))
-            == (std::pair<II, II>(II(ia+3), II(ib+3))));
-
-    assert(std::mismatch(RAI(ia), RAI(ia + sa), RAI(ib), RAI(ib+sb))
-            == (std::pair<RAI, RAI>(RAI(ia+3), RAI(ib+3))));
+  assert(std::mismatch(II(ia), II(ia + sa), II(ib), II(ib + sb)) == (std::pair<II, II>(II(ia + 3), II(ib + 3))));
 
+  assert(std::mismatch(RAI(ia), RAI(ia + sa), RAI(ib), RAI(ib + sb)) ==
+         (std::pair<RAI, RAI>(RAI(ia + 3), RAI(ib + 3))));
 
-    assert(std::mismatch(II(ia), II(ia + sa), II(ib), II(ib+2))
-            == (std::pair<II, II>(II(ia+2), II(ib+2))));
+  assert(std::mismatch(II(ia), II(ia + sa), II(ib), II(ib + 2)) == (std::pair<II, II>(II(ia + 2), II(ib + 2))));
 #endif
 
-#if TEST_STD_VER > 17
-    static_assert(test_constexpr());
+  return true;
+}
+
+int main(int, char**) {
+  test();
+#if TEST_STD_VER >= 20
+  static_assert(test());
 #endif
 
   return 0;