[libcxx-commits] [libcxx] [libc++] Optimize std::remove (PR #107756)
Nikolas Klauser via libcxx-commits
libcxx-commits at lists.llvm.org
Sun Sep 8 07:45:40 PDT 2024
https://github.com/philnik777 created https://github.com/llvm/llvm-project/pull/107756
None
>From 983d3477338be496c187e969631d438239e62a9f Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Sun, 8 Sep 2024 16:44:22 +0200
Subject: [PATCH] [libc++] Optimize std::remove
---
libcxx/include/__algorithm/count.h | 6 +-
libcxx/include/__algorithm/remove.h | 41 +++++-
libcxx/include/__algorithm/simd_utils.h | 136 +++++++++++++++---
libcxx/include/__bit/popcount.h | 39 +++--
.../include/__stop_token/atomic_unique_lock.h | 2 +-
libcxx/test/benchmarks/CMakeLists.txt | 1 +
.../benchmarks/algorithms/remove.bench.cpp | 73 ++++++++++
7 files changed, 245 insertions(+), 53 deletions(-)
create mode 100644 libcxx/test/benchmarks/algorithms/remove.bench.cpp
diff --git a/libcxx/include/__algorithm/count.h b/libcxx/include/__algorithm/count.h
index 1cfe7f631ac1b7..bc75f4f421badf 100644
--- a/libcxx/include/__algorithm/count.h
+++ b/libcxx/include/__algorithm/count.h
@@ -55,17 +55,17 @@ __count_bool(__bit_iterator<_Cp, _IsConst> __first, typename _Cp::size_type __n)
__storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
__storage_type __dn = std::min(__clz_f, __n);
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
- __r = std::__libcpp_popcount(std::__invert_if<!_ToCount>(*__first.__seg_) & __m);
+ __r = std::__popcount(std::__invert_if<!_ToCount>(*__first.__seg_) & __m);
__n -= __dn;
++__first.__seg_;
}
// do middle whole words
for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word)
- __r += std::__libcpp_popcount(std::__invert_if<!_ToCount>(*__first.__seg_));
+ __r += std::__popcount(std::__invert_if<!_ToCount>(*__first.__seg_));
// do last partial word
if (__n > 0) {
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
- __r += std::__libcpp_popcount(std::__invert_if<!_ToCount>(*__first.__seg_) & __m);
+ __r += std::__popcount(std::__invert_if<!_ToCount>(*__first.__seg_) & __m);
}
return __r;
}
diff --git a/libcxx/include/__algorithm/remove.h b/libcxx/include/__algorithm/remove.h
index fd01c23cb6708a..b2ac7018c4bbc3 100644
--- a/libcxx/include/__algorithm/remove.h
+++ b/libcxx/include/__algorithm/remove.h
@@ -11,6 +11,9 @@
#include <__algorithm/find.h>
#include <__algorithm/find_if.h>
+#include <__algorithm/simd_utils.h>
+#include <__algorithm/unwrap_iter.h>
+#include <__bit/popcount.h>
#include <__config>
#include <__utility/move.h>
@@ -23,12 +26,36 @@ _LIBCPP_PUSH_MACROS
_LIBCPP_BEGIN_NAMESPACE_STD
-template <class _ForwardIterator, class _Tp>
-_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
-remove(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+// Vectorized std::remove for contiguous ranges whose element type has a compress-store implementation.
+// Returns the new logical end of the range; elements in [result, __last) are left in an unspecified state.
+template <class _Tp, __enable_if_t<__has_compressstore<__simd_vector<_Tp, __native_vector_size<_Tp>>>, int> = 0>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp*
+__remove(_Tp* __first, _Tp* __last, const _Tp& __val) {
+  // Copy the value up front: `__val` may refer to an element of [__first, __last) that gets overwritten while
+  // compacting, and the vector loop and the scalar tail have to compare against the same value.
+  const _Tp __needle = __val;
+
+  // Nothing in front of the first match has to move, so compaction starts there.
+  __first    = std::find(__first, __last, __needle);
+  _Tp* __out = __first;
+
+  // __broadcast, __load_vector and __compressstore are not constexpr, so the vector loop must be skipped
+  // during constant evaluation; the scalar loop below then handles the whole range.
+  if (!__builtin_is_constant_evaluated()) {
+    constexpr size_t __vec_size = __native_vector_size<_Tp>;
+    using __vec                 = __simd_vector<_Tp, __vec_size>;
+
+    auto __vals = std::__broadcast<__vec>(__needle);
+
+    while (static_cast<size_t>(__last - __first) >= __vec_size) {
+      auto __elements = std::__load_vector<__vec>(__first);
+      // Lanes that differ from the value are the ones that are kept.
+      auto __cmp = __elements != __vals;
+      std::__compressstore(__out, __elements, __cmp);
+      __out += std::__popcount(std::__to_int_mask(__cmp));
+      __first += __vec_size;
+    }
+  }
+
+  // Scalar tail: fewer than one vector of elements is left (or we are in a constant expression).
+  for (; __first != __last; ++__first) {
+    if (*__first != __needle)
+      *__out++ = *__first;
+  }
+  return __out;
+}
+
+template <class _Iter, class _Tp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Iter
+__remove(_Iter __first, _Iter __last, const _Tp& __value) {
__first = std::find(__first, __last, __value);
if (__first != __last) {
- _ForwardIterator __i = __first;
+ _Iter __i = __first;
while (++__i != __last) {
if (!(*__i == __value)) {
*__first = std::move(*__i);
@@ -39,6 +66,12 @@ remove(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
return __first;
}
+template <class _ForwardIterator, class _Tp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _ForwardIterator
+remove(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+ return std::__rewrap_iter(__first, std::__remove(std::__unwrap_iter(__first), std::__unwrap_iter(__last), __value));
+}
+
_LIBCPP_END_NAMESPACE_STD
_LIBCPP_POP_MACROS
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index 549197be80183f..473d158b40499d 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -108,6 +108,13 @@ _LIBCPP_HIDE_FROM_ABI _Tp __simd_vector_underlying_type_impl(__simd_vector<_Tp,
template <class _VecT>
using __simd_vector_underlying_type_t = decltype(std::__simd_vector_underlying_type_impl(_VecT{}));
+template <class _VecT>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _VecT __broadcast(__simd_vector_underlying_type_t<_VecT> __value) noexcept {
+ return [=]<size_t... _Indices>(index_sequence<_Indices...>) _LIBCPP_ALWAYS_INLINE noexcept {
+ return _VecT{((void)_Indices, __value)...};
+ }(make_index_sequence<__simd_vector_size_v<_VecT>>{});
+}
+
// This isn't inlined without always_inline when loading chars.
template <class _VecT, class _Iter>
_LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vector(_Iter __iter) noexcept {
@@ -116,6 +123,111 @@ _LIBCPP_NODISCARD _LIBCPP_ALWAYS_INLINE _LIBCPP_HIDE_FROM_ABI _VecT __load_vecto
}(make_index_sequence<__simd_vector_size_v<_VecT>>{});
}
+template <size_t _Np>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI auto __extend_vector(__simd_vector<bool, _Np> __vec) noexcept {
+ using _VecT = __simd_vector<bool, _Np>;
+ static_assert(_Np <= 8, "Unexpected vector size");
+ if constexpr (_Np >= 4) {
+ return __builtin_shufflevector(__vec, _VecT{}, 0, 1, 2, 3, 4, 5, 6, 7);
+ } else if constexpr (_Np >= 2) {
+ return std::__extend_vector(__builtin_shufflevector(__vec, _VecT{}, 0, 1, 2, 3));
+ } else {
+ return std::__extend_vector(__builtin_shufflevector(__vec, _VecT{}, 0, 1));
+ }
+}
+
+// Converts a vector to an unsigned integer holding one bit per element (the element converted to bool).
+// Vectors with 8 or fewer elements are first widened to 8 elements so the result is a uint8_t.
+//
+// MSan stays disabled for the bool-vector bit cast, as it was in the __find_first_set code this helper
+// replaces; see https://github.com/llvm/llvm-project/issues/85876
+template <class _Tp, size_t _Np>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("memory") auto
+__to_int_mask(__simd_vector<_Tp, _Np> __in_vec) noexcept {
+  auto __vec = __builtin_convertvector(__in_vec, __simd_vector<bool, _Np>);
+  if constexpr (_Np <= 8) {
+    return std::__bit_cast<uint8_t>(std::__extend_vector(__vec));
+  } else if constexpr (_Np <= 16) {
+    return std::__bit_cast<uint16_t>(__vec);
+  } else if constexpr (_Np <= 32) {
+    return std::__bit_cast<uint32_t>(__vec);
+  } else if constexpr (_Np <= 64) {
+    return std::__bit_cast<uint64_t>(__vec);
+  } else {
+    static_assert(sizeof(__simd_vector<bool, _Np>) == 0, "unexpected vector size");
+  }
+}
+
+// Number of set elements in a bool vector. The result is a bit count, so it is returned as an int like the
+// scalar std::__popcount, not as a vector.
+template <size_t _Np>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI int __popcount(__simd_vector<bool, _Np> __vec) noexcept {
+  return std::__popcount(std::__to_int_mask(__vec));
+}
+
+template <class _Vector, class = void>
+inline constexpr bool __has_compressstore = false;
+
+void __compressstore() = delete;
+
+# if defined(__BMI2__) && __has_builtin(__builtin_ia32_bzhi_di) && defined(__AVX512VL__)
+
+// 8 bit elements
+# if defined(__AVX512BW__) && defined(__AVX512VBMI2__)
+# if __has_builtin(__builtin_ia32_compressqi256_mask) && __has_builtin(__builtin_ia32_storedquqi256_mask)
+template <class _Tp>
+inline constexpr bool __has_compressstore<__simd_vector<_Tp, 32>, __enable_if_t<sizeof(_Tp) == 1>> = true;
+
+template <class _Tp, __enable_if_t<sizeof(_Tp) == 1, int> = 0>
+void __compressstore(_Tp* __dest, __simd_vector<_Tp, 32> __vec, __simd_vector<_Tp, 32> __mask) {
+ auto __storemask = __builtin_ia32_bzhi_di(-1, std::__popcount(std::__to_int_mask(__mask)));
+ __builtin_ia32_storedquqi256_mask(
+ (__simd_vector<char, 32>*)__dest,
+ __builtin_ia32_compressqi256_mask(__vec, {}, std::__to_int_mask(__mask)),
+ __storemask);
+}
+# endif // __has_builtin(__builtin_ia32_compressqi256_mask) && __has_builtin(__builtin_ia32_storedquqi256_mask)
+
+// 16 bit elements
+# if __has_builtin(__builtin_ia32_compresshi256_mask) && __has_builtin(__builtin_ia32_storedquhi256_mask)
+template <class _Tp>
+inline constexpr bool __has_compressstore<__simd_vector<_Tp, 16>, __enable_if_t<sizeof(_Tp) == 2>> = true;
+
+template <class _Tp, __enable_if_t<sizeof(_Tp) == 2, int> = 0>
+void __compressstore(_Tp* __dest, __simd_vector<_Tp, 16> __vec, __simd_vector<_Tp, 16> __mask) {
+ auto __storemask = __builtin_ia32_bzhi_di(-1, std::__popcount(std::__to_int_mask(__mask)));
+ __builtin_ia32_storedquhi256_mask(
+ (__simd_vector<char, 32>*)__dest,
+ __builtin_ia32_compresshi256_mask(__vec, {}, std::__to_int_mask(__mask)),
+ __storemask);
+}
+# endif // __has_builtin(__builtin_ia32_compresshi256_mask) && __has_builtin(__builtin_ia32_storedquhi256_mask)
+# endif // defined(__AVX512BW__) && defined(__AVX512VBMI2__)
+
+// 32 bit elements
+# if __has_builtin(__builtin_ia32_compresssi256_mask) && __has_builtin(__builtin_ia32_storedqusi256_mask)
+template <class _Tp>
+inline constexpr bool __has_compressstore<__simd_vector<_Tp, 8>, __enable_if_t<sizeof(_Tp) == 4>> = true;
+
+// Packs the elements of __vec selected by __mask to the front and writes exactly that many elements to __dest.
+// __dest is an arbitrary element pointer, so the unaligned masked store (storedqu) is used; the aligned
+// movdqa32 form requires a 32-byte aligned address.
+// NOTE(review): the destination cast mirrors the 8/16 bit overloads - confirm it matches the builtin's pointer type.
+template <class _Tp, __enable_if_t<sizeof(_Tp) == 4, int> = 0>
+void __compressstore(_Tp* __dest, __simd_vector<_Tp, 8> __vec, __simd_vector<_Tp, 8> __mask) {
+  // Low popcount(mask) bits set: only the compacted elements are written back.
+  auto __storemask = __builtin_ia32_bzhi_di(-1, std::__popcount(std::__to_int_mask(__mask)));
+  __builtin_ia32_storedqusi256_mask(
+      (__simd_vector<char, 32>*)__dest,
+      __builtin_ia32_compresssi256_mask(__vec, {}, std::__to_int_mask(__mask)),
+      __storemask);
+}
+# endif // __has_builtin(__builtin_ia32_compresssi256_mask) && __has_builtin(__builtin_ia32_storedqusi256_mask)
+
+// 64 bit elements
+# if __has_builtin(__builtin_ia32_compressdi256_mask) && __has_builtin(__builtin_ia32_storedqudi256_mask)
+template <class _Tp>
+inline constexpr bool __has_compressstore<__simd_vector<_Tp, 4>, __enable_if_t<sizeof(_Tp) == 8>> = true;
+
+// Packs the elements of __vec selected by __mask to the front and writes exactly that many elements to __dest.
+// The compress has to use the 64-bit lane builtin (compressdi); compresssi operates on eight 32-bit lanes.
+// __dest is an arbitrary element pointer, so the unaligned masked store (storedqu) is used; the aligned
+// movdqa64 form requires a 32-byte aligned address.
+// NOTE(review): the destination cast mirrors the 8/16 bit overloads - confirm it matches the builtin's pointer type.
+template <class _Tp, __enable_if_t<sizeof(_Tp) == 8, int> = 0>
+void __compressstore(_Tp* __dest, __simd_vector<_Tp, 4> __vec, __simd_vector<_Tp, 4> __mask) {
+  // Low popcount(mask) bits set: only the compacted elements are written back.
+  auto __storemask = __builtin_ia32_bzhi_di(-1, std::__popcount(std::__to_int_mask(__mask)));
+  __builtin_ia32_storedqudi256_mask(
+      (__simd_vector<char, 32>*)__dest,
+      __builtin_ia32_compressdi256_mask(__vec, {}, std::__to_int_mask(__mask)),
+      __storemask);
+}
+# endif // __has_builtin(__builtin_ia32_compressdi256_mask) && __has_builtin(__builtin_ia32_storedqudi256_mask)
+
+# endif
+
template <class _Tp, size_t _Np>
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __vec) noexcept {
return __builtin_reduce_and(__builtin_convertvector(__vec, __simd_vector<bool, _Np>));
@@ -123,31 +235,11 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI bool __all_of(__simd_vector<_Tp, _Np> __
template <class _Tp, size_t _Np>
_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_Tp, _Np> __vec) noexcept {
- using __mask_vec = __simd_vector<bool, _Np>;
-
- // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876
- auto __impl = [&]<class _MaskT>(_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept {
# if defined(_LIBCPP_BIG_ENDIAN)
- return std::min<size_t>(
- _Np, std::__countl_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
+ return std::min<size_t>(_Np, std::__countl_zero(std::__to_int_mask(__vec)));
# else
- return std::min<size_t>(
- _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
+ return std::min<size_t>(_Np, std::__countr_zero(std::__to_int_mask(__vec)));
# endif
- };
-
- if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) {
- return __impl(uint8_t{});
- } else if constexpr (sizeof(__mask_vec) == sizeof(uint16_t)) {
- return __impl(uint16_t{});
- } else if constexpr (sizeof(__mask_vec) == sizeof(uint32_t)) {
- return __impl(uint32_t{});
- } else if constexpr (sizeof(__mask_vec) == sizeof(uint64_t)) {
- return __impl(uint64_t{});
- } else {
- static_assert(sizeof(__mask_vec) == 0, "unexpected required size for mask integer type");
- return 0;
- }
}
template <class _Tp, size_t _Np>
diff --git a/libcxx/include/__bit/popcount.h b/libcxx/include/__bit/popcount.h
index 5cf0a01d073382..13da1624afcb2a 100644
--- a/libcxx/include/__bit/popcount.h
+++ b/libcxx/include/__bit/popcount.h
@@ -26,40 +26,33 @@ _LIBCPP_PUSH_MACROS
_LIBCPP_BEGIN_NAMESPACE_STD
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned __x) _NOEXCEPT {
- return __builtin_popcount(__x);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned long __x) _NOEXCEPT {
- return __builtin_popcountl(__x);
-}
-
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __libcpp_popcount(unsigned long long __x) _NOEXCEPT {
- return __builtin_popcountll(__x);
-}
-
-#if _LIBCPP_STD_VER >= 20
-
-template <__libcpp_unsigned_integer _Tp>
-[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int popcount(_Tp __t) noexcept {
-# if __has_builtin(__builtin_popcountg)
+template <class _Tp>
+_LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int __popcount(_Tp __t) _NOEXCEPT {
+#if __has_builtin(__builtin_popcountg)
return __builtin_popcountg(__t);
-# else // __has_builtin(__builtin_popcountg)
+#else // __has_builtin(__builtin_popcountg)
if (sizeof(_Tp) <= sizeof(unsigned int))
- return std::__libcpp_popcount(static_cast<unsigned int>(__t));
+ return __builtin_popcount(__t);
else if (sizeof(_Tp) <= sizeof(unsigned long))
- return std::__libcpp_popcount(static_cast<unsigned long>(__t));
+ return __builtin_popcountl(__t);
else if (sizeof(_Tp) <= sizeof(unsigned long long))
- return std::__libcpp_popcount(static_cast<unsigned long long>(__t));
+ return __builtin_popcountll(__t);
else {
int __ret = 0;
while (__t != 0) {
- __ret += std::__libcpp_popcount(static_cast<unsigned long long>(__t));
+ __ret += std::__popcount(static_cast<unsigned long long>(__t));
__t >>= numeric_limits<unsigned long long>::digits;
}
return __ret;
}
-# endif // __has_builtin(__builtin_popcountg)
+#endif // __has_builtin(__builtin_popcountg)
+}
+
+#if _LIBCPP_STD_VER >= 20
+
+template <__libcpp_unsigned_integer _Tp>
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int popcount(_Tp __t) noexcept {
+ return std::__popcount(__t);
}
#endif // _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__stop_token/atomic_unique_lock.h b/libcxx/include/__stop_token/atomic_unique_lock.h
index 13e59f9f0dce00..20ba663d70c7f0 100644
--- a/libcxx/include/__stop_token/atomic_unique_lock.h
+++ b/libcxx/include/__stop_token/atomic_unique_lock.h
@@ -28,7 +28,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// and LockedBit is the value of State when the lock bit is set, e.g 1 << 2
template <class _State, _State _LockedBit>
class _LIBCPP_AVAILABILITY_SYNC __atomic_unique_lock {
- static_assert(std::__libcpp_popcount(static_cast<unsigned long long>(_LockedBit)) == 1,
+ static_assert(std::__popcount(static_cast<unsigned long long>(_LockedBit)) == 1,
"LockedBit must be an integer where only one bit is set");
std::atomic<_State>& __state_;
diff --git a/libcxx/test/benchmarks/CMakeLists.txt b/libcxx/test/benchmarks/CMakeLists.txt
index 616cf0ff8d2374..f2e24b3259a070 100644
--- a/libcxx/test/benchmarks/CMakeLists.txt
+++ b/libcxx/test/benchmarks/CMakeLists.txt
@@ -134,6 +134,7 @@ set(BENCHMARK_TESTS
algorithms/ranges_sort.bench.cpp
algorithms/ranges_sort_heap.bench.cpp
algorithms/ranges_stable_sort.bench.cpp
+ algorithms/remove.bench.cpp
algorithms/set_intersection.bench.cpp
algorithms/sort.bench.cpp
algorithms/sort_heap.bench.cpp
diff --git a/libcxx/test/benchmarks/algorithms/remove.bench.cpp b/libcxx/test/benchmarks/algorithms/remove.bench.cpp
new file mode 100644
index 00000000000000..17cda19ec01009
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/remove.bench.cpp
@@ -0,0 +1,73 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <benchmark/benchmark.h>
+#include <cstddef>
+#include <memory>
+#include <random>
+#include <vector>
+
+struct remove_all {
+ std::size_t operator()() { return 10; }
+};
+
+struct remove_first {
+ std::size_t I = 10;
+ std::size_t operator()() { return I++; }
+};
+
+struct remove_every_second {
+ bool b = false;
+ std::size_t operator()() {
+ b = !b;
+ return b ? 10 : 11;
+ }
+};
+
+struct remove_random {
+ std::shared_ptr<std::mt19937> rng = std::make_shared<std::mt19937>(std::random_device{}());
+ std::size_t operator()() {
+ return (*rng)();
+ }
+};
+
+template <class T, class Generator>
+static void bm_remove(benchmark::State& state) {
+ std::vector<T> vec(state.range());
+ Generator gen;
+ std::generate(vec.begin(), vec.end(), gen);
+
+ for (auto _ : state) {
+ auto cpy = vec;
+ benchmark::DoNotOptimize(cpy);
+ benchmark::DoNotOptimize(std::remove(cpy.begin(), cpy.end(), char(10)));
+ }
+}
+BENCHMARK(bm_remove<char, remove_all>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_remove<char, remove_first>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_remove<char, remove_every_second>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_remove<char, remove_random>)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+template <class T, class Generator>
+static void bm_ranges_remove(benchmark::State& state) {
+ std::vector<T> vec(state.range());
+ Generator gen;
+ std::generate(vec.begin(), vec.end(), gen);
+
+ for (auto _ : state) {
+ auto cpy = vec;
+ benchmark::DoNotOptimize(cpy);
+ benchmark::DoNotOptimize(std::ranges::remove(cpy, char(10)));
+ }
+}
+BENCHMARK(bm_ranges_remove<char, remove_all>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_ranges_remove<char, remove_first>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_ranges_remove<char, remove_every_second>)->DenseRange(1, 8)->Range(16, 1 << 20);
+BENCHMARK(bm_ranges_remove<char, remove_random>)->DenseRange(1, 8)->Range(16, 1 << 20);
+
+BENCHMARK_MAIN();
More information about the libcxx-commits mailing list