[libcxx-commits] [libcxx] 4eddbf9 - std::sort: add BlockQuickSort partitioning algorithm for arithmetic types
Nilay Vaish via libcxx-commits
libcxx-commits at lists.llvm.org
Thu Dec 22 14:47:31 PST 2022
Author: Nilay Vaish
Date: 2022-12-22T14:46:56-08:00
New Revision: 4eddbf9f10a6d1881c93d84f4363d6d881daf848
URL: https://github.com/llvm/llvm-project/commit/4eddbf9f10a6d1881c93d84f4363d6d881daf848
DIFF: https://github.com/llvm/llvm-project/commit/4eddbf9f10a6d1881c93d84f4363d6d881daf848.diff
LOG: std::sort: add BlockQuickSort partitioning algorithm for arithmetic types
This diff modifies std::sort in two ways:
* for arithmetic types we update the core partitioning algorithm to use
BlockQuickSort for partitioning. The partition function was carefully
written to let the compiler generates SIMD instructions without actually
writing SIMD intrinsics in the loop. We see up to 50% better performance
for sorting arithmetic types. The use of the BlockQuickSort partitioning
has been limited to arithmetic types since the algorithm works well when
branch instructions can be avoided during partitioning. This usually not
true for types other than the arithmetic ones.
* for other types (tuples, strings) updates have been made to improve
performance by about 10%. Performance numbers comparing std::sort (old)
and Bitset sort (new) on libcxx benchmark.
name old cpu/op new cpu/op delta
BM_Sort_uint32_Random_1 3.72ns ± 5% 3.78ns ±16% ~ (p=0.819 n=36+34)
BM_Sort_uint32_Random_4 5.42ns ± 5% 5.29ns ± 7% -2.42% (p=0.000 n=35+31)
BM_Sort_uint32_Random_16 10.5ns ± 3% 11.9ns ±15% +13.08% (p=0.000 n=36+40)
BM_Sort_uint32_Random_64 18.6ns ± 7% 18.5ns ±15% -0.95% (p=0.002 n=33+40)
BM_Sort_uint32_Random_256 26.2ns ± 4% 21.3ns ± 8% -18.89% (p=0.000 n=37+34)
BM_Sort_uint32_Random_1024 33.4ns ± 5% 23.3ns ± 4% -30.37% (p=0.000 n=39+35)
BM_Sort_uint32_Random_16384 47.7ns ± 5% 26.7ns ± 5% -44.06% (p=0.000 n=39+35)
BM_Sort_uint32_Random_262144 62.6ns ± 3% 30.1ns ± 6% -51.81% (p=0.000 n=37+36)
BM_Sort_uint32_Ascending_1 3.71ns ± 3% 4.28ns ± 3% +15.53% (p=0.000 n=37+35)
BM_Sort_uint32_Ascending_4 1.47ns ± 3% 1.46ns ± 3% ~ (p=0.083 n=36+37)
BM_Sort_uint32_Ascending_16 0.93ns ± 4% 1.02ns ± 3% +9.32% (p=0.000 n=36+36)
BM_Sort_uint32_Ascending_64 1.23ns ± 5% 1.51ns ± 3% +22.56% (p=0.000 n=34+36)
BM_Sort_uint32_Ascending_256 1.21ns ± 3% 1.57ns ± 4% +29.77% (p=0.000 n=33+35)
BM_Sort_uint32_Ascending_1024 1.03ns ± 4% 1.43ns ± 3% +38.44% (p=0.000 n=32+35)
BM_Sort_uint32_Ascending_16384 0.94ns ± 8% 1.36ns ± 5% +44.09% (p=0.000 n=32+35)
BM_Sort_uint32_Ascending_262144 0.93ns ± 3% 1.35ns ± 7% +45.06% (p=0.000 n=32+36)
BM_Sort_uint32_Descending_1 3.69ns ± 2% 4.27ns ± 3% +15.73% (p=0.000 n=31+36)
BM_Sort_uint32_Descending_4 1.74ns ± 2% 1.78ns ± 3% +2.29% (p=0.000 n=31+38)
BM_Sort_uint32_Descending_16 3.92ns ± 4% 4.20ns ± 4% +7.13% (p=0.000 n=32+38)
BM_Sort_uint32_Descending_64 2.09ns ± 4% 3.25ns ± 4% +55.10% (p=0.000 n=33+37)
BM_Sort_uint32_Descending_256 1.98ns ± 7% 2.93ns ± 4% +47.95% (p=0.000 n=34+36)
BM_Sort_uint32_Descending_1024 2.23ns ± 6% 2.64ns ± 3% +18.22% (p=0.000 n=34+38)
BM_Sort_uint32_Descending_16384 1.93ns ± 6% 2.43ns ± 4% +25.99% (p=0.000 n=34+35)
BM_Sort_uint32_Descending_262144 1.89ns ± 3% 2.38ns ± 4% +25.41% (p=0.000 n=33+35)
BM_Sort_uint32_SingleElement_1 3.67ns ± 2% 4.28ns ± 4% +16.60% (p=0.000 n=34+34)
BM_Sort_uint32_SingleElement_4 1.48ns ± 4% 1.48ns ± 5% ~ (p=0.951 n=35+33)
BM_Sort_uint32_SingleElement_16 0.93ns ± 3% 1.02ns ± 4% +9.51% (p=0.000 n=36+33)
BM_Sort_uint32_SingleElement_64 0.76ns ± 3% 1.59ns ± 8% +109.78% (p=0.000 n=36+32)
BM_Sort_uint32_SingleElement_256 0.82ns ± 4% 1.45ns ± 5% +76.62% (p=0.000 n=37+34)
BM_Sort_uint32_SingleElement_1024 0.77ns ± 4% 1.31ns ± 4% +71.40% (p=0.000 n=34+34)
BM_Sort_uint32_SingleElement_16384 0.64ns ± 4% 1.24ns ± 6% +93.29% (p=0.000 n=35+36)
BM_Sort_uint32_SingleElement_262144 0.63ns ± 3% 1.23ns ± 4% +95.17% (p=0.000 n=35+35)
BM_Sort_uint32_PipeOrgan_1 3.68ns ± 2% 4.42ns ± 3% +20.31% (p=0.000 n=34+36)
BM_Sort_uint32_PipeOrgan_4 1.54ns ± 3% 1.53ns ± 3% ~ (p=0.128 n=34+36)
BM_Sort_uint32_PipeOrgan_16 2.22ns ± 3% 1.99ns ± 3% -10.28% (p=0.000 n=33+36)
BM_Sort_uint32_PipeOrgan_64 4.41ns ± 3% 3.39ns ± 4% -23.17% (p=0.000 n=35+37)
BM_Sort_uint32_PipeOrgan_256 2.75ns ± 5% 3.07ns ± 3% +11.74% (p=0.000 n=37+37)
BM_Sort_uint32_PipeOrgan_1024 3.58ns ± 2% 5.48ns ± 3% +52.97% (p=0.000 n=37+36)
BM_Sort_uint32_PipeOrgan_16384 4.10ns ± 3% 6.53ns ± 3% +59.27% (p=0.000 n=37+37)
BM_Sort_uint32_PipeOrgan_262144 4.90ns ± 3% 7.39ns ± 3% +50.71% (p=0.000 n=34+37)
BM_Sort_uint32_QuickSortAdversary_1 3.68ns ± 2% 4.28ns ± 3% +16.19% (p=0.000 n=36+37)
BM_Sort_uint32_QuickSortAdversary_4 1.46ns ± 4% 1.46ns ± 3% ~ (p=0.736 n=35+38)
BM_Sort_uint32_QuickSortAdversary_16 0.93ns ± 3% 1.02ns ± 4% +9.69% (p=0.000 n=36+37)
BM_Sort_uint32_QuickSortAdversary_64 13.6ns ± 4% 17.9ns ± 8% +31.37% (p=0.000 n=36+35)
BM_Sort_uint32_QuickSortAdversary_256 20.0ns ± 4% 25.7ns ± 4% +28.69% (p=0.000 n=36+35)
BM_Sort_uint32_QuickSortAdversary_1024 28.3ns ± 6% 31.7ns ± 3% +12.12% (p=0.000 n=36+37)
BM_Sort_uint32_QuickSortAdversary_16384 45.8ns ± 3% 50.6ns ± 4% +10.32% (p=0.000 n=38+36)
BM_Sort_uint32_QuickSortAdversary_262144 61.6ns ± 4% 68.2ns ± 4% +10.68% (p=0.000 n=37+37)
BM_Sort_uint64_Random_1 3.71ns ± 4% 4.00ns ± 4% +7.93% (p=0.000 n=34+35)
BM_Sort_uint64_Random_4 5.52ns ± 8% 5.22ns ± 6% -5.41% (p=0.000 n=32+32)
BM_Sort_uint64_Random_16 10.7ns ±15% 10.2ns ± 7% ~ (p=0.077 n=40+31)
BM_Sort_uint64_Random_64 19.0ns ±14% 18.2ns ±14% -4.31% (p=0.001 n=40+40)
BM_Sort_uint64_Random_256 25.7ns ± 9% 22.1ns ±15% -13.82% (p=0.000 n=33+40)
BM_Sort_uint64_Random_1024 32.4ns ± 6% 23.8ns ±16% -26.64% (p=0.000 n=33+40)
BM_Sort_uint64_Random_16384 46.8ns ± 3% 27.1ns ±16% -42.15% (p=0.000 n=33+40)
BM_Sort_uint64_Random_262144 61.3ns ± 4% 30.4ns ±16% -50.34% (p=0.000 n=34+40)
BM_Sort_uint64_Ascending_1 3.67ns ± 3% 3.87ns ±16% +5.36% (p=0.049 n=35+40)
BM_Sort_uint64_Ascending_4 1.46ns ± 3% 1.46ns ± 3% ~ (p=0.130 n=37+31)
BM_Sort_uint64_Ascending_16 1.09ns ± 3% 0.91ns ± 6% -16.79% (p=0.000 n=38+32)
BM_Sort_uint64_Ascending_64 1.25ns ± 3% 1.29ns ± 5% +3.11% (p=0.000 n=38+34)
BM_Sort_uint64_Ascending_256 1.37ns ± 3% 1.42ns ± 3% +3.07% (p=0.000 n=39+35)
BM_Sort_uint64_Ascending_1024 1.12ns ± 3% 1.17ns ± 3% +5.28% (p=0.000 n=37+36)
BM_Sort_uint64_Ascending_16384 0.98ns ± 3% 1.09ns ± 3% +10.95% (p=0.000 n=36+37)
BM_Sort_uint64_Ascending_262144 0.98ns ± 3% 1.08ns ± 3% +10.97% (p=0.000 n=36+37)
BM_Sort_uint64_Descending_1 3.68ns ± 3% 3.67ns ± 3% ~ (p=0.652 n=36+36)
BM_Sort_uint64_Descending_4 1.71ns ± 3% 1.73ns ± 3% +1.50% (p=0.000 n=33+34)
BM_Sort_uint64_Descending_16 4.96ns ± 2% 5.49ns ± 3% +10.73% (p=0.000 n=31+36)
BM_Sort_uint64_Descending_64 2.14ns ± 6% 3.03ns ± 3% +41.72% (p=0.000 n=32+35)
BM_Sort_uint64_Descending_256 2.03ns ± 4% 2.86ns ± 4% +40.55% (p=0.000 n=32+34)
BM_Sort_uint64_Descending_1024 2.20ns ± 2% 2.29ns ± 3% +4.20% (p=0.000 n=31+36)
BM_Sort_uint64_Descending_16384 1.89ns ± 3% 2.08ns ± 3% +10.00% (p=0.000 n=31+37)
BM_Sort_uint64_Descending_262144 1.92ns ± 3% 2.07ns ± 4% +7.95% (p=0.000 n=31+36)
BM_Sort_uint64_SingleElement_1 3.68ns ± 5% 3.67ns ± 3% ~ (p=0.716 n=31+37)
BM_Sort_uint64_SingleElement_4 1.46ns ± 3% 1.46ns ± 3% ~ (p=0.557 n=34+37)
BM_Sort_uint64_SingleElement_16 1.09ns ± 2% 0.91ns ± 3% -16.93% (p=0.000 n=33+36)
BM_Sort_uint64_SingleElement_64 0.83ns ± 4% 1.47ns ± 4% +78.03% (p=0.000 n=34+34)
BM_Sort_uint64_SingleElement_256 0.95ns ± 4% 1.28ns ± 4% +35.17% (p=0.000 n=35+35)
BM_Sort_uint64_SingleElement_1024 0.76ns ± 3% 1.05ns ± 3% +37.78% (p=0.000 n=35+33)
BM_Sort_uint64_SingleElement_16384 0.71ns ± 2% 0.98ns ± 5% +38.43% (p=0.000 n=34+33)
BM_Sort_uint64_SingleElement_262144 0.72ns ± 3% 0.98ns ± 4% +35.93% (p=0.000 n=35+33)
BM_Sort_uint64_PipeOrgan_1 3.68ns ± 3% 3.68ns ± 3% ~ (p=0.650 n=35+33)
BM_Sort_uint64_PipeOrgan_4 1.53ns ± 2% 1.54ns ± 4% ~ (p=0.424 n=33+36)
BM_Sort_uint64_PipeOrgan_16 2.23ns ± 3% 2.06ns ± 4% -7.68% (p=0.000 n=34+35)
BM_Sort_uint64_PipeOrgan_64 5.46ns ± 2% 3.41ns ± 4% -37.67% (p=0.000 n=33+36)
BM_Sort_uint64_PipeOrgan_256 2.92ns ± 4% 2.91ns ± 3% ~ (p=0.257 n=35+35)
BM_Sort_uint64_PipeOrgan_1024 3.72ns ± 3% 5.35ns ± 4% +43.95% (p=0.000 n=35+35)
BM_Sort_uint64_PipeOrgan_16384 4.12ns ± 3% 6.37ns ± 3% +54.74% (p=0.000 n=34+36)
BM_Sort_uint64_PipeOrgan_262144 4.99ns ± 3% 7.25ns ± 5% +45.45% (p=0.000 n=35+35)
BM_Sort_uint64_QuickSortAdversary_1 3.67ns ± 2% 3.65ns ± 3% ~ (p=0.071 n=35+37)
BM_Sort_uint64_QuickSortAdversary_4 1.46ns ± 3% 1.46ns ± 3% ~ (p=0.214 n=36+37)
BM_Sort_uint64_QuickSortAdversary_16 1.09ns ± 3% 0.91ns ± 3% -16.73% (p=0.000 n=36+38)
BM_Sort_uint64_QuickSortAdversary_64 13.7ns ± 3% 17.8ns ± 5% +29.86% (p=0.000 n=36+37)
BM_Sort_uint64_QuickSortAdversary_256 20.0ns ± 3% 25.9ns ± 3% +29.25% (p=0.000 n=35+38)
BM_Sort_uint64_QuickSortAdversary_1024 28.1ns ± 3% 31.0ns ± 4% +10.35% (p=0.000 n=33+37)
BM_Sort_uint64_QuickSortAdversary_16384 45.8ns ± 2% 50.5ns ± 4% +10.29% (p=0.000 n=36+37)
BM_Sort_uint64_QuickSortAdversary_262144 64.9ns ± 3% 69.5ns ± 3% +7.15% (p=0.000 n=36+36)
BM_Sort_pair<uint32, uint32>_Random_1 4.03ns ± 5% 4.33ns ± 4% +7.31% (p=0.000 n=36+36)
BM_Sort_pair<uint32, uint32>_Random_4 6.78ns ± 5% 6.71ns ± 4% -1.09% (p=0.040 n=35+35)
BM_Sort_pair<uint32, uint32>_Random_16 25.2ns ± 6% 16.8ns ± 7% -33.35% (p=0.000 n=35+35)
BM_Sort_pair<uint32, uint32>_Random_64 35.6ns ± 7% 27.2ns ± 8% -23.73% (p=0.000 n=34+36)
BM_Sort_pair<uint32, uint32>_Random_256 43.5ns ±13% 34.0ns ± 8% -21.78% (p=0.000 n=32+34)
BM_Sort_pair<uint32, uint32>_Random_1024 50.6ns ± 8% 40.8ns ± 5% -19.35% (p=0.000 n=32+32)
BM_Sort_pair<uint32, uint32>_Random_16384 66.0ns ± 3% 55.9ns ± 6% -15.24% (p=0.000 n=32+32)
BM_Sort_pair<uint32, uint32>_Random_262144 82.4ns ± 4% 72.0ns ± 5% -12.64% (p=0.000 n=32+31)
BM_Sort_pair<uint32, uint32>_Ascending_1 4.00ns ± 2% 4.50ns ±16% +12.59% (p=0.000 n=33+40)
BM_Sort_pair<uint32, uint32>_Ascending_4 2.22ns ± 3% 2.34ns ±16% +5.46% (p=0.041 n=33+40)
BM_Sort_pair<uint32, uint32>_Ascending_16 2.33ns ± 4% 1.30ns ±15% -44.33% (p=0.000 n=34+40)
BM_Sort_pair<uint32, uint32>_Ascending_64 1.39ns ± 4% 1.50ns ± 8% +8.48% (p=0.000 n=35+32)
BM_Sort_pair<uint32, uint32>_Ascending_256 1.47ns ± 4% 1.56ns ± 3% +5.96% (p=0.000 n=37+31)
BM_Sort_pair<uint32, uint32>_Ascending_1024 1.34ns ± 3% 1.35ns ± 4% +1.22% (p=0.000 n=38+31)
BM_Sort_pair<uint32, uint32>_Ascending_16384 1.18ns ± 2% 1.18ns ± 3% ~ (p=0.687 n=37+32)
BM_Sort_pair<uint32, uint32>_Ascending_262144 1.18ns ± 3% 1.17ns ± 2% ~ (p=0.153 n=38+34)
BM_Sort_pair<uint32, uint32>_Descending_1 4.00ns ± 2% 4.29ns ± 3% +7.22% (p=0.000 n=37+36)
BM_Sort_pair<uint32, uint32>_Descending_4 2.91ns ± 3% 2.92ns ± 3% ~ (p=0.065 n=37+35)
BM_Sort_pair<uint32, uint32>_Descending_16 4.96ns ± 4% 6.51ns ± 2% +31.36% (p=0.000 n=37+30)
BM_Sort_pair<uint32, uint32>_Descending_64 3.13ns ± 2% 2.92ns ± 3% -6.71% (p=0.000 n=36+37)
BM_Sort_pair<uint32, uint32>_Descending_256 2.56ns ± 3% 2.73ns ± 5% +6.55% (p=0.000 n=35+37)
BM_Sort_pair<uint32, uint32>_Descending_1024 3.11ns ± 3% 2.34ns ± 4% -24.85% (p=0.000 n=36+35)
BM_Sort_pair<uint32, uint32>_Descending_16384 2.84ns ± 3% 2.14ns ± 5% -24.48% (p=0.000 n=37+37)
BM_Sort_pair<uint32, uint32>_Descending_262144 2.86ns ± 3% 2.15ns ± 3% -25.08% (p=0.000 n=36+35)
BM_Sort_pair<uint32, uint32>_SingleElement_1 3.99ns ± 3% 4.28ns ± 3% +7.08% (p=0.000 n=33+35)
BM_Sort_pair<uint32, uint32>_SingleElement_4 2.32ns ± 6% 2.30ns ± 3% -0.77% (p=0.032 n=32+35)
BM_Sort_pair<uint32, uint32>_SingleElement_16 1.67ns ± 4% 1.27ns ± 4% -24.13% (p=0.000 n=32+35)
BM_Sort_pair<uint32, uint32>_SingleElement_64 1.64ns ± 7% 1.83ns ± 4% +11.54% (p=0.000 n=31+35)
BM_Sort_pair<uint32, uint32>_SingleElement_256 1.57ns ± 3% 1.90ns ± 3% +21.46% (p=0.000 n=31+36)
BM_Sort_pair<uint32, uint32>_SingleElement_1024 1.49ns ±15% 1.63ns ± 3% +9.42% (p=0.000 n=40+37)
BM_Sort_pair<uint32, uint32>_SingleElement_16384 1.29ns ±17% 1.57ns ± 3% +21.51% (p=0.000 n=33+36)
BM_Sort_pair<uint32, uint32>_SingleElement_262144 1.26ns ± 4% 1.56ns ± 4% +24.11% (p=0.000 n=33+36)
BM_Sort_pair<uint32, uint32>_PipeOrgan_1 4.01ns ± 2% 4.28ns ± 3% +6.68% (p=0.000 n=32+35)
BM_Sort_pair<uint32, uint32>_PipeOrgan_4 2.38ns ± 5% 2.42ns ± 4% +1.61% (p=0.000 n=34+35)
BM_Sort_pair<uint32, uint32>_PipeOrgan_16 4.83ns ± 2% 2.71ns ± 7% -43.96% (p=0.000 n=34+34)
BM_Sort_pair<uint32, uint32>_PipeOrgan_64 4.53ns ± 3% 3.89ns ± 7% -14.11% (p=0.000 n=35+33)
BM_Sort_pair<uint32, uint32>_PipeOrgan_256 5.53ns ± 4% 2.81ns ± 4% -49.13% (p=0.000 n=36+33)
BM_Sort_pair<uint32, uint32>_PipeOrgan_1024 6.49ns ± 4% 5.29ns ± 3% -18.50% (p=0.000 n=35+32)
BM_Sort_pair<uint32, uint32>_PipeOrgan_16384 7.21ns ± 4% 5.97ns ± 3% -17.24% (p=0.000 n=36+33)
BM_Sort_pair<uint32, uint32>_PipeOrgan_262144 7.98ns ± 5% 6.59ns ± 3% -17.46% (p=0.000 n=33+33)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_1 3.99ns ± 3% 4.27ns ± 3% +6.95% (p=0.000 n=36+34)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_4 2.40ns ± 3% 2.37ns ± 3% -1.00% (p=0.007 n=34+34)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_16 4.96ns ± 5% 2.72ns ± 7% -45.07% (p=0.000 n=35+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_64 7.24ns ± 4% 7.51ns ± 4% +3.63% (p=0.000 n=34+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_256 9.85ns ± 5% 7.12ns ± 4% -27.70% (p=0.000 n=34+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_1024 11.6ns ± 6% 8.8ns ± 5% -23.86% (p=0.000 n=35+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_16384 32.7ns ± 3% 20.8ns ± 4% -36.26% (p=0.000 n=35+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_262144 36.4ns ± 3% 24.0ns ± 4% -34.12% (p=0.000 n=34+36)
BM_Sort_tuple<uint32, uint64, uint32>_Random_1 4.04ns ± 6% 4.34ns ± 4% +7.55% (p=0.000 n=37+37)
BM_Sort_tuple<uint32, uint64, uint32>_Random_4 7.19ns ± 6% 7.26ns ± 5% +0.99% (p=0.042 n=36+38)
BM_Sort_tuple<uint32, uint64, uint32>_Random_16 30.4ns ± 6% 21.8ns ± 7% -28.28% (p=0.000 n=34+37)
BM_Sort_tuple<uint32, uint64, uint32>_Random_64 42.8ns ±11% 33.5ns ± 9% -21.70% (p=0.000 n=36+38)
BM_Sort_tuple<uint32, uint64, uint32>_Random_256 49.9ns ± 6% 40.3ns ± 9% -19.20% (p=0.000 n=35+38)
BM_Sort_tuple<uint32, uint64, uint32>_Random_1024 56.3ns ± 3% 46.1ns ± 4% -18.08% (p=0.000 n=35+35)
BM_Sort_tuple<uint32, uint64, uint32>_Random_16384 72.2ns ± 5% 62.1ns ± 3% -14.05% (p=0.000 n=37+36)
BM_Sort_tuple<uint32, uint64, uint32>_Random_262144 88.7ns ± 6% 79.0ns ± 6% -10.93% (p=0.000 n=36+36)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_1 3.96ns ± 3% 4.36ns ± 3% +9.96% (p=0.000 n=34+37)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_4 2.39ns ± 2% 2.39ns ± 3% ~ (p=0.604 n=36+37)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_16 3.04ns ± 4% 1.48ns ± 3% -51.20% (p=0.000 n=34+35)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_64 2.44ns ± 3% 2.30ns ± 5% -5.61% (p=0.000 n=36+35)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_256 2.35ns ± 3% 2.39ns ± 5% +1.78% (p=0.000 n=33+34)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_1024 2.12ns ± 5% 2.08ns ± 4% -1.80% (p=0.000 n=33+34)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_16384 2.02ns ± 3% 2.00ns ± 5% -1.25% (p=0.000 n=32+32)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_262144 2.06ns ± 5% 2.11ns ± 9% ~ (p=0.618 n=32+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_1 3.97ns ± 2% 4.57ns ±16% +15.19% (p=0.000 n=32+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_4 3.64ns ± 3% 4.05ns ±15% +11.05% (p=0.000 n=33+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_16 5.68ns ± 5% 9.36ns ±16% +64.92% (p=0.000 n=35+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_64 4.27ns ± 4% 3.88ns ± 8% -9.13% (p=0.000 n=35+32)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_256 3.58ns ± 3% 3.76ns ±14% +5.12% (p=0.002 n=38+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_1024 4.16ns ± 3% 3.21ns ± 5% -22.77% (p=0.000 n=38+31)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_16384 3.90ns ± 4% 3.00ns ± 3% -23.12% (p=0.000 n=38+32)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_262144 4.52ns ± 3% 3.42ns ± 3% -24.29% (p=0.000 n=38+33)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_1 3.97ns ± 3% 4.31ns ± 3% +8.78% (p=0.000 n=39+34)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_4 2.54ns ± 2% 2.54ns ± 4% ~ (p=0.341 n=38+36)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_16 2.39ns ± 3% 1.70ns ± 6% -28.90% (p=0.000 n=38+35)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_64 2.61ns ± 2% 3.23ns ± 3% +24.07% (p=0.000 n=35+35)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_256 2.83ns ± 2% 2.97ns ± 4% +4.83% (p=0.000 n=35+37)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_1024 2.44ns ± 4% 2.44ns ± 3% ~ (p=0.481 n=36+36)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_16384 2.19ns ± 3% 2.37ns ± 6% +8.01% (p=0.000 n=36+37)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_262144 2.34ns ± 2% 2.36ns ± 5% +1.11% (p=0.001 n=36+36)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_1 3.96ns ± 2% 4.31ns ± 3% +8.76% (p=0.000 n=33+35)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_4 2.65ns ± 6% 2.67ns ± 4% ~ (p=0.139 n=32+37)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_16 5.64ns ± 3% 3.56ns ± 3% -36.80% (p=0.000 n=31+35)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_64 6.12ns ±16% 5.04ns ± 4% -17.64% (p=0.000 n=40+37)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_256 6.78ns ± 6% 3.73ns ± 3% -44.94% (p=0.000 n=31+36)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_1024 8.36ns ±15% 6.51ns ± 4% -22.13% (p=0.000 n=40+37)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_16384 9.24ns ±15% 7.91ns ± 3% -14.34% (p=0.000 n=40+37)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_262144 10.7ns ± 3% 9.3ns ± 6% -12.36% (p=0.000 n=32+36)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_1 3.97ns ± 3% 4.31ns ± 3% +8.63% (p=0.000 n=32+35)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_4 2.79ns ± 3% 2.76ns ± 4% -0.95% (p=0.002 n=33+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_16 5.07ns ± 3% 3.69ns ± 4% -27.35% (p=0.000 n=35+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_64 9.26ns ± 3% 8.34ns ± 7% -9.88% (p=0.000 n=35+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_256 11.8ns ± 5% 9.7ns ± 3% -17.83% (p=0.000 n=37+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_1024 19.2ns ± 4% 14.5ns ±10% -24.59% (p=0.000 n=36+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_16384 45.5ns ± 4% 37.4ns ± 9% -17.71% (p=0.000 n=35+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_262144 50.0ns ± 4% 43.2ns ± 3% -13.69% (p=0.000 n=35+34)
BM_Sort_string_Random_1 4.66ns ± 6% 4.40ns ± 4% -5.55% (p=0.000 n=35+37)
BM_Sort_string_Random_4 14.9ns ± 3% 15.0ns ± 6% ~ (p=0.863 n=36+38)
BM_Sort_string_Random_16 45.5ns ± 6% 35.8ns ± 8% -21.37% (p=0.000 n=36+36)
BM_Sort_string_Random_64 66.6ns ± 4% 58.2ns ± 3% -12.69% (p=0.000 n=36+37)
BM_Sort_string_Random_256 86.0ns ± 5% 77.4ns ± 3% -10.01% (p=0.000 n=37+37)
BM_Sort_string_Random_1024 106ns ± 3% 96ns ± 6% -9.39% (p=0.000 n=37+37)
BM_Sort_string_Random_16384 154ns ± 3% 141ns ± 5% -8.03% (p=0.000 n=35+36)
BM_Sort_string_Random_262144 213ns ± 4% 197ns ± 4% -7.59% (p=0.000 n=34+34)
BM_Sort_string_Ascending_1 4.59ns ± 2% 4.56ns ±17% -0.60% (p=0.002 n=32+40)
BM_Sort_string_Ascending_4 7.52ns ± 9% 7.54ns ±12% ~ (p=0.554 n=37+40)
BM_Sort_string_Ascending_16 13.1ns ± 6% 8.8ns ±12% -33.26% (p=0.000 n=39+38)
BM_Sort_string_Ascending_64 14.8ns ±10% 14.5ns ±11% -2.15% (p=0.013 n=40+37)
BM_Sort_string_Ascending_256 14.0ns ± 6% 14.1ns ±10% ~ (p=0.760 n=37+40)
BM_Sort_string_Ascending_1024 12.9ns ±10% 12.8ns ±20% ~ (p=0.055 n=35+40)
BM_Sort_string_Ascending_16384 17.2ns ±13% 17.4ns ±21% ~ (p=1.000 n=37+40)
BM_Sort_string_Ascending_262144 17.5ns ±12% 17.5ns ±25% ~ (p=0.392 n=35+39)
BM_Sort_string_Descending_1 4.59ns ± 3% 4.34ns ± 3% -5.51% (p=0.000 n=32+33)
BM_Sort_string_Descending_4 10.1ns ± 5% 9.8ns ± 4% -2.84% (p=0.000 n=36+34)
BM_Sort_string_Descending_16 22.0ns ± 4% 39.6ns ± 4% +79.84% (p=0.000 n=36+33)
BM_Sort_string_Descending_64 21.4ns ±12% 21.3ns ±14% ~ (p=0.542 n=37+39)
BM_Sort_string_Descending_256 19.4ns ±13% 18.9ns ±13% -2.74% (p=0.039 n=37+39)
BM_Sort_string_Descending_1024 22.7ns ± 5% 17.6ns ±15% -22.52% (p=0.000 n=35+40)
BM_Sort_string_Descending_16384 27.9ns ±14% 22.6ns ±10% -19.11% (p=0.000 n=40+37)
BM_Sort_string_Descending_262144 33.8ns ±14% 26.1ns ±21% -22.74% (p=0.000 n=39+38)
BM_Sort_string_SingleElement_1 4.58ns ± 2% 4.35ns ± 3% -5.14% (p=0.000 n=35+37)
BM_Sort_string_SingleElement_4 7.92ns ± 3% 7.92ns ± 7% ~ (p=0.625 n=38+39)
BM_Sort_string_SingleElement_16 18.0ns ± 3% 7.9ns ± 6% -56.23% (p=0.000 n=36+35)
BM_Sort_string_SingleElement_64 20.3ns ± 5% 19.3ns ±15% -4.83% (p=0.000 n=34+38)
BM_Sort_string_SingleElement_256 19.4ns ± 7% 18.1ns ±14% -6.67% (p=0.000 n=36+39)
BM_Sort_string_SingleElement_1024 19.3ns ± 9% 17.4ns ±17% -9.40% (p=0.000 n=35+40)
BM_Sort_string_SingleElement_16384 17.5ns ±12% 16.2ns ±20% -7.91% (p=0.000 n=37+40)
BM_Sort_string_SingleElement_262144 16.7ns ±18% 15.3ns ±27% -8.56% (p=0.000 n=40+40)
BM_Sort_string_PipeOrgan_1 4.60ns ± 2% 4.33ns ± 3% -5.80% (p=0.000 n=33+31)
BM_Sort_string_PipeOrgan_4 8.29ns ± 4% 8.17ns ± 8% -1.50% (p=0.004 n=39+36)
BM_Sort_string_PipeOrgan_16 22.9ns ± 3% 16.4ns ± 6% -28.45% (p=0.000 n=39+38)
BM_Sort_string_PipeOrgan_64 30.7ns ± 4% 28.9ns ± 7% -6.05% (p=0.000 n=38+37)
BM_Sort_string_PipeOrgan_256 38.1ns ± 3% 22.5ns ± 9% -40.78% (p=0.000 n=37+37)
BM_Sort_string_PipeOrgan_1024 45.4ns ± 4% 36.2ns ± 6% -20.33% (p=0.000 n=37+37)
BM_Sort_string_PipeOrgan_16384 56.2ns ± 4% 49.0ns ± 8% -12.73% (p=0.000 n=36+38)
BM_Sort_string_PipeOrgan_262144 77.8ns ±13% 62.8ns ±10% -19.27% (p=0.000 n=39+39)
BM_Sort_string_QuickSortAdversary_1 4.80ns ±16% 4.34ns ± 4% -9.56% (p=0.000 n=39+34)
BM_Sort_string_QuickSortAdversary_4 14.8ns ± 5% 14.7ns ± 4% -0.80% (p=0.037 n=33+33)
BM_Sort_string_QuickSortAdversary_16 44.6ns ± 4% 34.8ns ± 5% -21.98% (p=0.000 n=35+34)
BM_Sort_string_QuickSortAdversary_64 66.2ns ± 3% 58.1ns ± 4% -12.32% (p=0.000 n=36+35)
BM_Sort_string_QuickSortAdversary_256 85.4ns ± 5% 76.9ns ± 6% -9.99% (p=0.000 n=36+36)
BM_Sort_string_QuickSortAdversary_1024 106ns ± 4% 96ns ± 3% -9.62% (p=0.000 n=34+37)
BM_Sort_string_QuickSortAdversary_16384 153ns ± 3% 141ns ± 4% -8.22% (p=0.000 n=34+37)
BM_Sort_string_QuickSortAdversary_262144 211ns ± 5% 195ns ± 6% -7.77% (p=0.000 n=35+38)
Differential Revision: https://reviews.llvm.org/D122780
Added:
Modified:
libcxx/include/__algorithm/sort.h
libcxx/include/__bits
Removed:
################################################################################
diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
index 81f6eeb238b6b..6242f413a0900 100644
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -11,10 +11,12 @@
#include <__algorithm/comp.h>
#include <__algorithm/comp_ref_type.h>
+#include <__algorithm/iter_swap.h>
#include <__algorithm/iterator_operations.h>
#include <__algorithm/min_element.h>
#include <__algorithm/partial_sort.h>
#include <__algorithm/unwrap_iter.h>
+#include <__assert>
#include <__bits>
#include <__config>
#include <__debug>
@@ -25,6 +27,7 @@
#include <__memory/destruct_n.h>
#include <__memory/unique_ptr.h>
#include <__utility/move.h>
+#include <__utility/pair.h>
#include <climits>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -124,8 +127,7 @@ template <class _AlgPolicy, class _Compare, class _ForwardIterator>
_LIBCPP_HIDE_FROM_ABI
unsigned __sort4(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3, _ForwardIterator __x4,
_Compare __c) {
- using _Ops = _IterOps<_AlgPolicy>;
-
+ using _Ops = _IterOps<_AlgPolicy>;
unsigned __r = std::__sort3<_AlgPolicy, _Compare>(__x1, __x2, __x3, __c);
if (__c(*__x4, *__x3)) {
_Ops::iter_swap(__x3, __x4);
@@ -180,7 +182,7 @@ _LIBCPP_HIDE_FROM_ABI unsigned __sort5_wrap_policy(
_Compare __c) {
using _WrappedComp = typename _WrapAlgPolicy<_AlgPolicy, _Compare>::type;
_WrappedComp __wrapped_comp(__c);
- return std::__sort5<_WrappedComp>(
+ return std::__sort5<_WrappedComp, _ForwardIterator>(
std::move(__x1), std::move(__x2), std::move(__x3), std::move(__x4), std::move(__x5), __wrapped_comp);
}
@@ -205,6 +207,13 @@ using __use_branchless_sort =
integral_constant<bool, __is_cpp17_contiguous_iterator<_Iter>::value && sizeof(_Tp) <= sizeof(void*) &&
is_arithmetic<_Tp>::value && __is_simple_comparator<_Compare>::value>;
+namespace __detail {
+
+// Size in bits for the bitset in use.
+enum { __block_size = sizeof(uint64_t) * 8 };
+
+} // namespace __detail
+
// Ensures that __c(*__x, *__y) is true by swapping *__x and *__y if necessary.
template <class _Compare, class _RandomAccessIterator>
inline _LIBCPP_HIDE_FROM_ABI void __cond_swap(_RandomAccessIterator __x, _RandomAccessIterator __y, _Compare __c) {
@@ -264,10 +273,15 @@ __sort4_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2,
std::__sort4<_AlgPolicy, _Compare>(__x1, __x2, __x3, __x4, __c);
}
-template <class, class _Compare, class _RandomAccessIterator>
+template <class _AlgPolicy, class _Compare, class _RandomAccessIterator>
inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
-__sort5_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
- _RandomAccessIterator __x4, _RandomAccessIterator __x5, _Compare __c) {
+__sort5_maybe_branchless(
+ _RandomAccessIterator __x1,
+ _RandomAccessIterator __x2,
+ _RandomAccessIterator __x3,
+ _RandomAccessIterator __x4,
+ _RandomAccessIterator __x5,
+ _Compare __c) {
std::__cond_swap<_Compare>(__x1, __x2, __c);
std::__cond_swap<_Compare>(__x4, __x5, __c);
std::__partially_sorted_swap<_Compare>(__x3, __x4, __x5, __c);
@@ -296,34 +310,47 @@ _LIBCPP_CONSTEXPR_SINCE_CXX14 void __selection_sort(_BidirectionalIterator __fir
}
}
+// Sort the iterator range [__first, __last) using the comparator __comp using
+// the insertion sort algorithm.
template <class _AlgPolicy, class _Compare, class _BidirectionalIterator>
_LIBCPP_HIDE_FROM_ABI
void __insertion_sort(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp) {
using _Ops = _IterOps<_AlgPolicy>;
typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
- if (__first != __last) {
- _BidirectionalIterator __i = __first;
- for (++__i; __i != __last; ++__i) {
- _BidirectionalIterator __j = __i;
- value_type __t(_Ops::__iter_move(__j));
- for (_BidirectionalIterator __k = __i; __k != __first && __comp(__t, *--__k); --__j)
+ if (__first == __last)
+ return;
+ _BidirectionalIterator __i = __first;
+ for (++__i; __i != __last; ++__i) {
+ _BidirectionalIterator __j = __i;
+ --__j;
+ if (__comp(*__i, *__j)) {
+ value_type __t(_Ops::__iter_move(__i));
+ _BidirectionalIterator __k = __j;
+ __j = __i;
+ do {
*__j = _Ops::__iter_move(__k);
+ __j = __k;
+ } while (__j != __first && __comp(__t, *--__k));
*__j = std::move(__t);
}
}
}
+// Sort the iterator range [__first, __last) using the comparator __comp using
+// the insertion sort algorithm. Insertion sort has two loops, outer and inner.
+// The implementation below has not bounds check (unguarded) for the inner loop.
+// Assumes that there is an element in the position (__first - 1) and that each
+// element in the input range is greater or equal to the element at __first - 1.
template <class _AlgPolicy, class _Compare, class _RandomAccessIterator>
-_LIBCPP_HIDE_FROM_ABI
-void __insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+void __insertion_sort_unguarded(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
using _Ops = _IterOps<_AlgPolicy>;
-
typedef typename iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
- _RandomAccessIterator __j = __first +
diff erence_type(2);
- std::__sort3_maybe_branchless<_AlgPolicy, _Compare>(__first, __first +
diff erence_type(1), __j, __comp);
- for (_RandomAccessIterator __i = __j +
diff erence_type(1); __i != __last; ++__i) {
+ if (__first == __last)
+ return;
+ for (_RandomAccessIterator __i = __first +
diff erence_type(1); __i != __last; ++__i) {
+ _RandomAccessIterator __j = __i -
diff erence_type(1);
if (__comp(*__i, *__j)) {
value_type __t(_Ops::__iter_move(__i));
_RandomAccessIterator __k = __j;
@@ -331,10 +358,9 @@ void __insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __l
do {
*__j = _Ops::__iter_move(__k);
__j = __k;
- } while (__j != __first && __comp(__t, *--__k));
+ } while (__comp(__t, *--__k)); // No need for bounds check due to the assumption stated above.
*__j = std::move(__t);
}
- __j = __i;
}
}
@@ -355,7 +381,7 @@ _LIBCPP_HIDDEN bool __insertion_sort_incomplete(
return true;
case 2:
if (__comp(*--__last, *__first))
- _IterOps<_AlgPolicy>::iter_swap(__first, __last);
+ _Ops::iter_swap(__first, __last);
return true;
case 3:
std::__sort3_maybe_branchless<_AlgPolicy, _Compare>(__first, __first +
diff erence_type(1), --__last, __comp);
@@ -424,17 +450,336 @@ void __insertion_sort_move(_BidirectionalIterator __first1, _BidirectionalIterat
}
}
-template <class _AlgPolicy, class _Compare, class _RandomAccessIterator>
-void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
- typename iterator_traits<_RandomAccessIterator>::
diff erence_type __depth) {
+template <class _AlgPolicy, class _RandomAccessIterator>
+inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos(
+ _RandomAccessIterator __first, _RandomAccessIterator __last, uint64_t& __left_bitset, uint64_t& __right_bitset) {
+ using _Ops = _IterOps<_AlgPolicy>;
+ typedef typename std::iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
+ // Swap one pair on each iteration as long as both bitsets have at least one
+ // element for swapping.
+ while (__left_bitset != 0 && __right_bitset != 0) {
+
diff erence_type tz_left = __libcpp_ctz(__left_bitset);
+ __left_bitset = __libcpp_blsr(__left_bitset);
+
diff erence_type tz_right = __libcpp_ctz(__right_bitset);
+ __right_bitset = __libcpp_blsr(__right_bitset);
+ _Ops::iter_swap(__first + tz_left, __last - tz_right);
+ }
+}
+
+template <class _Compare,
+ class _RandomAccessIterator,
+ class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void
+__populate_left_bitset(_RandomAccessIterator __first, _Compare __comp, _ValueType& __pivot, uint64_t& __left_bitset) {
+ // Possible vectorization. With a proper "-march" flag, the following loop
+ // will be compiled into a set of SIMD instructions.
+ _RandomAccessIterator __iter = __first;
+ for (int __j = 0; __j < __detail::__block_size;) {
+ bool __comp_result = !__comp(*__iter, __pivot);
+ __left_bitset |= (static_cast<uint64_t>(__comp_result) << __j);
+ __j++;
+ ++__iter;
+ }
+}
+
+template <class _Compare,
+ class _RandomAccessIterator,
+ class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void
+__populate_right_bitset(_RandomAccessIterator __lm1, _Compare __comp, _ValueType& __pivot, uint64_t& __right_bitset) {
+ // Possible vectorization. With a proper "-march" flag, the following loop
+ // will be compiled into a set of SIMD instructions.
+ _RandomAccessIterator __iter = __lm1;
+ for (int __j = 0; __j < __detail::__block_size;) {
+ bool __comp_result = __comp(*__iter, __pivot);
+ __right_bitset |= (static_cast<uint64_t>(__comp_result) << __j);
+ __j++;
+ --__iter;
+ }
+}
+
+template <class _AlgPolicy,
+ class _Compare,
+ class _RandomAccessIterator,
+ class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void __bitset_partition_partial_blocks(
+ _RandomAccessIterator& __first,
+ _RandomAccessIterator& __lm1,
+ _Compare __comp,
+ _ValueType& __pivot,
+ uint64_t& __left_bitset,
+ uint64_t& __right_bitset) {
+ typedef typename std::iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
+
diff erence_type __remaining_len = __lm1 - __first + 1;
+
diff erence_type __l_size;
+
diff erence_type __r_size;
+ if (__left_bitset == 0 && __right_bitset == 0) {
+ __l_size = __remaining_len / 2;
+ __r_size = __remaining_len - __l_size;
+ } else if (__left_bitset == 0) {
+ // We know at least one side is a full block.
+ __l_size = __remaining_len - __detail::__block_size;
+ __r_size = __detail::__block_size;
+ } else { // if (__right_bitset == 0)
+ __l_size = __detail::__block_size;
+ __r_size = __remaining_len - __detail::__block_size;
+ }
+ // Record the comparison outcomes for the elements currently on the left side.
+ if (__left_bitset == 0) {
+ _RandomAccessIterator __iter = __first;
+ for (int j = 0; j < __l_size; j++) {
+ bool __comp_result = !__comp(*__iter, __pivot);
+ __left_bitset |= (static_cast<uint64_t>(__comp_result) << j);
+ ++__iter;
+ }
+ }
+ // Record the comparison outcomes for the elements currently on the right
+ // side.
+ if (__right_bitset == 0) {
+ _RandomAccessIterator __iter = __lm1;
+ for (int j = 0; j < __r_size; j++) {
+ bool __comp_result = __comp(*__iter, __pivot);
+ __right_bitset |= (static_cast<uint64_t>(__comp_result) << j);
+ --__iter;
+ }
+ }
+ __swap_bitmap_pos<_AlgPolicy, _RandomAccessIterator>(__first, __lm1, __left_bitset, __right_bitset);
+ __first += (__left_bitset == 0) ? __l_size : 0;
+ __lm1 -= (__right_bitset == 0) ? __r_size : 0;
+}
+
+template <class _AlgPolicy, class _RandomAccessIterator>
+inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos_within(
+ _RandomAccessIterator& __first, _RandomAccessIterator& __lm1, uint64_t& __left_bitset, uint64_t& __right_bitset) {
+ using _Ops = _IterOps<_AlgPolicy>;
+ typedef typename std::iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
+ if (__left_bitset) {
+ // Swap within the left side. Need to find set positions in the reverse
+ // order.
+ while (__left_bitset != 0) {
+
diff erence_type __tz_left = __detail::__block_size - 1 - __libcpp_clz(__left_bitset);
+ __left_bitset &= (static_cast<uint64_t>(1) << __tz_left) - 1;
+ _RandomAccessIterator it = __first + __tz_left;
+ if (it != __lm1) {
+ _Ops::iter_swap(it, __lm1);
+ }
+ --__lm1;
+ }
+ __first = __lm1 +
diff erence_type(1);
+ } else if (__right_bitset) {
+ // Swap within the right side. Need to find set positions in the reverse
+ // order.
+ while (__right_bitset != 0) {
+
diff erence_type __tz_right = __detail::__block_size - 1 - __libcpp_clz(__right_bitset);
+ __right_bitset &= (static_cast<uint64_t>(1) << __tz_right) - 1;
+ _RandomAccessIterator it = __lm1 - __tz_right;
+ if (it != __first) {
+ _Ops::iter_swap(it, __first);
+ }
+ ++__first;
+ }
+ }
+}
+
+// Partition [__first, __last) using the comparator __comp. *__first has the
+// chosen pivot. Elements that are equivalent are kept to the left of the
+// pivot. Returns the iterator for the pivot and a bool value which is true if
+// the provided range is already sorted, false otherwise. We assume that the
+// length of the range is at least three elements.
+//
+// __bitset_partition uses bitsets for storing outcomes of the comparisons
+// between the pivot and other elements.
+template <class _AlgPolicy, class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI std::pair<_RandomAccessIterator, bool>
+__bitset_partition(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+ using _Ops = _IterOps<_AlgPolicy>;
+ typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+ typedef typename std::iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
+ _LIBCPP_ASSERT(__last - __first >=
diff erence_type(3), "");
+
+ _RandomAccessIterator __begin = __first;
+ value_type __pivot(_Ops::__iter_move(__first));
+ // Find the first element greater than the pivot.
+ if (__comp(__pivot, *(__last -
diff erence_type(1)))) {
+ // Not guarded since we know the last element is greater than the pivot.
+ while (!__comp(__pivot, *++__first)) {
+ }
+ } else {
+ while (++__first < __last && !__comp(__pivot, *__first)) {
+ }
+ }
+ // Find the last element less than or equal to the pivot.
+ if (__first < __last) {
+ // It will be always guarded because __introsort will do the median-of-three
+ // before calling this.
+ while (__comp(__pivot, *--__last)) {
+ }
+ }
+ // If the first element greater than the pivot is at or after the
+ // last element less than or equal to the pivot, then we have covered the
+ // entire range without swapping elements. This implies the range is already
+ // partitioned.
+ bool __already_partitioned = __first >= __last;
+ if (!__already_partitioned) {
+ _Ops::iter_swap(__first, __last);
+ ++__first;
+ }
+
+ // In [__first, __last) __last is not inclusive. From now on, it uses last
+ // minus one to be inclusive on both sides.
+ _RandomAccessIterator __lm1 = __last -
diff erence_type(1);
+ uint64_t __left_bitset = 0;
+ uint64_t __right_bitset = 0;
+
+ // Reminder: length = __lm1 - __first + 1.
+ while (__lm1 - __first >= 2 * __detail::__block_size - 1) {
+ // Record the comparison outcomes for the elements currently on the left
+ // side.
+ if (__left_bitset == 0)
+ __populate_left_bitset<_Compare>(__first, __comp, __pivot, __left_bitset);
+ // Record the comparison outcomes for the elements currently on the right
+ // side.
+ if (__right_bitset == 0)
+ __populate_right_bitset<_Compare>(__lm1, __comp, __pivot, __right_bitset);
+ // Swap the elements recorded to be the candidates for swapping in the
+ // bitsets.
+ __swap_bitmap_pos<_AlgPolicy, _RandomAccessIterator>(__first, __lm1, __left_bitset, __right_bitset);
+ // Only advance the iterator if all the elements that need to be moved to
+ // other side were moved.
+ __first += (__left_bitset == 0) ?
diff erence_type(__detail::__block_size) :
diff erence_type(0);
+ __lm1 -= (__right_bitset == 0) ?
diff erence_type(__detail::__block_size) :
diff erence_type(0);
+ }
+ // Now, we have a less-than a block worth of elements on at least one of the
+ // sides.
+ __bitset_partition_partial_blocks<_AlgPolicy, _Compare>(
+ __first, __lm1, __comp, __pivot, __left_bitset, __right_bitset);
+ // At least one the bitsets would be empty. For the non-empty one, we need to
+ // properly partition the elements that appear within that bitset.
+ __swap_bitmap_pos_within<_AlgPolicy>(__first, __lm1, __left_bitset, __right_bitset);
+
+ // Move the pivot to its correct position.
+ _RandomAccessIterator __pivot_pos = __first -
diff erence_type(1);
+ if (__begin != __pivot_pos) {
+ *__begin = _Ops::__iter_move(__pivot_pos);
+ }
+ *__pivot_pos = std::move(__pivot);
+ return std::make_pair(__pivot_pos, __already_partitioned);
+}
+
+// Partition [__first, __last) using the comparator __comp. *__first has the
+// chosen pivot. Elements that are equivalent are kept to the right of the
+// pivot. Returns the iterator for the pivot and a bool value which is true if
+// the provided range is already sorted, false otherwise. We assume that the
+// length of the range is at least three elements.
+template <class _AlgPolicy, class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI std::pair<_RandomAccessIterator, bool>
+__partition_with_equals_on_right(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
using _Ops = _IterOps<_AlgPolicy>;
+ typedef typename iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
+ typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+ _LIBCPP_ASSERT(__last - __first >=
diff erence_type(3), "");
+ _RandomAccessIterator __begin = __first;
+ value_type __pivot(_Ops::__iter_move(__first));
+ // Find the first element greater or equal to the pivot. It will be always
+ // guarded because __introsort will do the median-of-three before calling
+ // this.
+ while (__comp(*++__first, __pivot))
+ ;
+
+ // Find the last element less than the pivot.
+ if (__begin == __first -
diff erence_type(1)) {
+ while (__first < __last && !__comp(*--__last, __pivot))
+ ;
+ } else {
+ // Guarded.
+ while (!__comp(*--__last, __pivot))
+ ;
+ }
+
+ // If the first element greater than or equal to the pivot is at or after the
+ // last element less than the pivot, then we have covered the entire range
+ // without swapping elements. This implies the range is already partitioned.
+ bool __already_partitioned = __first >= __last;
+ // Go through the remaining elements. Swap pairs of elements (one to the
+ // right of the pivot and the other to left of the pivot) that are not on the
+ // correct side of the pivot.
+ while (__first < __last) {
+ _Ops::iter_swap(__first, __last);
+ while (__comp(*++__first, __pivot))
+ ;
+ while (!__comp(*--__last, __pivot))
+ ;
+ }
+ // Move the pivot to its correct position.
+ _RandomAccessIterator __pivot_pos = __first -
diff erence_type(1);
+ if (__begin != __pivot_pos) {
+ *__begin = _Ops::__iter_move(__pivot_pos);
+ }
+ *__pivot_pos = std::move(__pivot);
+ return std::make_pair(__pivot_pos, __already_partitioned);
+}
+// Similar to the above function. Elements equivalent to the pivot are put to
+// the left of the pivot. Returns the iterator to the pivot element.
+template <class _AlgPolicy, class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI _RandomAccessIterator
+__partition_with_equals_on_left(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+ using _Ops = _IterOps<_AlgPolicy>;
typedef typename iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
- typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
- const
diff erence_type __limit =
- is_trivially_copy_constructible<value_type>::value && is_trivially_copy_assignable<value_type>::value ? 30 : 6;
+ typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+ _RandomAccessIterator __begin = __first;
+ value_type __pivot(_Ops::__iter_move(__first));
+ if (__comp(__pivot, *(__last -
diff erence_type(1)))) {
+ // Guarded.
+ while (!__comp(__pivot, *++__first)) {
+ }
+ } else {
+ while (++__first < __last && !__comp(__pivot, *__first)) {
+ }
+ }
+
+ if (__first < __last) {
+ // It will be always guarded because __introsort will do the
+ // median-of-three before calling this.
+ while (__comp(__pivot, *--__last)) {
+ }
+ }
+ while (__first < __last) {
+ _Ops::iter_swap(__first, __last);
+ while (!__comp(__pivot, *++__first))
+ ;
+ while (__comp(__pivot, *--__last))
+ ;
+ }
+ _RandomAccessIterator __pivot_pos = __first -
diff erence_type(1);
+ if (__begin != __pivot_pos) {
+ *__begin = _Ops::__iter_move(__pivot_pos);
+ }
+ *__pivot_pos = std::move(__pivot);
+ return __first;
+}
+
+// The main sorting function. Implements introsort combined with other ideas:
+// - option of using block quick sort for partitioning,
+// - guarded and unguarded insertion sort for small lengths,
+// - Tuckey's ninther technique for computing the pivot,
+// - check on whether partition was not required.
+// The implementation is partly based on Orson Peters' pattern-defeating
+// quicksort, published at: <https://github.com/orlp/pdqsort>.
+template <class _AlgPolicy, class _Compare, class _RandomAccessIterator, bool _UseBitSetPartition>
+void __introsort(_RandomAccessIterator __first,
+ _RandomAccessIterator __last,
+ _Compare __comp,
+ typename iterator_traits<_RandomAccessIterator>::
diff erence_type __depth,
+ bool __leftmost = true) {
+ using _Ops = _IterOps<_AlgPolicy>;
+ typedef typename iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
+ using _Comp_ref = __comp_ref_type<_Compare>;
+ // Upper bound for using insertion sort for sorting.
+ _LIBCPP_CONSTEXPR
diff erence_type __limit = 24;
+ // Lower bound for using Tuckey's ninther technique for median computation.
+ _LIBCPP_CONSTEXPR
diff erence_type __ninther_threshold = 128;
while (true) {
- __restart:
diff erence_type __len = __last - __first;
switch (__len) {
case 0:
@@ -442,7 +787,7 @@ void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _C
return;
case 2:
if (__comp(*--__last, *__first))
- _IterOps<_AlgPolicy>::iter_swap(__first, __last);
+ _Ops::iter_swap(__first, __last);
return;
case 3:
std::__sort3_maybe_branchless<_AlgPolicy, _Compare>(__first, __first +
diff erence_type(1), --__last, __comp);
@@ -457,127 +802,60 @@ void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _C
--__last, __comp);
return;
}
- if (__len <= __limit) {
- std::__insertion_sort_3<_AlgPolicy, _Compare>(__first, __last, __comp);
+ // Use insertion sort if the length of the range is below the specified limit.
+ if (__len < __limit) {
+ if (__leftmost) {
+ std::__insertion_sort<_AlgPolicy, _Compare>(__first, __last, __comp);
+ } else {
+ std::__insertion_sort_unguarded<_AlgPolicy, _Compare>(__first, __last, __comp);
+ }
return;
}
- // __len > 5
if (__depth == 0) {
// Fallback to heap sort as Introsort suggests.
std::__partial_sort<_AlgPolicy, _Compare>(__first, __last, __last, __comp);
return;
}
--__depth;
- _RandomAccessIterator __m = __first;
- _RandomAccessIterator __lm1 = __last;
- --__lm1;
- unsigned __n_swaps;
{
-
diff erence_type __delta;
- if (__len >= 1000) {
- __delta = __len / 2;
- __m += __delta;
- __delta /= 2;
- __n_swaps = std::__sort5_wrap_policy<_AlgPolicy, _Compare>(
- __first, __first + __delta, __m, __m + __delta, __lm1, __comp);
+
diff erence_type __half_len = __len / 2;
+ // Use Tuckey's ninther technique or median of 3 for pivot selection
+ // depending on the length of the range being sorted.
+ if (__len > __ninther_threshold) {
+ std::__sort3<_AlgPolicy, _Compare>(__first, __first + __half_len, __last -
diff erence_type(1), __comp);
+ std::__sort3<_AlgPolicy, _Compare>(
+ __first +
diff erence_type(1), __first + (__half_len - 1), __last -
diff erence_type(2), __comp);
+ std::__sort3<_AlgPolicy, _Compare>(
+ __first +
diff erence_type(2), __first + (__half_len + 1), __last -
diff erence_type(3), __comp);
+ std::__sort3<_AlgPolicy, _Compare>(
+ __first + (__half_len - 1), __first + __half_len, __first + (__half_len + 1), __comp);
+ _Ops::iter_swap(__first, __first + __half_len);
} else {
- __delta = __len / 2;
- __m += __delta;
- __n_swaps = std::__sort3<_AlgPolicy, _Compare>(__first, __m, __lm1, __comp);
+ std::__sort3<_AlgPolicy, _Compare>(__first + __half_len, __first, __last -
diff erence_type(1), __comp);
}
}
- // *__m is median
- // partition [__first, __m) < *__m and *__m <= [__m, __last)
- // (this inhibits tossing elements equivalent to __m around unnecessarily)
- _RandomAccessIterator __i = __first;
- _RandomAccessIterator __j = __lm1;
- // j points beyond range to be tested, *__m is known to be <= *__lm1
- // The search going up is known to be guarded but the search coming down isn't.
- // Prime the downward search with a guard.
- if (!__comp(*__i, *__m)) // if *__first == *__m
- {
- // *__first == *__m, *__first doesn't go in first part
- // manually guard downward moving __j against __i
- while (true) {
- if (__i == --__j) {
- // *__first == *__m, *__m <= all other elements
- // Parition instead into [__first, __i) == *__first and *__first < [__i, __last)
- ++__i; // __first + 1
- __j = __last;
- if (!__comp(*__first, *--__j)) // we need a guard if *__first == *(__last-1)
- {
- while (true) {
- if (__i == __j)
- return; // [__first, __last) all equivalent elements
- if (__comp(*__first, *__i)) {
- _Ops::iter_swap(__i, __j);
- ++__n_swaps;
- ++__i;
- break;
- }
- ++__i;
- }
- }
- // [__first, __i) == *__first and *__first < [__j, __last) and __j == __last - 1
- if (__i == __j)
- return;
- while (true) {
- while (!__comp(*__first, *__i))
- ++__i;
- while (__comp(*__first, *--__j))
- ;
- if (__i >= __j)
- break;
- _Ops::iter_swap(__i, __j);
- ++__n_swaps;
- ++__i;
- }
- // [__first, __i) == *__first and *__first < [__i, __last)
- // The first part is sorted, sort the second part
- // std::__sort<_Compare>(__i, __last, __comp);
- __first = __i;
- goto __restart;
- }
- if (__comp(*__j, *__m)) {
- _Ops::iter_swap(__i, __j);
- ++__n_swaps;
- break; // found guard for downward moving __j, now use unguarded partition
- }
- }
- }
- // It is known that *__i < *__m
- ++__i;
- // j points beyond range to be tested, *__m is known to be <= *__lm1
- // if not yet partitioned...
- if (__i < __j) {
- // known that *(__i - 1) < *__m
- // known that __i <= __m
- while (true) {
- // __m still guards upward moving __i
- while (__comp(*__i, *__m))
- ++__i;
- // It is now known that a guard exists for downward moving __j
- while (!__comp(*--__j, *__m))
- ;
- if (__i > __j)
- break;
- _Ops::iter_swap(__i, __j);
- ++__n_swaps;
- // It is known that __m != __j
- // If __m just moved, follow it
- if (__m == __i)
- __m = __j;
- ++__i;
- }
- }
- // [__first, __i) < *__m and *__m <= [__i, __last)
- if (__i != __m && __comp(*__m, *__i)) {
- _Ops::iter_swap(__i, __m);
- ++__n_swaps;
+ // The elements to the left of the current iterator range are already
+ // sorted. If the current iterator range to be sorted is not the
+ // leftmost part of the entire iterator range and the pivot is same as
+ // the highest element in the range to the left, then we know that all
+ // the elements in the range [first, pivot] would be equal to the pivot,
+ // assuming the equal elements are put on the left side when
+ // partitioned. This also means that we do not need to sort the left
+ // side of the partition.
+ if (!__leftmost && !__comp(*(__first -
diff erence_type(1)), *__first)) {
+ __first = __partition_with_equals_on_left<_AlgPolicy, _RandomAccessIterator, _Comp_ref>(
+ __first, __last, _Comp_ref(__comp));
+ continue;
}
+ // Use bitset partition only if asked for.
+ auto __ret =
+ _UseBitSetPartition
+ ? __bitset_partition<_AlgPolicy, _RandomAccessIterator, _Compare>(__first, __last, __comp)
+ : __partition_with_equals_on_right<_AlgPolicy, _RandomAccessIterator, _Compare>(__first, __last, __comp);
+ _RandomAccessIterator __i = __ret.first;
// [__first, __i) < *__i and *__i <= [__i+1, __last)
// If we were given a perfect partition, see if insertion sort is quick...
- if (__n_swaps == 0) {
+ if (__ret.second) {
using _WrappedComp = typename _WrapAlgPolicy<_AlgPolicy, _Compare>::type;
_WrappedComp __wrapped_comp(__comp);
bool __fs = std::__insertion_sort_incomplete<_WrappedComp>(__first, __i, __wrapped_comp);
@@ -593,14 +871,11 @@ void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _C
}
}
}
- // sort smaller range with recursive call and larger with tail recursion elimination
- if (__i - __first < __last - __i) {
- std::__introsort<_AlgPolicy, _Compare>(__first, __i, __comp, __depth);
- __first = ++__i;
- } else {
- std::__introsort<_AlgPolicy, _Compare>(__i +
diff erence_type(1), __last, __comp, __depth);
- __last = __i;
- }
+ // Sort the left partiton recursively and the right partition with tail recursion elimination.
+ std::__introsort<_AlgPolicy, _Compare, _RandomAccessIterator, _UseBitSetPartition>(
+ __first, __i, __comp, __depth, __leftmost);
+ __leftmost = false;
+ __first = ++__i;
}
}
@@ -627,12 +902,18 @@ template <class _WrappedComp, class _RandomAccessIterator>
_LIBCPP_HIDDEN void __sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _WrappedComp __wrapped_comp) {
typedef typename iterator_traits<_RandomAccessIterator>::
diff erence_type
diff erence_type;
diff erence_type __depth_limit = 2 * __log2i(__last - __first);
-
using _Unwrap = _UnwrapAlgPolicy<_WrappedComp>;
using _AlgPolicy = typename _Unwrap::_AlgPolicy;
using _Compare = typename _Unwrap::_Comp;
_Compare __comp = _Unwrap::__get_comp(__wrapped_comp);
- std::__introsort<_AlgPolicy, _Compare>(__first, __last, __comp, __depth_limit);
+ // Only use bitset partitioning for arithmetic types. We should also check
+ // that the default comparator is in use so that we are sure that there are no
+ // branches in the comparator.
+ std::__introsort<_AlgPolicy,
+ _Compare,
+ _RandomAccessIterator,
+ __use_branchless_sort<_Compare, _RandomAccessIterator>::value>(
+ __first, __last, __comp, __depth_limit);
}
template <class _Compare, class _Tp>
diff --git a/libcxx/include/__bits b/libcxx/include/__bits
index d2c8439a6ba72..1e4b51ca699b9 100644
--- a/libcxx/include/__bits
+++ b/libcxx/include/__bits
@@ -68,6 +68,18 @@ int __libcpp_popcount(unsigned long __x) _NOEXCEPT { return __builtin_popco
inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
int __libcpp_popcount(unsigned long long __x) _NOEXCEPT { return __builtin_popcountll(__x); }
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR unsigned __libcpp_blsr(unsigned __x) _NOEXCEPT {
+ return __x ^ (__x & -__x);
+}
+
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR unsigned long __libcpp_blsr(unsigned long __x) _NOEXCEPT {
+ return __x ^ (__x & -__x);
+}
+
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR unsigned long long __libcpp_blsr(unsigned long long __x) _NOEXCEPT {
+ return __x ^ (__x & -__x);
+}
+
_LIBCPP_END_NAMESPACE_STD
_LIBCPP_POP_MACROS
More information about the libcxx-commits
mailing list