[libcxx-commits] [libcxx] 4eddbf9 - std::sort: add BlockQuickSort partitioning algorithm for arithmetic types

Nilay Vaish via libcxx-commits libcxx-commits at lists.llvm.org
Thu Dec 22 14:47:31 PST 2022


Author: Nilay Vaish
Date: 2022-12-22T14:46:56-08:00
New Revision: 4eddbf9f10a6d1881c93d84f4363d6d881daf848

URL: https://github.com/llvm/llvm-project/commit/4eddbf9f10a6d1881c93d84f4363d6d881daf848
DIFF: https://github.com/llvm/llvm-project/commit/4eddbf9f10a6d1881c93d84f4363d6d881daf848.diff

LOG: std::sort: add BlockQuickSort partitioning algorithm for arithmetic types

This diff modifies std::sort in two ways:

* for arithmetic types we update the core partitioning algorithm to use
BlockQuickSort for partitioning. The partition function was carefully
written to let the compiler generate SIMD instructions without actually
writing SIMD intrinsics in the loop. We see up to 50% better performance
for sorting arithmetic types. The use of the BlockQuickSort partitioning
has been limited to arithmetic types since the algorithm works well when
branch instructions can be avoided during partitioning. This is usually not
true for types other than the arithmetic ones.

* for other types (tuples, strings), updates have been made to improve
performance by about 10%.  The numbers below compare std::sort (old)
and Bitset sort (new) on the libcxx benchmarks.

name                                                             old cpu/op  new cpu/op  delta
BM_Sort_uint32_Random_1                                          3.72ns ± 5%  3.78ns ±16%      ~     (p=0.819 n=36+34)
BM_Sort_uint32_Random_4                                          5.42ns ± 5%  5.29ns ± 7%    -2.42%  (p=0.000 n=35+31)
BM_Sort_uint32_Random_16                                         10.5ns ± 3%  11.9ns ±15%   +13.08%  (p=0.000 n=36+40)
BM_Sort_uint32_Random_64                                         18.6ns ± 7%  18.5ns ±15%    -0.95%  (p=0.002 n=33+40)
BM_Sort_uint32_Random_256                                        26.2ns ± 4%  21.3ns ± 8%   -18.89%  (p=0.000 n=37+34)
BM_Sort_uint32_Random_1024                                       33.4ns ± 5%  23.3ns ± 4%   -30.37%  (p=0.000 n=39+35)
BM_Sort_uint32_Random_16384                                      47.7ns ± 5%  26.7ns ± 5%   -44.06%  (p=0.000 n=39+35)
BM_Sort_uint32_Random_262144                                     62.6ns ± 3%  30.1ns ± 6%   -51.81%  (p=0.000 n=37+36)
BM_Sort_uint32_Ascending_1                                       3.71ns ± 3%  4.28ns ± 3%   +15.53%  (p=0.000 n=37+35)
BM_Sort_uint32_Ascending_4                                       1.47ns ± 3%  1.46ns ± 3%      ~     (p=0.083 n=36+37)
BM_Sort_uint32_Ascending_16                                      0.93ns ± 4%  1.02ns ± 3%    +9.32%  (p=0.000 n=36+36)
BM_Sort_uint32_Ascending_64                                      1.23ns ± 5%  1.51ns ± 3%   +22.56%  (p=0.000 n=34+36)
BM_Sort_uint32_Ascending_256                                     1.21ns ± 3%  1.57ns ± 4%   +29.77%  (p=0.000 n=33+35)
BM_Sort_uint32_Ascending_1024                                    1.03ns ± 4%  1.43ns ± 3%   +38.44%  (p=0.000 n=32+35)
BM_Sort_uint32_Ascending_16384                                   0.94ns ± 8%  1.36ns ± 5%   +44.09%  (p=0.000 n=32+35)
BM_Sort_uint32_Ascending_262144                                  0.93ns ± 3%  1.35ns ± 7%   +45.06%  (p=0.000 n=32+36)
BM_Sort_uint32_Descending_1                                      3.69ns ± 2%  4.27ns ± 3%   +15.73%  (p=0.000 n=31+36)
BM_Sort_uint32_Descending_4                                      1.74ns ± 2%  1.78ns ± 3%    +2.29%  (p=0.000 n=31+38)
BM_Sort_uint32_Descending_16                                     3.92ns ± 4%  4.20ns ± 4%    +7.13%  (p=0.000 n=32+38)
BM_Sort_uint32_Descending_64                                     2.09ns ± 4%  3.25ns ± 4%   +55.10%  (p=0.000 n=33+37)
BM_Sort_uint32_Descending_256                                    1.98ns ± 7%  2.93ns ± 4%   +47.95%  (p=0.000 n=34+36)
BM_Sort_uint32_Descending_1024                                   2.23ns ± 6%  2.64ns ± 3%   +18.22%  (p=0.000 n=34+38)
BM_Sort_uint32_Descending_16384                                  1.93ns ± 6%  2.43ns ± 4%   +25.99%  (p=0.000 n=34+35)
BM_Sort_uint32_Descending_262144                                 1.89ns ± 3%  2.38ns ± 4%   +25.41%  (p=0.000 n=33+35)
BM_Sort_uint32_SingleElement_1                                   3.67ns ± 2%  4.28ns ± 4%   +16.60%  (p=0.000 n=34+34)
BM_Sort_uint32_SingleElement_4                                   1.48ns ± 4%  1.48ns ± 5%      ~     (p=0.951 n=35+33)
BM_Sort_uint32_SingleElement_16                                  0.93ns ± 3%  1.02ns ± 4%    +9.51%  (p=0.000 n=36+33)
BM_Sort_uint32_SingleElement_64                                  0.76ns ± 3%  1.59ns ± 8%  +109.78%  (p=0.000 n=36+32)
BM_Sort_uint32_SingleElement_256                                 0.82ns ± 4%  1.45ns ± 5%   +76.62%  (p=0.000 n=37+34)
BM_Sort_uint32_SingleElement_1024                                0.77ns ± 4%  1.31ns ± 4%   +71.40%  (p=0.000 n=34+34)
BM_Sort_uint32_SingleElement_16384                               0.64ns ± 4%  1.24ns ± 6%   +93.29%  (p=0.000 n=35+36)
BM_Sort_uint32_SingleElement_262144                              0.63ns ± 3%  1.23ns ± 4%   +95.17%  (p=0.000 n=35+35)
BM_Sort_uint32_PipeOrgan_1                                       3.68ns ± 2%  4.42ns ± 3%   +20.31%  (p=0.000 n=34+36)
BM_Sort_uint32_PipeOrgan_4                                       1.54ns ± 3%  1.53ns ± 3%      ~     (p=0.128 n=34+36)
BM_Sort_uint32_PipeOrgan_16                                      2.22ns ± 3%  1.99ns ± 3%   -10.28%  (p=0.000 n=33+36)
BM_Sort_uint32_PipeOrgan_64                                      4.41ns ± 3%  3.39ns ± 4%   -23.17%  (p=0.000 n=35+37)
BM_Sort_uint32_PipeOrgan_256                                     2.75ns ± 5%  3.07ns ± 3%   +11.74%  (p=0.000 n=37+37)
BM_Sort_uint32_PipeOrgan_1024                                    3.58ns ± 2%  5.48ns ± 3%   +52.97%  (p=0.000 n=37+36)
BM_Sort_uint32_PipeOrgan_16384                                   4.10ns ± 3%  6.53ns ± 3%   +59.27%  (p=0.000 n=37+37)
BM_Sort_uint32_PipeOrgan_262144                                  4.90ns ± 3%  7.39ns ± 3%   +50.71%  (p=0.000 n=34+37)
BM_Sort_uint32_QuickSortAdversary_1                              3.68ns ± 2%  4.28ns ± 3%   +16.19%  (p=0.000 n=36+37)
BM_Sort_uint32_QuickSortAdversary_4                              1.46ns ± 4%  1.46ns ± 3%      ~     (p=0.736 n=35+38)
BM_Sort_uint32_QuickSortAdversary_16                             0.93ns ± 3%  1.02ns ± 4%    +9.69%  (p=0.000 n=36+37)
BM_Sort_uint32_QuickSortAdversary_64                             13.6ns ± 4%  17.9ns ± 8%   +31.37%  (p=0.000 n=36+35)
BM_Sort_uint32_QuickSortAdversary_256                            20.0ns ± 4%  25.7ns ± 4%   +28.69%  (p=0.000 n=36+35)
BM_Sort_uint32_QuickSortAdversary_1024                           28.3ns ± 6%  31.7ns ± 3%   +12.12%  (p=0.000 n=36+37)
BM_Sort_uint32_QuickSortAdversary_16384                          45.8ns ± 3%  50.6ns ± 4%   +10.32%  (p=0.000 n=38+36)
BM_Sort_uint32_QuickSortAdversary_262144                         61.6ns ± 4%  68.2ns ± 4%   +10.68%  (p=0.000 n=37+37)
BM_Sort_uint64_Random_1                                          3.71ns ± 4%  4.00ns ± 4%    +7.93%  (p=0.000 n=34+35)
BM_Sort_uint64_Random_4                                          5.52ns ± 8%  5.22ns ± 6%    -5.41%  (p=0.000 n=32+32)
BM_Sort_uint64_Random_16                                         10.7ns ±15%  10.2ns ± 7%      ~     (p=0.077 n=40+31)
BM_Sort_uint64_Random_64                                         19.0ns ±14%  18.2ns ±14%    -4.31%  (p=0.001 n=40+40)
BM_Sort_uint64_Random_256                                        25.7ns ± 9%  22.1ns ±15%   -13.82%  (p=0.000 n=33+40)
BM_Sort_uint64_Random_1024                                       32.4ns ± 6%  23.8ns ±16%   -26.64%  (p=0.000 n=33+40)
BM_Sort_uint64_Random_16384                                      46.8ns ± 3%  27.1ns ±16%   -42.15%  (p=0.000 n=33+40)
BM_Sort_uint64_Random_262144                                     61.3ns ± 4%  30.4ns ±16%   -50.34%  (p=0.000 n=34+40)
BM_Sort_uint64_Ascending_1                                       3.67ns ± 3%  3.87ns ±16%    +5.36%  (p=0.049 n=35+40)
BM_Sort_uint64_Ascending_4                                       1.46ns ± 3%  1.46ns ± 3%      ~     (p=0.130 n=37+31)
BM_Sort_uint64_Ascending_16                                      1.09ns ± 3%  0.91ns ± 6%   -16.79%  (p=0.000 n=38+32)
BM_Sort_uint64_Ascending_64                                      1.25ns ± 3%  1.29ns ± 5%    +3.11%  (p=0.000 n=38+34)
BM_Sort_uint64_Ascending_256                                     1.37ns ± 3%  1.42ns ± 3%    +3.07%  (p=0.000 n=39+35)
BM_Sort_uint64_Ascending_1024                                    1.12ns ± 3%  1.17ns ± 3%    +5.28%  (p=0.000 n=37+36)
BM_Sort_uint64_Ascending_16384                                   0.98ns ± 3%  1.09ns ± 3%   +10.95%  (p=0.000 n=36+37)
BM_Sort_uint64_Ascending_262144                                  0.98ns ± 3%  1.08ns ± 3%   +10.97%  (p=0.000 n=36+37)
BM_Sort_uint64_Descending_1                                      3.68ns ± 3%  3.67ns ± 3%      ~     (p=0.652 n=36+36)
BM_Sort_uint64_Descending_4                                      1.71ns ± 3%  1.73ns ± 3%    +1.50%  (p=0.000 n=33+34)
BM_Sort_uint64_Descending_16                                     4.96ns ± 2%  5.49ns ± 3%   +10.73%  (p=0.000 n=31+36)
BM_Sort_uint64_Descending_64                                     2.14ns ± 6%  3.03ns ± 3%   +41.72%  (p=0.000 n=32+35)
BM_Sort_uint64_Descending_256                                    2.03ns ± 4%  2.86ns ± 4%   +40.55%  (p=0.000 n=32+34)
BM_Sort_uint64_Descending_1024                                   2.20ns ± 2%  2.29ns ± 3%    +4.20%  (p=0.000 n=31+36)
BM_Sort_uint64_Descending_16384                                  1.89ns ± 3%  2.08ns ± 3%   +10.00%  (p=0.000 n=31+37)
BM_Sort_uint64_Descending_262144                                 1.92ns ± 3%  2.07ns ± 4%    +7.95%  (p=0.000 n=31+36)
BM_Sort_uint64_SingleElement_1                                   3.68ns ± 5%  3.67ns ± 3%      ~     (p=0.716 n=31+37)
BM_Sort_uint64_SingleElement_4                                   1.46ns ± 3%  1.46ns ± 3%      ~     (p=0.557 n=34+37)
BM_Sort_uint64_SingleElement_16                                  1.09ns ± 2%  0.91ns ± 3%   -16.93%  (p=0.000 n=33+36)
BM_Sort_uint64_SingleElement_64                                  0.83ns ± 4%  1.47ns ± 4%   +78.03%  (p=0.000 n=34+34)
BM_Sort_uint64_SingleElement_256                                 0.95ns ± 4%  1.28ns ± 4%   +35.17%  (p=0.000 n=35+35)
BM_Sort_uint64_SingleElement_1024                                0.76ns ± 3%  1.05ns ± 3%   +37.78%  (p=0.000 n=35+33)
BM_Sort_uint64_SingleElement_16384                               0.71ns ± 2%  0.98ns ± 5%   +38.43%  (p=0.000 n=34+33)
BM_Sort_uint64_SingleElement_262144                              0.72ns ± 3%  0.98ns ± 4%   +35.93%  (p=0.000 n=35+33)
BM_Sort_uint64_PipeOrgan_1                                       3.68ns ± 3%  3.68ns ± 3%      ~     (p=0.650 n=35+33)
BM_Sort_uint64_PipeOrgan_4                                       1.53ns ± 2%  1.54ns ± 4%      ~     (p=0.424 n=33+36)
BM_Sort_uint64_PipeOrgan_16                                      2.23ns ± 3%  2.06ns ± 4%    -7.68%  (p=0.000 n=34+35)
BM_Sort_uint64_PipeOrgan_64                                      5.46ns ± 2%  3.41ns ± 4%   -37.67%  (p=0.000 n=33+36)
BM_Sort_uint64_PipeOrgan_256                                     2.92ns ± 4%  2.91ns ± 3%      ~     (p=0.257 n=35+35)
BM_Sort_uint64_PipeOrgan_1024                                    3.72ns ± 3%  5.35ns ± 4%   +43.95%  (p=0.000 n=35+35)
BM_Sort_uint64_PipeOrgan_16384                                   4.12ns ± 3%  6.37ns ± 3%   +54.74%  (p=0.000 n=34+36)
BM_Sort_uint64_PipeOrgan_262144                                  4.99ns ± 3%  7.25ns ± 5%   +45.45%  (p=0.000 n=35+35)
BM_Sort_uint64_QuickSortAdversary_1                              3.67ns ± 2%  3.65ns ± 3%      ~     (p=0.071 n=35+37)
BM_Sort_uint64_QuickSortAdversary_4                              1.46ns ± 3%  1.46ns ± 3%      ~     (p=0.214 n=36+37)
BM_Sort_uint64_QuickSortAdversary_16                             1.09ns ± 3%  0.91ns ± 3%   -16.73%  (p=0.000 n=36+38)
BM_Sort_uint64_QuickSortAdversary_64                             13.7ns ± 3%  17.8ns ± 5%   +29.86%  (p=0.000 n=36+37)
BM_Sort_uint64_QuickSortAdversary_256                            20.0ns ± 3%  25.9ns ± 3%   +29.25%  (p=0.000 n=35+38)
BM_Sort_uint64_QuickSortAdversary_1024                           28.1ns ± 3%  31.0ns ± 4%   +10.35%  (p=0.000 n=33+37)
BM_Sort_uint64_QuickSortAdversary_16384                          45.8ns ± 2%  50.5ns ± 4%   +10.29%  (p=0.000 n=36+37)
BM_Sort_uint64_QuickSortAdversary_262144                         64.9ns ± 3%  69.5ns ± 3%    +7.15%  (p=0.000 n=36+36)
BM_Sort_pair<uint32, uint32>_Random_1                            4.03ns ± 5%  4.33ns ± 4%    +7.31%  (p=0.000 n=36+36)
BM_Sort_pair<uint32, uint32>_Random_4                            6.78ns ± 5%  6.71ns ± 4%    -1.09%  (p=0.040 n=35+35)
BM_Sort_pair<uint32, uint32>_Random_16                           25.2ns ± 6%  16.8ns ± 7%   -33.35%  (p=0.000 n=35+35)
BM_Sort_pair<uint32, uint32>_Random_64                           35.6ns ± 7%  27.2ns ± 8%   -23.73%  (p=0.000 n=34+36)
BM_Sort_pair<uint32, uint32>_Random_256                          43.5ns ±13%  34.0ns ± 8%   -21.78%  (p=0.000 n=32+34)
BM_Sort_pair<uint32, uint32>_Random_1024                         50.6ns ± 8%  40.8ns ± 5%   -19.35%  (p=0.000 n=32+32)
BM_Sort_pair<uint32, uint32>_Random_16384                        66.0ns ± 3%  55.9ns ± 6%   -15.24%  (p=0.000 n=32+32)
BM_Sort_pair<uint32, uint32>_Random_262144                       82.4ns ± 4%  72.0ns ± 5%   -12.64%  (p=0.000 n=32+31)
BM_Sort_pair<uint32, uint32>_Ascending_1                         4.00ns ± 2%  4.50ns ±16%   +12.59%  (p=0.000 n=33+40)
BM_Sort_pair<uint32, uint32>_Ascending_4                         2.22ns ± 3%  2.34ns ±16%    +5.46%  (p=0.041 n=33+40)
BM_Sort_pair<uint32, uint32>_Ascending_16                        2.33ns ± 4%  1.30ns ±15%   -44.33%  (p=0.000 n=34+40)
BM_Sort_pair<uint32, uint32>_Ascending_64                        1.39ns ± 4%  1.50ns ± 8%    +8.48%  (p=0.000 n=35+32)
BM_Sort_pair<uint32, uint32>_Ascending_256                       1.47ns ± 4%  1.56ns ± 3%    +5.96%  (p=0.000 n=37+31)
BM_Sort_pair<uint32, uint32>_Ascending_1024                      1.34ns ± 3%  1.35ns ± 4%    +1.22%  (p=0.000 n=38+31)
BM_Sort_pair<uint32, uint32>_Ascending_16384                     1.18ns ± 2%  1.18ns ± 3%      ~     (p=0.687 n=37+32)
BM_Sort_pair<uint32, uint32>_Ascending_262144                    1.18ns ± 3%  1.17ns ± 2%      ~     (p=0.153 n=38+34)
BM_Sort_pair<uint32, uint32>_Descending_1                        4.00ns ± 2%  4.29ns ± 3%    +7.22%  (p=0.000 n=37+36)
BM_Sort_pair<uint32, uint32>_Descending_4                        2.91ns ± 3%  2.92ns ± 3%      ~     (p=0.065 n=37+35)
BM_Sort_pair<uint32, uint32>_Descending_16                       4.96ns ± 4%  6.51ns ± 2%   +31.36%  (p=0.000 n=37+30)
BM_Sort_pair<uint32, uint32>_Descending_64                       3.13ns ± 2%  2.92ns ± 3%    -6.71%  (p=0.000 n=36+37)
BM_Sort_pair<uint32, uint32>_Descending_256                      2.56ns ± 3%  2.73ns ± 5%    +6.55%  (p=0.000 n=35+37)
BM_Sort_pair<uint32, uint32>_Descending_1024                     3.11ns ± 3%  2.34ns ± 4%   -24.85%  (p=0.000 n=36+35)
BM_Sort_pair<uint32, uint32>_Descending_16384                    2.84ns ± 3%  2.14ns ± 5%   -24.48%  (p=0.000 n=37+37)
BM_Sort_pair<uint32, uint32>_Descending_262144                   2.86ns ± 3%  2.15ns ± 3%   -25.08%  (p=0.000 n=36+35)
BM_Sort_pair<uint32, uint32>_SingleElement_1                     3.99ns ± 3%  4.28ns ± 3%    +7.08%  (p=0.000 n=33+35)
BM_Sort_pair<uint32, uint32>_SingleElement_4                     2.32ns ± 6%  2.30ns ± 3%    -0.77%  (p=0.032 n=32+35)
BM_Sort_pair<uint32, uint32>_SingleElement_16                    1.67ns ± 4%  1.27ns ± 4%   -24.13%  (p=0.000 n=32+35)
BM_Sort_pair<uint32, uint32>_SingleElement_64                    1.64ns ± 7%  1.83ns ± 4%   +11.54%  (p=0.000 n=31+35)
BM_Sort_pair<uint32, uint32>_SingleElement_256                   1.57ns ± 3%  1.90ns ± 3%   +21.46%  (p=0.000 n=31+36)
BM_Sort_pair<uint32, uint32>_SingleElement_1024                  1.49ns ±15%  1.63ns ± 3%    +9.42%  (p=0.000 n=40+37)
BM_Sort_pair<uint32, uint32>_SingleElement_16384                 1.29ns ±17%  1.57ns ± 3%   +21.51%  (p=0.000 n=33+36)
BM_Sort_pair<uint32, uint32>_SingleElement_262144                1.26ns ± 4%  1.56ns ± 4%   +24.11%  (p=0.000 n=33+36)
BM_Sort_pair<uint32, uint32>_PipeOrgan_1                         4.01ns ± 2%  4.28ns ± 3%    +6.68%  (p=0.000 n=32+35)
BM_Sort_pair<uint32, uint32>_PipeOrgan_4                         2.38ns ± 5%  2.42ns ± 4%    +1.61%  (p=0.000 n=34+35)
BM_Sort_pair<uint32, uint32>_PipeOrgan_16                        4.83ns ± 2%  2.71ns ± 7%   -43.96%  (p=0.000 n=34+34)
BM_Sort_pair<uint32, uint32>_PipeOrgan_64                        4.53ns ± 3%  3.89ns ± 7%   -14.11%  (p=0.000 n=35+33)
BM_Sort_pair<uint32, uint32>_PipeOrgan_256                       5.53ns ± 4%  2.81ns ± 4%   -49.13%  (p=0.000 n=36+33)
BM_Sort_pair<uint32, uint32>_PipeOrgan_1024                      6.49ns ± 4%  5.29ns ± 3%   -18.50%  (p=0.000 n=35+32)
BM_Sort_pair<uint32, uint32>_PipeOrgan_16384                     7.21ns ± 4%  5.97ns ± 3%   -17.24%  (p=0.000 n=36+33)
BM_Sort_pair<uint32, uint32>_PipeOrgan_262144                    7.98ns ± 5%  6.59ns ± 3%   -17.46%  (p=0.000 n=33+33)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_1                3.99ns ± 3%  4.27ns ± 3%    +6.95%  (p=0.000 n=36+34)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_4                2.40ns ± 3%  2.37ns ± 3%    -1.00%  (p=0.007 n=34+34)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_16               4.96ns ± 5%  2.72ns ± 7%   -45.07%  (p=0.000 n=35+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_64               7.24ns ± 4%  7.51ns ± 4%    +3.63%  (p=0.000 n=34+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_256              9.85ns ± 5%  7.12ns ± 4%   -27.70%  (p=0.000 n=34+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_1024             11.6ns ± 6%   8.8ns ± 5%   -23.86%  (p=0.000 n=35+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_16384            32.7ns ± 3%  20.8ns ± 4%   -36.26%  (p=0.000 n=35+35)
BM_Sort_pair<uint32, uint32>_QuickSortAdversary_262144           36.4ns ± 3%  24.0ns ± 4%   -34.12%  (p=0.000 n=34+36)
BM_Sort_tuple<uint32, uint64, uint32>_Random_1                   4.04ns ± 6%  4.34ns ± 4%    +7.55%  (p=0.000 n=37+37)
BM_Sort_tuple<uint32, uint64, uint32>_Random_4                   7.19ns ± 6%  7.26ns ± 5%    +0.99%  (p=0.042 n=36+38)
BM_Sort_tuple<uint32, uint64, uint32>_Random_16                  30.4ns ± 6%  21.8ns ± 7%   -28.28%  (p=0.000 n=34+37)
BM_Sort_tuple<uint32, uint64, uint32>_Random_64                  42.8ns ±11%  33.5ns ± 9%   -21.70%  (p=0.000 n=36+38)
BM_Sort_tuple<uint32, uint64, uint32>_Random_256                 49.9ns ± 6%  40.3ns ± 9%   -19.20%  (p=0.000 n=35+38)
BM_Sort_tuple<uint32, uint64, uint32>_Random_1024                56.3ns ± 3%  46.1ns ± 4%   -18.08%  (p=0.000 n=35+35)
BM_Sort_tuple<uint32, uint64, uint32>_Random_16384               72.2ns ± 5%  62.1ns ± 3%   -14.05%  (p=0.000 n=37+36)
BM_Sort_tuple<uint32, uint64, uint32>_Random_262144              88.7ns ± 6%  79.0ns ± 6%   -10.93%  (p=0.000 n=36+36)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_1                3.96ns ± 3%  4.36ns ± 3%    +9.96%  (p=0.000 n=34+37)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_4                2.39ns ± 2%  2.39ns ± 3%      ~     (p=0.604 n=36+37)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_16               3.04ns ± 4%  1.48ns ± 3%   -51.20%  (p=0.000 n=34+35)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_64               2.44ns ± 3%  2.30ns ± 5%    -5.61%  (p=0.000 n=36+35)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_256              2.35ns ± 3%  2.39ns ± 5%    +1.78%  (p=0.000 n=33+34)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_1024             2.12ns ± 5%  2.08ns ± 4%    -1.80%  (p=0.000 n=33+34)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_16384            2.02ns ± 3%  2.00ns ± 5%    -1.25%  (p=0.000 n=32+32)
BM_Sort_tuple<uint32, uint64, uint32>_Ascending_262144           2.06ns ± 5%  2.11ns ± 9%      ~     (p=0.618 n=32+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_1               3.97ns ± 2%  4.57ns ±16%   +15.19%  (p=0.000 n=32+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_4               3.64ns ± 3%  4.05ns ±15%   +11.05%  (p=0.000 n=33+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_16              5.68ns ± 5%  9.36ns ±16%   +64.92%  (p=0.000 n=35+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_64              4.27ns ± 4%  3.88ns ± 8%    -9.13%  (p=0.000 n=35+32)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_256             3.58ns ± 3%  3.76ns ±14%    +5.12%  (p=0.002 n=38+40)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_1024            4.16ns ± 3%  3.21ns ± 5%   -22.77%  (p=0.000 n=38+31)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_16384           3.90ns ± 4%  3.00ns ± 3%   -23.12%  (p=0.000 n=38+32)
BM_Sort_tuple<uint32, uint64, uint32>_Descending_262144          4.52ns ± 3%  3.42ns ± 3%   -24.29%  (p=0.000 n=38+33)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_1            3.97ns ± 3%  4.31ns ± 3%    +8.78%  (p=0.000 n=39+34)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_4            2.54ns ± 2%  2.54ns ± 4%      ~     (p=0.341 n=38+36)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_16           2.39ns ± 3%  1.70ns ± 6%   -28.90%  (p=0.000 n=38+35)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_64           2.61ns ± 2%  3.23ns ± 3%   +24.07%  (p=0.000 n=35+35)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_256          2.83ns ± 2%  2.97ns ± 4%    +4.83%  (p=0.000 n=35+37)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_1024         2.44ns ± 4%  2.44ns ± 3%      ~     (p=0.481 n=36+36)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_16384        2.19ns ± 3%  2.37ns ± 6%    +8.01%  (p=0.000 n=36+37)
BM_Sort_tuple<uint32, uint64, uint32>_SingleElement_262144       2.34ns ± 2%  2.36ns ± 5%    +1.11%  (p=0.001 n=36+36)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_1                3.96ns ± 2%  4.31ns ± 3%    +8.76%  (p=0.000 n=33+35)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_4                2.65ns ± 6%  2.67ns ± 4%      ~     (p=0.139 n=32+37)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_16               5.64ns ± 3%  3.56ns ± 3%   -36.80%  (p=0.000 n=31+35)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_64               6.12ns ±16%  5.04ns ± 4%   -17.64%  (p=0.000 n=40+37)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_256              6.78ns ± 6%  3.73ns ± 3%   -44.94%  (p=0.000 n=31+36)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_1024             8.36ns ±15%  6.51ns ± 4%   -22.13%  (p=0.000 n=40+37)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_16384            9.24ns ±15%  7.91ns ± 3%   -14.34%  (p=0.000 n=40+37)
BM_Sort_tuple<uint32, uint64, uint32>_PipeOrgan_262144           10.7ns ± 3%   9.3ns ± 6%   -12.36%  (p=0.000 n=32+36)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_1       3.97ns ± 3%  4.31ns ± 3%    +8.63%  (p=0.000 n=32+35)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_4       2.79ns ± 3%  2.76ns ± 4%    -0.95%  (p=0.002 n=33+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_16      5.07ns ± 3%  3.69ns ± 4%   -27.35%  (p=0.000 n=35+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_64      9.26ns ± 3%  8.34ns ± 7%    -9.88%  (p=0.000 n=35+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_256     11.8ns ± 5%   9.7ns ± 3%   -17.83%  (p=0.000 n=37+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_1024    19.2ns ± 4%  14.5ns ±10%   -24.59%  (p=0.000 n=36+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_16384   45.5ns ± 4%  37.4ns ± 9%   -17.71%  (p=0.000 n=35+33)
BM_Sort_tuple<uint32, uint64, uint32>_QuickSortAdversary_262144  50.0ns ± 4%  43.2ns ± 3%   -13.69%  (p=0.000 n=35+34)
BM_Sort_string_Random_1                                          4.66ns ± 6%  4.40ns ± 4%    -5.55%  (p=0.000 n=35+37)
BM_Sort_string_Random_4                                          14.9ns ± 3%  15.0ns ± 6%      ~     (p=0.863 n=36+38)
BM_Sort_string_Random_16                                         45.5ns ± 6%  35.8ns ± 8%   -21.37%  (p=0.000 n=36+36)
BM_Sort_string_Random_64                                         66.6ns ± 4%  58.2ns ± 3%   -12.69%  (p=0.000 n=36+37)
BM_Sort_string_Random_256                                        86.0ns ± 5%  77.4ns ± 3%   -10.01%  (p=0.000 n=37+37)
BM_Sort_string_Random_1024                                        106ns ± 3%    96ns ± 6%    -9.39%  (p=0.000 n=37+37)
BM_Sort_string_Random_16384                                       154ns ± 3%   141ns ± 5%    -8.03%  (p=0.000 n=35+36)
BM_Sort_string_Random_262144                                      213ns ± 4%   197ns ± 4%    -7.59%  (p=0.000 n=34+34)
BM_Sort_string_Ascending_1                                       4.59ns ± 2%  4.56ns ±17%    -0.60%  (p=0.002 n=32+40)
BM_Sort_string_Ascending_4                                       7.52ns ± 9%  7.54ns ±12%      ~     (p=0.554 n=37+40)
BM_Sort_string_Ascending_16                                      13.1ns ± 6%   8.8ns ±12%   -33.26%  (p=0.000 n=39+38)
BM_Sort_string_Ascending_64                                      14.8ns ±10%  14.5ns ±11%    -2.15%  (p=0.013 n=40+37)
BM_Sort_string_Ascending_256                                     14.0ns ± 6%  14.1ns ±10%      ~     (p=0.760 n=37+40)
BM_Sort_string_Ascending_1024                                    12.9ns ±10%  12.8ns ±20%      ~     (p=0.055 n=35+40)
BM_Sort_string_Ascending_16384                                   17.2ns ±13%  17.4ns ±21%      ~     (p=1.000 n=37+40)
BM_Sort_string_Ascending_262144                                  17.5ns ±12%  17.5ns ±25%      ~     (p=0.392 n=35+39)
BM_Sort_string_Descending_1                                      4.59ns ± 3%  4.34ns ± 3%    -5.51%  (p=0.000 n=32+33)
BM_Sort_string_Descending_4                                      10.1ns ± 5%   9.8ns ± 4%    -2.84%  (p=0.000 n=36+34)
BM_Sort_string_Descending_16                                     22.0ns ± 4%  39.6ns ± 4%   +79.84%  (p=0.000 n=36+33)
BM_Sort_string_Descending_64                                     21.4ns ±12%  21.3ns ±14%      ~     (p=0.542 n=37+39)
BM_Sort_string_Descending_256                                    19.4ns ±13%  18.9ns ±13%    -2.74%  (p=0.039 n=37+39)
BM_Sort_string_Descending_1024                                   22.7ns ± 5%  17.6ns ±15%   -22.52%  (p=0.000 n=35+40)
BM_Sort_string_Descending_16384                                  27.9ns ±14%  22.6ns ±10%   -19.11%  (p=0.000 n=40+37)
BM_Sort_string_Descending_262144                                 33.8ns ±14%  26.1ns ±21%   -22.74%  (p=0.000 n=39+38)
BM_Sort_string_SingleElement_1                                   4.58ns ± 2%  4.35ns ± 3%    -5.14%  (p=0.000 n=35+37)
BM_Sort_string_SingleElement_4                                   7.92ns ± 3%  7.92ns ± 7%      ~     (p=0.625 n=38+39)
BM_Sort_string_SingleElement_16                                  18.0ns ± 3%   7.9ns ± 6%   -56.23%  (p=0.000 n=36+35)
BM_Sort_string_SingleElement_64                                  20.3ns ± 5%  19.3ns ±15%    -4.83%  (p=0.000 n=34+38)
BM_Sort_string_SingleElement_256                                 19.4ns ± 7%  18.1ns ±14%    -6.67%  (p=0.000 n=36+39)
BM_Sort_string_SingleElement_1024                                19.3ns ± 9%  17.4ns ±17%    -9.40%  (p=0.000 n=35+40)
BM_Sort_string_SingleElement_16384                               17.5ns ±12%  16.2ns ±20%    -7.91%  (p=0.000 n=37+40)
BM_Sort_string_SingleElement_262144                              16.7ns ±18%  15.3ns ±27%    -8.56%  (p=0.000 n=40+40)
BM_Sort_string_PipeOrgan_1                                       4.60ns ± 2%  4.33ns ± 3%    -5.80%  (p=0.000 n=33+31)
BM_Sort_string_PipeOrgan_4                                       8.29ns ± 4%  8.17ns ± 8%    -1.50%  (p=0.004 n=39+36)
BM_Sort_string_PipeOrgan_16                                      22.9ns ± 3%  16.4ns ± 6%   -28.45%  (p=0.000 n=39+38)
BM_Sort_string_PipeOrgan_64                                      30.7ns ± 4%  28.9ns ± 7%    -6.05%  (p=0.000 n=38+37)
BM_Sort_string_PipeOrgan_256                                     38.1ns ± 3%  22.5ns ± 9%   -40.78%  (p=0.000 n=37+37)
BM_Sort_string_PipeOrgan_1024                                    45.4ns ± 4%  36.2ns ± 6%   -20.33%  (p=0.000 n=37+37)
BM_Sort_string_PipeOrgan_16384                                   56.2ns ± 4%  49.0ns ± 8%   -12.73%  (p=0.000 n=36+38)
BM_Sort_string_PipeOrgan_262144                                  77.8ns ±13%  62.8ns ±10%   -19.27%  (p=0.000 n=39+39)
BM_Sort_string_QuickSortAdversary_1                              4.80ns ±16%  4.34ns ± 4%    -9.56%  (p=0.000 n=39+34)
BM_Sort_string_QuickSortAdversary_4                              14.8ns ± 5%  14.7ns ± 4%    -0.80%  (p=0.037 n=33+33)
BM_Sort_string_QuickSortAdversary_16                             44.6ns ± 4%  34.8ns ± 5%   -21.98%  (p=0.000 n=35+34)
BM_Sort_string_QuickSortAdversary_64                             66.2ns ± 3%  58.1ns ± 4%   -12.32%  (p=0.000 n=36+35)
BM_Sort_string_QuickSortAdversary_256                            85.4ns ± 5%  76.9ns ± 6%    -9.99%  (p=0.000 n=36+36)
BM_Sort_string_QuickSortAdversary_1024                            106ns ± 4%    96ns ± 3%    -9.62%  (p=0.000 n=34+37)
BM_Sort_string_QuickSortAdversary_16384                           153ns ± 3%   141ns ± 4%    -8.22%  (p=0.000 n=34+37)
BM_Sort_string_QuickSortAdversary_262144                          211ns ± 5%   195ns ± 6%    -7.77%  (p=0.000 n=35+38)

Differential Revision: https://reviews.llvm.org/D122780

Added: 
    

Modified: 
    libcxx/include/__algorithm/sort.h
    libcxx/include/__bits

Removed: 
    


################################################################################
diff  --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
index 81f6eeb238b6b..6242f413a0900 100644
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -11,10 +11,12 @@
 
 #include <__algorithm/comp.h>
 #include <__algorithm/comp_ref_type.h>
+#include <__algorithm/iter_swap.h>
 #include <__algorithm/iterator_operations.h>
 #include <__algorithm/min_element.h>
 #include <__algorithm/partial_sort.h>
 #include <__algorithm/unwrap_iter.h>
+#include <__assert>
 #include <__bits>
 #include <__config>
 #include <__debug>
@@ -25,6 +27,7 @@
 #include <__memory/destruct_n.h>
 #include <__memory/unique_ptr.h>
 #include <__utility/move.h>
+#include <__utility/pair.h>
 #include <climits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -124,8 +127,7 @@ template <class _AlgPolicy, class _Compare, class _ForwardIterator>
 _LIBCPP_HIDE_FROM_ABI
 unsigned __sort4(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3, _ForwardIterator __x4,
                  _Compare __c) {
-  using _Ops = _IterOps<_AlgPolicy>;
-
+  using _Ops   = _IterOps<_AlgPolicy>;
   unsigned __r = std::__sort3<_AlgPolicy, _Compare>(__x1, __x2, __x3, __c);
   if (__c(*__x4, *__x3)) {
     _Ops::iter_swap(__x3, __x4);
@@ -180,7 +182,7 @@ _LIBCPP_HIDE_FROM_ABI unsigned __sort5_wrap_policy(
     _Compare __c) {
   using _WrappedComp = typename _WrapAlgPolicy<_AlgPolicy, _Compare>::type;
   _WrappedComp __wrapped_comp(__c);
-  return std::__sort5<_WrappedComp>(
+  return std::__sort5<_WrappedComp, _ForwardIterator>(
       std::move(__x1), std::move(__x2), std::move(__x3), std::move(__x4), std::move(__x5), __wrapped_comp);
 }
 
@@ -205,6 +207,13 @@ using __use_branchless_sort =
     integral_constant<bool, __is_cpp17_contiguous_iterator<_Iter>::value && sizeof(_Tp) <= sizeof(void*) &&
                                 is_arithmetic<_Tp>::value && __is_simple_comparator<_Compare>::value>;
 
+namespace __detail {
+
+// Size in bits for the bitset in use.
+enum { __block_size = sizeof(uint64_t) * 8 };
+
+} // namespace __detail
+
 // Ensures that __c(*__x, *__y) is true by swapping *__x and *__y if necessary.
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI void __cond_swap(_RandomAccessIterator __x, _RandomAccessIterator __y, _Compare __c) {
@@ -264,10 +273,15 @@ __sort4_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2,
   std::__sort4<_AlgPolicy, _Compare>(__x1, __x2, __x3, __x4, __c);
 }
 
-template <class, class _Compare, class _RandomAccessIterator>
+template <class _AlgPolicy, class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
-__sort5_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
-                         _RandomAccessIterator __x4, _RandomAccessIterator __x5, _Compare __c) {
+__sort5_maybe_branchless(
+    _RandomAccessIterator __x1,
+    _RandomAccessIterator __x2,
+    _RandomAccessIterator __x3,
+    _RandomAccessIterator __x4,
+    _RandomAccessIterator __x5,
+    _Compare __c) {
   std::__cond_swap<_Compare>(__x1, __x2, __c);
   std::__cond_swap<_Compare>(__x4, __x5, __c);
   std::__partially_sorted_swap<_Compare>(__x3, __x4, __x5, __c);
@@ -296,34 +310,47 @@ _LIBCPP_CONSTEXPR_SINCE_CXX14 void __selection_sort(_BidirectionalIterator __fir
   }
 }
 
+// Sort the iterator range [__first, __last) using the comparator __comp using
+// the insertion sort algorithm.
 template <class _AlgPolicy, class _Compare, class _BidirectionalIterator>
 _LIBCPP_HIDE_FROM_ABI
 void __insertion_sort(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp) {
   using _Ops = _IterOps<_AlgPolicy>;
 
   typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
-  if (__first != __last) {
-    _BidirectionalIterator __i = __first;
-    for (++__i; __i != __last; ++__i) {
-      _BidirectionalIterator __j = __i;
-      value_type __t(_Ops::__iter_move(__j));
-      for (_BidirectionalIterator __k = __i; __k != __first && __comp(__t, *--__k); --__j)
+  if (__first == __last)
+    return;
+  _BidirectionalIterator __i = __first;
+  for (++__i; __i != __last; ++__i) {
+    _BidirectionalIterator __j = __i;
+    --__j;
+    if (__comp(*__i, *__j)) {
+      value_type __t(_Ops::__iter_move(__i));
+      _BidirectionalIterator __k = __j;
+      __j                        = __i;
+      do {
         *__j = _Ops::__iter_move(__k);
+        __j  = __k;
+      } while (__j != __first && __comp(__t, *--__k));
       *__j = std::move(__t);
     }
   }
 }
 
+// Sort the iterator range [__first, __last) using the comparator __comp using
+// the insertion sort algorithm.  Insertion sort has two loops, outer and inner.
+// The implementation below has no bounds check (unguarded) for the inner loop.
+// Assumes that there is an element in the position (__first - 1) and that each
+// element in the input range is greater or equal to the element at __first - 1.
 template <class _AlgPolicy, class _Compare, class _RandomAccessIterator>
-_LIBCPP_HIDE_FROM_ABI
-void __insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+void __insertion_sort_unguarded(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   using _Ops = _IterOps<_AlgPolicy>;
-
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
-  _RandomAccessIterator __j = __first + difference_type(2);
-  std::__sort3_maybe_branchless<_AlgPolicy, _Compare>(__first, __first + difference_type(1), __j, __comp);
-  for (_RandomAccessIterator __i = __j + difference_type(1); __i != __last; ++__i) {
+  if (__first == __last)
+    return;
+  for (_RandomAccessIterator __i = __first + difference_type(1); __i != __last; ++__i) {
+    _RandomAccessIterator __j = __i - difference_type(1);
     if (__comp(*__i, *__j)) {
       value_type __t(_Ops::__iter_move(__i));
       _RandomAccessIterator __k = __j;
@@ -331,10 +358,9 @@ void __insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __l
       do {
         *__j = _Ops::__iter_move(__k);
         __j = __k;
-      } while (__j != __first && __comp(__t, *--__k));
+      } while (__comp(__t, *--__k)); // No need for bounds check due to the assumption stated above.
       *__j = std::move(__t);
     }
-    __j = __i;
   }
 }
 
@@ -355,7 +381,7 @@ _LIBCPP_HIDDEN bool __insertion_sort_incomplete(
     return true;
   case 2:
     if (__comp(*--__last, *__first))
-      _IterOps<_AlgPolicy>::iter_swap(__first, __last);
+      _Ops::iter_swap(__first, __last);
     return true;
   case 3:
     std::__sort3_maybe_branchless<_AlgPolicy, _Compare>(__first, __first + difference_type(1), --__last, __comp);
@@ -424,17 +450,336 @@ void __insertion_sort_move(_BidirectionalIterator __first1, _BidirectionalIterat
   }
 }
 
-template <class _AlgPolicy, class _Compare, class _RandomAccessIterator>
-void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
-                 typename iterator_traits<_RandomAccessIterator>::difference_type __depth) {
+template <class _AlgPolicy, class _RandomAccessIterator>
+inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos(
+    _RandomAccessIterator __first, _RandomAccessIterator __last, uint64_t& __left_bitset, uint64_t& __right_bitset) {
+  using _Ops = _IterOps<_AlgPolicy>;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Swap one pair on each iteration as long as both bitsets have at least one
+  // element for swapping.
+  while (__left_bitset != 0 && __right_bitset != 0) {
+    difference_type tz_left  = __libcpp_ctz(__left_bitset);
+    __left_bitset            = __libcpp_blsr(__left_bitset);
+    difference_type tz_right = __libcpp_ctz(__right_bitset);
+    __right_bitset           = __libcpp_blsr(__right_bitset);
+    _Ops::iter_swap(__first + tz_left, __last - tz_right);
+  }
+}
+
+template <class _Compare,
+          class _RandomAccessIterator,
+          class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void
+__populate_left_bitset(_RandomAccessIterator __first, _Compare __comp, _ValueType& __pivot, uint64_t& __left_bitset) {
+  // Possible vectorization. With a proper "-march" flag, the following loop
+  // will be compiled into a set of SIMD instructions.
+  _RandomAccessIterator __iter = __first;
+  for (int __j = 0; __j < __detail::__block_size;) {
+    bool __comp_result = !__comp(*__iter, __pivot);
+    __left_bitset |= (static_cast<uint64_t>(__comp_result) << __j);
+    __j++;
+    ++__iter;
+  }
+}
+
+template <class _Compare,
+          class _RandomAccessIterator,
+          class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void
+__populate_right_bitset(_RandomAccessIterator __lm1, _Compare __comp, _ValueType& __pivot, uint64_t& __right_bitset) {
+  // Possible vectorization. With a proper "-march" flag, the following loop
+  // will be compiled into a set of SIMD instructions.
+  _RandomAccessIterator __iter = __lm1;
+  for (int __j = 0; __j < __detail::__block_size;) {
+    bool __comp_result = __comp(*__iter, __pivot);
+    __right_bitset |= (static_cast<uint64_t>(__comp_result) << __j);
+    __j++;
+    --__iter;
+  }
+}
+
+template <class _AlgPolicy,
+          class _Compare,
+          class _RandomAccessIterator,
+          class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void __bitset_partition_partial_blocks(
+    _RandomAccessIterator& __first,
+    _RandomAccessIterator& __lm1,
+    _Compare __comp,
+    _ValueType& __pivot,
+    uint64_t& __left_bitset,
+    uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  difference_type __remaining_len = __lm1 - __first + 1;
+  difference_type __l_size;
+  difference_type __r_size;
+  if (__left_bitset == 0 && __right_bitset == 0) {
+    __l_size = __remaining_len / 2;
+    __r_size = __remaining_len - __l_size;
+  } else if (__left_bitset == 0) {
+    // We know at least one side is a full block.
+    __l_size = __remaining_len - __detail::__block_size;
+    __r_size = __detail::__block_size;
+  } else { // if (__right_bitset == 0)
+    __l_size = __detail::__block_size;
+    __r_size = __remaining_len - __detail::__block_size;
+  }
+  // Record the comparison outcomes for the elements currently on the left side.
+  if (__left_bitset == 0) {
+    _RandomAccessIterator __iter = __first;
+    for (int j = 0; j < __l_size; j++) {
+      bool __comp_result = !__comp(*__iter, __pivot);
+      __left_bitset |= (static_cast<uint64_t>(__comp_result) << j);
+      ++__iter;
+    }
+  }
+  // Record the comparison outcomes for the elements currently on the right
+  // side.
+  if (__right_bitset == 0) {
+    _RandomAccessIterator __iter = __lm1;
+    for (int j = 0; j < __r_size; j++) {
+      bool __comp_result = __comp(*__iter, __pivot);
+      __right_bitset |= (static_cast<uint64_t>(__comp_result) << j);
+      --__iter;
+    }
+  }
+  __swap_bitmap_pos<_AlgPolicy, _RandomAccessIterator>(__first, __lm1, __left_bitset, __right_bitset);
+  __first += (__left_bitset == 0) ? __l_size : 0;
+  __lm1 -= (__right_bitset == 0) ? __r_size : 0;
+}
+
+template <class _AlgPolicy, class _RandomAccessIterator>
+inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos_within(
+    _RandomAccessIterator& __first, _RandomAccessIterator& __lm1, uint64_t& __left_bitset, uint64_t& __right_bitset) {
+  using _Ops = _IterOps<_AlgPolicy>;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  if (__left_bitset) {
+    // Swap within the left side.  Need to find set positions in the reverse
+    // order.
+    while (__left_bitset != 0) {
+      difference_type __tz_left = __detail::__block_size - 1 - __libcpp_clz(__left_bitset);
+      __left_bitset &= (static_cast<uint64_t>(1) << __tz_left) - 1;
+      _RandomAccessIterator it = __first + __tz_left;
+      if (it != __lm1) {
+        _Ops::iter_swap(it, __lm1);
+      }
+      --__lm1;
+    }
+    __first = __lm1 + difference_type(1);
+  } else if (__right_bitset) {
+    // Swap within the right side.  Need to find set positions in the reverse
+    // order.
+    while (__right_bitset != 0) {
+      difference_type __tz_right = __detail::__block_size - 1 - __libcpp_clz(__right_bitset);
+      __right_bitset &= (static_cast<uint64_t>(1) << __tz_right) - 1;
+      _RandomAccessIterator it = __lm1 - __tz_right;
+      if (it != __first) {
+        _Ops::iter_swap(it, __first);
+      }
+      ++__first;
+    }
+  }
+}
+
+// Partition [__first, __last) using the comparator __comp.  *__first has the
+// chosen pivot.  Elements that are equivalent are kept to the left of the
+// pivot.  Returns the iterator for the pivot and a bool value which is true if
+// the provided range is already partitioned, false otherwise.  We assume that
+// the length of the range is at least three elements.
+//
+// __bitset_partition uses bitsets for storing outcomes of the comparisons
+// between the pivot and other elements.
+template <class _AlgPolicy, class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI std::pair<_RandomAccessIterator, bool>
+__bitset_partition(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+  using _Ops = _IterOps<_AlgPolicy>;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  _LIBCPP_ASSERT(__last - __first >= difference_type(3), "");
+
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot(_Ops::__iter_move(__first));
+  // Find the first element greater than the pivot.
+  if (__comp(__pivot, *(__last - difference_type(1)))) {
+    // Not guarded since we know the last element is greater than the pivot.
+    while (!__comp(__pivot, *++__first)) {
+    }
+  } else {
+    while (++__first < __last && !__comp(__pivot, *__first)) {
+    }
+  }
+  // Find the last element less than or equal to the pivot.
+  if (__first < __last) {
+    // It will be always guarded because __introsort will do the median-of-three
+    // before calling this.
+    while (__comp(__pivot, *--__last)) {
+    }
+  }
+  // If the first element greater than the pivot is at or after the
+  // last element less than or equal to the pivot, then we have covered the
+  // entire range without swapping elements.  This implies the range is already
+  // partitioned.
+  bool __already_partitioned = __first >= __last;
+  if (!__already_partitioned) {
+    _Ops::iter_swap(__first, __last);
+    ++__first;
+  }
+
+  // In [__first, __last) __last is not inclusive. From now on, it uses last
+  // minus one to be inclusive on both sides.
+  _RandomAccessIterator __lm1 = __last - difference_type(1);
+  uint64_t __left_bitset      = 0;
+  uint64_t __right_bitset     = 0;
+
+  // Reminder: length = __lm1 - __first + 1.
+  while (__lm1 - __first >= 2 * __detail::__block_size - 1) {
+    // Record the comparison outcomes for the elements currently on the left
+    // side.
+    if (__left_bitset == 0)
+      __populate_left_bitset<_Compare>(__first, __comp, __pivot, __left_bitset);
+    // Record the comparison outcomes for the elements currently on the right
+    // side.
+    if (__right_bitset == 0)
+      __populate_right_bitset<_Compare>(__lm1, __comp, __pivot, __right_bitset);
+    // Swap the elements recorded to be the candidates for swapping in the
+    // bitsets.
+    __swap_bitmap_pos<_AlgPolicy, _RandomAccessIterator>(__first, __lm1, __left_bitset, __right_bitset);
+    // Only advance the iterator if all the elements that need to be moved to
+    // other side were moved.
+    __first += (__left_bitset == 0) ? difference_type(__detail::__block_size) : difference_type(0);
+    __lm1 -= (__right_bitset == 0) ? difference_type(__detail::__block_size) : difference_type(0);
+  }
+  // Now, we have less than a block's worth of elements on at least one of the
+  // sides.
+  __bitset_partition_partial_blocks<_AlgPolicy, _Compare>(
+      __first, __lm1, __comp, __pivot, __left_bitset, __right_bitset);
+  // At least one of the bitsets would be empty.  For the non-empty one, we need
+  // to properly partition the elements that appear within that bitset.
+  __swap_bitmap_pos_within<_AlgPolicy>(__first, __lm1, __left_bitset, __right_bitset);
+
+  // Move the pivot to its correct position.
+  _RandomAccessIterator __pivot_pos = __first - difference_type(1);
+  if (__begin != __pivot_pos) {
+    *__begin = _Ops::__iter_move(__pivot_pos);
+  }
+  *__pivot_pos = std::move(__pivot);
+  return std::make_pair(__pivot_pos, __already_partitioned);
+}
+
+// Partition [__first, __last) using the comparator __comp.  *__first has the
+// chosen pivot.  Elements that are equivalent are kept to the right of the
+// pivot.  Returns the iterator for the pivot and a bool value which is true if
+// the provided range is already partitioned, false otherwise.  We assume that
+// the length of the range is at least three elements.
+template <class _AlgPolicy, class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI std::pair<_RandomAccessIterator, bool>
+__partition_with_equals_on_right(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   using _Ops = _IterOps<_AlgPolicy>;
+  typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  _LIBCPP_ASSERT(__last - __first >= difference_type(3), "");
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot(_Ops::__iter_move(__first));
+  // Find the first element greater or equal to the pivot.  It will be always
+  // guarded because __introsort will do the median-of-three before calling
+  // this.
+  while (__comp(*++__first, __pivot))
+    ;
+
+  // Find the last element less than the pivot.
+  if (__begin == __first - difference_type(1)) {
+    while (__first < __last && !__comp(*--__last, __pivot))
+      ;
+  } else {
+    // Guarded.
+    while (!__comp(*--__last, __pivot))
+      ;
+  }
+
+  // If the first element greater than or equal to the pivot is at or after the
+  // last element less than the pivot, then we have covered the entire range
+  // without swapping elements.  This implies the range is already partitioned.
+  bool __already_partitioned = __first >= __last;
+  // Go through the remaining elements.  Swap pairs of elements (one to the
+  // right of the pivot and the other to left of the pivot) that are not on the
+  // correct side of the pivot.
+  while (__first < __last) {
+    _Ops::iter_swap(__first, __last);
+    while (__comp(*++__first, __pivot))
+      ;
+    while (!__comp(*--__last, __pivot))
+      ;
+  }
+  // Move the pivot to its correct position.
+  _RandomAccessIterator __pivot_pos = __first - difference_type(1);
+  if (__begin != __pivot_pos) {
+    *__begin = _Ops::__iter_move(__pivot_pos);
+  }
+  *__pivot_pos = std::move(__pivot);
+  return std::make_pair(__pivot_pos, __already_partitioned);
+}
 
+// Like the function above, but elements equivalent to the pivot go to its
+// left.  Returns the iterator one past the pivot's final position.
+template <class _AlgPolicy, class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI _RandomAccessIterator
+__partition_with_equals_on_left(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+  using _Ops = _IterOps<_AlgPolicy>;
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-  typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
-  const difference_type __limit =
-      is_trivially_copy_constructible<value_type>::value && is_trivially_copy_assignable<value_type>::value ? 30 : 6;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot(_Ops::__iter_move(__first));
+  if (__comp(__pivot, *(__last - difference_type(1)))) {
+    // Guarded.
+    while (!__comp(__pivot, *++__first)) {
+    }
+  } else {
+    while (++__first < __last && !__comp(__pivot, *__first)) {
+    }
+  }
+
+  if (__first < __last) {
+    // It will be always guarded because __introsort will do the
+    // median-of-three before calling this.
+    while (__comp(__pivot, *--__last)) {
+    }
+  }
+  while (__first < __last) {
+    _Ops::iter_swap(__first, __last);
+    while (!__comp(__pivot, *++__first))
+      ;
+    while (__comp(__pivot, *--__last))
+      ;
+  }
+  _RandomAccessIterator __pivot_pos = __first - difference_type(1);
+  if (__begin != __pivot_pos) {
+    *__begin = _Ops::__iter_move(__pivot_pos);
+  }
+  *__pivot_pos = std::move(__pivot);
+  return __first;
+}
+
+// The main sorting function.  Implements introsort combined with other ideas:
+//  - option of using block quick sort for partitioning,
+//  - guarded and unguarded insertion sort for small lengths,
+//  - Tukey's ninther technique for computing the pivot,
+//  - check on whether partition was not required.
+// The implementation is partly based on Orson Peters' pattern-defeating
+// quicksort, published at: <https://github.com/orlp/pdqsort>.
+template <class _AlgPolicy, class _Compare, class _RandomAccessIterator, bool _UseBitSetPartition>
+void __introsort(_RandomAccessIterator __first,
+                 _RandomAccessIterator __last,
+                 _Compare __comp,
+                 typename iterator_traits<_RandomAccessIterator>::difference_type __depth,
+                 bool __leftmost = true) {
+  using _Ops = _IterOps<_AlgPolicy>;
+  typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  using _Comp_ref = __comp_ref_type<_Compare>;
+  // Upper bound for using insertion sort for sorting.
+  _LIBCPP_CONSTEXPR difference_type __limit = 24;
+  // Lower bound for using Tukey's ninther technique for median computation.
+  _LIBCPP_CONSTEXPR difference_type __ninther_threshold = 128;
   while (true) {
-  __restart:
     difference_type __len = __last - __first;
     switch (__len) {
     case 0:
@@ -442,7 +787,7 @@ void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _C
       return;
     case 2:
       if (__comp(*--__last, *__first))
-        _IterOps<_AlgPolicy>::iter_swap(__first, __last);
+        _Ops::iter_swap(__first, __last);
       return;
     case 3:
       std::__sort3_maybe_branchless<_AlgPolicy, _Compare>(__first, __first + difference_type(1), --__last, __comp);
@@ -457,127 +802,60 @@ void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _C
           --__last, __comp);
       return;
     }
-    if (__len <= __limit) {
-      std::__insertion_sort_3<_AlgPolicy, _Compare>(__first, __last, __comp);
+    // Use insertion sort if the length of the range is below the specified limit.
+    if (__len < __limit) {
+      if (__leftmost) {
+        std::__insertion_sort<_AlgPolicy, _Compare>(__first, __last, __comp);
+      } else {
+        std::__insertion_sort_unguarded<_AlgPolicy, _Compare>(__first, __last, __comp);
+      }
       return;
     }
-    // __len > 5
     if (__depth == 0) {
       // Fallback to heap sort as Introsort suggests.
       std::__partial_sort<_AlgPolicy, _Compare>(__first, __last, __last, __comp);
       return;
     }
     --__depth;
-    _RandomAccessIterator __m = __first;
-    _RandomAccessIterator __lm1 = __last;
-    --__lm1;
-    unsigned __n_swaps;
     {
-      difference_type __delta;
-      if (__len >= 1000) {
-        __delta = __len / 2;
-        __m += __delta;
-        __delta /= 2;
-        __n_swaps = std::__sort5_wrap_policy<_AlgPolicy, _Compare>(
-            __first, __first + __delta, __m, __m + __delta, __lm1, __comp);
+      difference_type __half_len = __len / 2;
+      // Use Tukey's ninther technique or median of 3 for pivot selection
+      // depending on the length of the range being sorted.
+      if (__len > __ninther_threshold) {
+        std::__sort3<_AlgPolicy, _Compare>(__first, __first + __half_len, __last - difference_type(1), __comp);
+        std::__sort3<_AlgPolicy, _Compare>(
+            __first + difference_type(1), __first + (__half_len - 1), __last - difference_type(2), __comp);
+        std::__sort3<_AlgPolicy, _Compare>(
+            __first + difference_type(2), __first + (__half_len + 1), __last - difference_type(3), __comp);
+        std::__sort3<_AlgPolicy, _Compare>(
+            __first + (__half_len - 1), __first + __half_len, __first + (__half_len + 1), __comp);
+        _Ops::iter_swap(__first, __first + __half_len);
       } else {
-        __delta = __len / 2;
-        __m += __delta;
-        __n_swaps = std::__sort3<_AlgPolicy, _Compare>(__first, __m, __lm1, __comp);
+        std::__sort3<_AlgPolicy, _Compare>(__first + __half_len, __first, __last - difference_type(1), __comp);
       }
     }
-    // *__m is median
-    // partition [__first, __m) < *__m and *__m <= [__m, __last)
-    // (this inhibits tossing elements equivalent to __m around unnecessarily)
-    _RandomAccessIterator __i = __first;
-    _RandomAccessIterator __j = __lm1;
-    // j points beyond range to be tested, *__m is known to be <= *__lm1
-    // The search going up is known to be guarded but the search coming down isn't.
-    // Prime the downward search with a guard.
-    if (!__comp(*__i, *__m)) // if *__first == *__m
-    {
-      // *__first == *__m, *__first doesn't go in first part
-      // manually guard downward moving __j against __i
-      while (true) {
-        if (__i == --__j) {
-          // *__first == *__m, *__m <= all other elements
-          // Parition instead into [__first, __i) == *__first and *__first < [__i, __last)
-          ++__i; // __first + 1
-          __j = __last;
-          if (!__comp(*__first, *--__j)) // we need a guard if *__first == *(__last-1)
-          {
-            while (true) {
-              if (__i == __j)
-                return; // [__first, __last) all equivalent elements
-              if (__comp(*__first, *__i)) {
-                _Ops::iter_swap(__i, __j);
-                ++__n_swaps;
-                ++__i;
-                break;
-              }
-              ++__i;
-            }
-          }
-          // [__first, __i) == *__first and *__first < [__j, __last) and __j == __last - 1
-          if (__i == __j)
-            return;
-          while (true) {
-            while (!__comp(*__first, *__i))
-              ++__i;
-            while (__comp(*__first, *--__j))
-              ;
-            if (__i >= __j)
-              break;
-            _Ops::iter_swap(__i, __j);
-            ++__n_swaps;
-            ++__i;
-          }
-          // [__first, __i) == *__first and *__first < [__i, __last)
-          // The first part is sorted, sort the second part
-          // std::__sort<_Compare>(__i, __last, __comp);
-          __first = __i;
-          goto __restart;
-        }
-        if (__comp(*__j, *__m)) {
-          _Ops::iter_swap(__i, __j);
-          ++__n_swaps;
-          break; // found guard for downward moving __j, now use unguarded partition
-        }
-      }
-    }
-    // It is known that *__i < *__m
-    ++__i;
-    // j points beyond range to be tested, *__m is known to be <= *__lm1
-    // if not yet partitioned...
-    if (__i < __j) {
-      // known that *(__i - 1) < *__m
-      // known that __i <= __m
-      while (true) {
-        // __m still guards upward moving __i
-        while (__comp(*__i, *__m))
-          ++__i;
-        // It is now known that a guard exists for downward moving __j
-        while (!__comp(*--__j, *__m))
-          ;
-        if (__i > __j)
-          break;
-        _Ops::iter_swap(__i, __j);
-        ++__n_swaps;
-        // It is known that __m != __j
-        // If __m just moved, follow it
-        if (__m == __i)
-          __m = __j;
-        ++__i;
-      }
-    }
-    // [__first, __i) < *__m and *__m <= [__i, __last)
-    if (__i != __m && __comp(*__m, *__i)) {
-      _Ops::iter_swap(__i, __m);
-      ++__n_swaps;
+    // The elements to the left of the current iterator range are already
+    // sorted.  If the current iterator range to be sorted is not the
+    // leftmost part of the entire iterator range and the pivot is same as
+    // the highest element in the range to the left, then we know that all
+    // the elements in the range [first, pivot] would be equal to the pivot,
+    // assuming the equal elements are put on the left side when
+    // partitioned.  This also means that we do not need to sort the left
+    // side of the partition.
+    if (!__leftmost && !__comp(*(__first - difference_type(1)), *__first)) {
+      __first = __partition_with_equals_on_left<_AlgPolicy, _RandomAccessIterator, _Comp_ref>(
+          __first, __last, _Comp_ref(__comp));
+      continue;
     }
+    // Use bitset partition only if asked for.
+    auto __ret =
+        _UseBitSetPartition
+            ? __bitset_partition<_AlgPolicy, _RandomAccessIterator, _Compare>(__first, __last, __comp)
+            : __partition_with_equals_on_right<_AlgPolicy, _RandomAccessIterator, _Compare>(__first, __last, __comp);
+    _RandomAccessIterator __i = __ret.first;
     // [__first, __i) < *__i and *__i <= [__i+1, __last)
     // If we were given a perfect partition, see if insertion sort is quick...
-    if (__n_swaps == 0) {
+    if (__ret.second) {
       using _WrappedComp = typename _WrapAlgPolicy<_AlgPolicy, _Compare>::type;
       _WrappedComp __wrapped_comp(__comp);
       bool __fs = std::__insertion_sort_incomplete<_WrappedComp>(__first, __i, __wrapped_comp);
@@ -593,14 +871,11 @@ void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _C
         }
       }
     }
-    // sort smaller range with recursive call and larger with tail recursion elimination
-    if (__i - __first < __last - __i) {
-      std::__introsort<_AlgPolicy, _Compare>(__first, __i, __comp, __depth);
-      __first = ++__i;
-    } else {
-      std::__introsort<_AlgPolicy, _Compare>(__i + difference_type(1), __last, __comp, __depth);
-      __last = __i;
-    }
+    // Sort the left partition recursively and the right partition with tail recursion elimination.
+    std::__introsort<_AlgPolicy, _Compare, _RandomAccessIterator, _UseBitSetPartition>(
+        __first, __i, __comp, __depth, __leftmost);
+    __leftmost = false;
+    __first    = ++__i;
   }
 }
 
@@ -627,12 +902,18 @@ template <class _WrappedComp, class _RandomAccessIterator>
 _LIBCPP_HIDDEN void __sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _WrappedComp __wrapped_comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   difference_type __depth_limit = 2 * __log2i(__last - __first);
-
   using _Unwrap = _UnwrapAlgPolicy<_WrappedComp>;
   using _AlgPolicy = typename _Unwrap::_AlgPolicy;
   using _Compare = typename _Unwrap::_Comp;
   _Compare __comp = _Unwrap::__get_comp(__wrapped_comp);
-  std::__introsort<_AlgPolicy, _Compare>(__first, __last, __comp, __depth_limit);
+  // Only use bitset partitioning for arithmetic types.  We should also check
+  // that the default comparator is in use so that we are sure that there are no
+  // branches in the comparator.
+  std::__introsort<_AlgPolicy,
+                   _Compare,
+                   _RandomAccessIterator,
+                   __use_branchless_sort<_Compare, _RandomAccessIterator>::value>(
+      __first, __last, __comp, __depth_limit);
 }
 
 template <class _Compare, class _Tp>

diff --git a/libcxx/include/__bits b/libcxx/include/__bits
index d2c8439a6ba72..1e4b51ca699b9 100644
--- a/libcxx/include/__bits
+++ b/libcxx/include/__bits
@@ -68,6 +68,18 @@ int __libcpp_popcount(unsigned long __x)      _NOEXCEPT { return __builtin_popco
 inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
 int __libcpp_popcount(unsigned long long __x) _NOEXCEPT { return __builtin_popcountll(__x); }
 
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR unsigned __libcpp_blsr(unsigned __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR unsigned long __libcpp_blsr(unsigned long __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR unsigned long long __libcpp_blsr(unsigned long long __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
 _LIBCPP_END_NAMESPACE_STD
 
 _LIBCPP_POP_MACROS


        


More information about the libcxx-commits mailing list