[libcxx-commits] [libcxx] [libc++] Optimize ranges::minmax (PR #87335)

Nikolas Klauser via libcxx-commits libcxx-commits at lists.llvm.org
Tue Apr 2 04:09:16 PDT 2024


https://github.com/philnik777 created https://github.com/llvm/llvm-project/pull/87335

This allows Clang to vectorize the loop.
```
---------------------------------------------------------------------
Benchmark                                         old             new
---------------------------------------------------------------------
BM_std_minmax<char>/1                        0.659 ns         1.41 ns
BM_std_minmax<char>/2                         1.08 ns         2.16 ns
BM_std_minmax<char>/3                         2.16 ns         2.96 ns
BM_std_minmax<char>/4                         2.82 ns         3.81 ns
BM_std_minmax<char>/5                         3.43 ns         4.69 ns
BM_std_minmax<char>/6                         4.08 ns         5.63 ns
BM_std_minmax<char>/7                         4.75 ns         6.51 ns
BM_std_minmax<char>/8                         5.42 ns         7.41 ns
BM_std_minmax<char>/9                         6.05 ns         8.34 ns
BM_std_minmax<char>/10                        6.68 ns         9.29 ns
BM_std_minmax<char>/11                        7.47 ns         10.6 ns
BM_std_minmax<char>/12                        7.95 ns         11.4 ns
BM_std_minmax<char>/13                        8.64 ns         12.4 ns
BM_std_minmax<char>/14                        9.35 ns         13.4 ns
BM_std_minmax<char>/15                        10.1 ns         14.4 ns
BM_std_minmax<char>/16                        10.6 ns         2.25 ns
BM_std_minmax<char>/17                        11.3 ns         2.82 ns
BM_std_minmax<char>/18                        11.8 ns         3.71 ns
BM_std_minmax<char>/19                        12.6 ns         4.52 ns
BM_std_minmax<char>/20                        13.2 ns         5.47 ns
BM_std_minmax<char>/21                        14.1 ns         6.67 ns
BM_std_minmax<char>/22                        14.5 ns         7.78 ns
BM_std_minmax<char>/23                        15.1 ns         8.67 ns
BM_std_minmax<char>/24                        15.7 ns         9.68 ns
BM_std_minmax<char>/25                        16.4 ns         10.7 ns
BM_std_minmax<char>/26                        17.1 ns         11.7 ns
BM_std_minmax<char>/27                        17.8 ns         12.8 ns
BM_std_minmax<char>/28                        18.4 ns         14.1 ns
BM_std_minmax<char>/29                        19.0 ns         15.0 ns
BM_std_minmax<char>/30                        19.6 ns         16.0 ns
BM_std_minmax<char>/31                        20.2 ns         17.0 ns
BM_std_minmax<char>/32                        20.8 ns         2.46 ns
BM_std_minmax<char>/64                        41.5 ns         2.97 ns
BM_std_minmax<char>/512                        340 ns         6.05 ns
BM_std_minmax<char>/1024                       667 ns         8.83 ns
BM_std_minmax<char>/4000                      2571 ns         28.6 ns
BM_std_minmax<char>/4096                      2632 ns         25.8 ns
BM_std_minmax<char>/5500                      3554 ns         51.1 ns
BM_std_minmax<char>/64000                    41175 ns          480 ns
BM_std_minmax<char>/65536                    42039 ns          490 ns
BM_std_minmax<char>/70000                    44931 ns          528 ns
BM_std_minmax<short>/1                       0.708 ns         1.20 ns
BM_std_minmax<short>/2                        1.18 ns         1.78 ns
BM_std_minmax<short>/3                        1.98 ns         2.42 ns
BM_std_minmax<short>/4                        2.47 ns         3.05 ns
BM_std_minmax<short>/5                        3.09 ns         3.72 ns
BM_std_minmax<short>/6                        3.49 ns         4.37 ns
BM_std_minmax<short>/7                        4.24 ns         5.03 ns
BM_std_minmax<short>/8                        4.65 ns         2.12 ns
BM_std_minmax<short>/9                        5.34 ns         2.51 ns
BM_std_minmax<short>/10                       5.82 ns         3.18 ns
BM_std_minmax<short>/11                       6.36 ns         3.97 ns
BM_std_minmax<short>/12                       6.73 ns         4.68 ns
BM_std_minmax<short>/13                       7.59 ns         5.49 ns
BM_std_minmax<short>/14                       7.77 ns         6.45 ns
BM_std_minmax<short>/15                       8.54 ns         7.55 ns
BM_std_minmax<short>/16                       8.74 ns         2.38 ns
BM_std_minmax<short>/17                       9.59 ns         2.76 ns
BM_std_minmax<short>/18                       9.88 ns         3.37 ns
BM_std_minmax<short>/19                       10.7 ns         4.17 ns
BM_std_minmax<short>/20                       10.9 ns         4.88 ns
BM_std_minmax<short>/21                       12.1 ns         5.70 ns
BM_std_minmax<short>/22                       12.6 ns         6.64 ns
BM_std_minmax<short>/23                       13.5 ns         7.72 ns
BM_std_minmax<short>/24                       13.2 ns         2.87 ns
BM_std_minmax<short>/25                       14.2 ns         3.10 ns
BM_std_minmax<short>/26                       14.2 ns         3.59 ns
BM_std_minmax<short>/27                       15.4 ns         4.35 ns
BM_std_minmax<short>/28                       15.3 ns         5.10 ns
BM_std_minmax<short>/29                       16.2 ns         5.87 ns
BM_std_minmax<short>/30                       16.2 ns         6.88 ns
BM_std_minmax<short>/31                       17.0 ns         7.78 ns
BM_std_minmax<short>/32                       17.2 ns         3.45 ns
BM_std_minmax<short>/64                       34.1 ns         3.35 ns
BM_std_minmax<short>/512                       279 ns         8.37 ns
BM_std_minmax<short>/1024                      549 ns         14.2 ns
BM_std_minmax<short>/4000                     2111 ns         50.1 ns
BM_std_minmax<short>/4096                     2167 ns         47.9 ns
BM_std_minmax<short>/5500                     2895 ns         69.7 ns
BM_std_minmax<short>/64000                   33454 ns          953 ns
BM_std_minmax<short>/65536                   34474 ns          970 ns
BM_std_minmax<short>/70000                   36691 ns         1037 ns
BM_std_minmax<int>/1                         0.664 ns         1.17 ns
BM_std_minmax<int>/2                          1.11 ns         1.69 ns
BM_std_minmax<int>/3                          2.36 ns         2.29 ns
BM_std_minmax<int>/4                          2.53 ns         2.91 ns
BM_std_minmax<int>/5                          3.23 ns         3.56 ns
BM_std_minmax<int>/6                          3.56 ns         4.23 ns
BM_std_minmax<int>/7                          4.28 ns         4.91 ns
BM_std_minmax<int>/8                          4.60 ns         5.60 ns
BM_std_minmax<int>/9                          5.38 ns         6.31 ns
BM_std_minmax<int>/10                         5.69 ns         7.03 ns
BM_std_minmax<int>/11                         6.41 ns         7.70 ns
BM_std_minmax<int>/12                         6.73 ns         8.39 ns
BM_std_minmax<int>/13                         7.38 ns         9.07 ns
BM_std_minmax<int>/14                         7.74 ns         9.79 ns
BM_std_minmax<int>/15                         8.53 ns         10.5 ns
BM_std_minmax<int>/16                         8.79 ns         11.2 ns
BM_std_minmax<int>/17                         9.63 ns         12.0 ns
BM_std_minmax<int>/18                         9.84 ns         12.7 ns
BM_std_minmax<int>/19                         10.6 ns         13.5 ns
BM_std_minmax<int>/20                         11.0 ns         14.3 ns
BM_std_minmax<int>/21                         11.7 ns         15.0 ns
BM_std_minmax<int>/22                         12.0 ns         15.7 ns
BM_std_minmax<int>/23                         13.1 ns         16.5 ns
BM_std_minmax<int>/24                         13.0 ns         17.3 ns
BM_std_minmax<int>/25                         13.7 ns         17.9 ns
BM_std_minmax<int>/26                         14.0 ns         18.6 ns
BM_std_minmax<int>/27                         14.8 ns         19.4 ns
BM_std_minmax<int>/28                         15.1 ns         20.3 ns
BM_std_minmax<int>/29                         15.8 ns         20.9 ns
BM_std_minmax<int>/30                         16.1 ns         21.7 ns
BM_std_minmax<int>/31                         16.9 ns         22.5 ns
BM_std_minmax<int>/32                         17.2 ns         3.40 ns
BM_std_minmax<int>/64                         33.9 ns         4.04 ns
BM_std_minmax<int>/512                         275 ns         14.6 ns
BM_std_minmax<int>/1024                        541 ns         27.5 ns
BM_std_minmax<int>/4000                       2093 ns         96.3 ns
BM_std_minmax<int>/4096                       2146 ns         98.3 ns
BM_std_minmax<int>/5500                       2866 ns          157 ns
BM_std_minmax<int>/64000                     33619 ns         1954 ns
BM_std_minmax<int>/65536                     34252 ns         2009 ns
BM_std_minmax<int>/70000                     36618 ns         2125 ns
BM_std_minmax<long long>/1                   0.709 ns         1.19 ns
BM_std_minmax<long long>/2                    1.01 ns         1.65 ns
BM_std_minmax<long long>/3                    2.14 ns         2.21 ns
BM_std_minmax<long long>/4                    2.45 ns         2.83 ns
BM_std_minmax<long long>/5                    3.09 ns         3.47 ns
BM_std_minmax<long long>/6                    3.44 ns         4.11 ns
BM_std_minmax<long long>/7                    4.16 ns         4.79 ns
BM_std_minmax<long long>/8                    4.54 ns         5.47 ns
BM_std_minmax<long long>/9                    5.37 ns         6.20 ns
BM_std_minmax<long long>/10                   5.71 ns         6.93 ns
BM_std_minmax<long long>/11                   6.00 ns         7.60 ns
BM_std_minmax<long long>/12                   6.43 ns         8.27 ns
BM_std_minmax<long long>/13                   7.01 ns         8.94 ns
BM_std_minmax<long long>/14                   7.45 ns         9.65 ns
BM_std_minmax<long long>/15                   8.16 ns         10.4 ns
BM_std_minmax<long long>/16                   8.46 ns         5.22 ns
BM_std_minmax<long long>/17                   9.16 ns         5.22 ns
BM_std_minmax<long long>/18                   9.53 ns         5.52 ns
BM_std_minmax<long long>/19                   10.2 ns         6.02 ns
BM_std_minmax<long long>/20                   10.5 ns         6.89 ns
BM_std_minmax<long long>/21                   11.3 ns         7.83 ns
BM_std_minmax<long long>/22                   11.6 ns         8.59 ns
BM_std_minmax<long long>/23                   12.3 ns         9.91 ns
BM_std_minmax<long long>/24                   12.6 ns         10.1 ns
BM_std_minmax<long long>/25                   13.2 ns         12.0 ns
BM_std_minmax<long long>/26                   13.6 ns         13.5 ns
BM_std_minmax<long long>/27                   14.2 ns         14.8 ns
BM_std_minmax<long long>/28                   14.7 ns         15.9 ns
BM_std_minmax<long long>/29                   15.3 ns         16.6 ns
BM_std_minmax<long long>/30                   15.8 ns         17.3 ns
BM_std_minmax<long long>/31                   16.3 ns         18.2 ns
BM_std_minmax<long long>/32                   16.7 ns         7.18 ns
BM_std_minmax<long long>/64                   33.1 ns         11.5 ns
BM_std_minmax<long long>/512                   268 ns         71.0 ns
BM_std_minmax<long long>/1024                  532 ns          138 ns
BM_std_minmax<long long>/4000                 2056 ns          533 ns
BM_std_minmax<long long>/4096                 2112 ns          539 ns
BM_std_minmax<long long>/5500                 2823 ns          749 ns
BM_std_minmax<long long>/64000               32956 ns         8590 ns
BM_std_minmax<long long>/65536               33795 ns         8791 ns
BM_std_minmax<long long>/70000               36084 ns         9442 ns
BM_std_minmax<unsigned char>/1               0.714 ns         1.41 ns
BM_std_minmax<unsigned char>/2               0.955 ns         1.96 ns
BM_std_minmax<unsigned char>/3                1.90 ns         2.63 ns
BM_std_minmax<unsigned char>/4                2.40 ns         3.34 ns
BM_std_minmax<unsigned char>/5                2.87 ns         4.10 ns
BM_std_minmax<unsigned char>/6                3.47 ns         4.88 ns
BM_std_minmax<unsigned char>/7                4.04 ns         5.66 ns
BM_std_minmax<unsigned char>/8                4.65 ns         6.45 ns
BM_std_minmax<unsigned char>/9                5.18 ns         7.24 ns
BM_std_minmax<unsigned char>/10               5.80 ns         8.05 ns
BM_std_minmax<unsigned char>/11               6.24 ns         8.86 ns
BM_std_minmax<unsigned char>/12               6.78 ns         9.70 ns
BM_std_minmax<unsigned char>/13               7.30 ns         10.6 ns
BM_std_minmax<unsigned char>/14               7.86 ns         11.4 ns
BM_std_minmax<unsigned char>/15               8.46 ns         12.3 ns
BM_std_minmax<unsigned char>/16               9.00 ns         2.12 ns
BM_std_minmax<unsigned char>/17               9.58 ns         2.83 ns
BM_std_minmax<unsigned char>/18               10.1 ns         3.37 ns
BM_std_minmax<unsigned char>/19               10.7 ns         4.11 ns
BM_std_minmax<unsigned char>/20               11.2 ns         4.85 ns
BM_std_minmax<unsigned char>/21               11.9 ns         5.69 ns
BM_std_minmax<unsigned char>/22               12.3 ns         6.77 ns
BM_std_minmax<unsigned char>/23               13.1 ns         7.56 ns
BM_std_minmax<unsigned char>/24               13.5 ns         8.40 ns
BM_std_minmax<unsigned char>/25               14.2 ns         9.30 ns
BM_std_minmax<unsigned char>/26               14.4 ns         10.1 ns
BM_std_minmax<unsigned char>/27               15.0 ns         11.1 ns
BM_std_minmax<unsigned char>/28               15.3 ns         11.9 ns
BM_std_minmax<unsigned char>/29               16.2 ns         12.9 ns
BM_std_minmax<unsigned char>/30               16.5 ns         13.9 ns
BM_std_minmax<unsigned char>/31               17.2 ns         14.8 ns
BM_std_minmax<unsigned char>/32               17.6 ns         2.36 ns
BM_std_minmax<unsigned char>/64               35.6 ns         3.21 ns
BM_std_minmax<unsigned char>/512               288 ns         6.00 ns
BM_std_minmax<unsigned char>/1024              573 ns         8.80 ns
BM_std_minmax<unsigned char>/4000             2222 ns         28.6 ns
BM_std_minmax<unsigned char>/4096             2265 ns         25.9 ns
BM_std_minmax<unsigned char>/5500             3047 ns         48.8 ns
BM_std_minmax<unsigned char>/64000           35059 ns          480 ns
BM_std_minmax<unsigned char>/65536           35941 ns          491 ns
BM_std_minmax<unsigned char>/70000           38922 ns          525 ns
BM_std_minmax<unsigned short>/1              0.711 ns         1.18 ns
BM_std_minmax<unsigned short>/2              0.957 ns         1.65 ns
BM_std_minmax<unsigned short>/3               2.13 ns         2.21 ns
BM_std_minmax<unsigned short>/4               2.14 ns         2.78 ns
BM_std_minmax<unsigned short>/5               3.06 ns         3.29 ns
BM_std_minmax<unsigned short>/6               2.89 ns         3.87 ns
BM_std_minmax<unsigned short>/7               3.80 ns         4.55 ns
BM_std_minmax<unsigned short>/8               3.68 ns         2.02 ns
BM_std_minmax<unsigned short>/9               4.53 ns         2.40 ns
BM_std_minmax<unsigned short>/10              4.60 ns         2.94 ns
BM_std_minmax<unsigned short>/11              5.67 ns         3.67 ns
BM_std_minmax<unsigned short>/12              5.39 ns         4.22 ns
BM_std_minmax<unsigned short>/13              6.58 ns         4.78 ns
BM_std_minmax<unsigned short>/14              6.33 ns         5.54 ns
BM_std_minmax<unsigned short>/15              7.34 ns         6.30 ns
BM_std_minmax<unsigned short>/16              7.17 ns         2.25 ns
BM_std_minmax<unsigned short>/17              8.19 ns         2.61 ns
BM_std_minmax<unsigned short>/18              8.02 ns         3.19 ns
BM_std_minmax<unsigned short>/19              9.03 ns         3.72 ns
BM_std_minmax<unsigned short>/20              8.89 ns         4.36 ns
BM_std_minmax<unsigned short>/21              9.77 ns         5.10 ns
BM_std_minmax<unsigned short>/22              9.70 ns         5.55 ns
BM_std_minmax<unsigned short>/23              10.8 ns         6.29 ns
BM_std_minmax<unsigned short>/24              10.6 ns         2.41 ns
BM_std_minmax<unsigned short>/25              11.6 ns         2.75 ns
BM_std_minmax<unsigned short>/26              11.4 ns         3.26 ns
BM_std_minmax<unsigned short>/27              12.4 ns         3.86 ns
BM_std_minmax<unsigned short>/28              12.3 ns         4.45 ns
BM_std_minmax<unsigned short>/29              13.2 ns         5.07 ns
BM_std_minmax<unsigned short>/30              13.1 ns         5.77 ns
BM_std_minmax<unsigned short>/31              13.9 ns         6.65 ns
BM_std_minmax<unsigned short>/32              13.9 ns         2.72 ns
BM_std_minmax<unsigned short>/64              27.8 ns         3.25 ns
BM_std_minmax<unsigned short>/512              220 ns         8.30 ns
BM_std_minmax<unsigned short>/1024             435 ns         14.1 ns
BM_std_minmax<unsigned short>/4000            1703 ns         49.8 ns
BM_std_minmax<unsigned short>/4096            1746 ns         47.9 ns
BM_std_minmax<unsigned short>/5500            2350 ns         69.9 ns
BM_std_minmax<unsigned short>/64000          27388 ns          953 ns
BM_std_minmax<unsigned short>/65536          28040 ns          975 ns
BM_std_minmax<unsigned short>/70000          29967 ns         1040 ns
BM_std_minmax<unsigned int>/1                0.712 ns         1.18 ns
BM_std_minmax<unsigned int>/2                0.965 ns         1.65 ns
BM_std_minmax<unsigned int>/3                 2.13 ns         2.14 ns
BM_std_minmax<unsigned int>/4                 2.09 ns         2.64 ns
BM_std_minmax<unsigned int>/5                 3.02 ns         3.21 ns
BM_std_minmax<unsigned int>/6                 2.94 ns         3.81 ns
BM_std_minmax<unsigned int>/7                 3.91 ns         4.38 ns
BM_std_minmax<unsigned int>/8                 3.75 ns         4.93 ns
BM_std_minmax<unsigned int>/9                 4.71 ns         5.60 ns
BM_std_minmax<unsigned int>/10                4.59 ns         6.26 ns
BM_std_minmax<unsigned int>/11                5.57 ns         6.80 ns
BM_std_minmax<unsigned int>/12                5.43 ns         7.47 ns
BM_std_minmax<unsigned int>/13                6.45 ns         8.10 ns
BM_std_minmax<unsigned int>/14                6.32 ns         8.69 ns
BM_std_minmax<unsigned int>/15                7.29 ns         9.37 ns
BM_std_minmax<unsigned int>/16                7.12 ns         9.99 ns
BM_std_minmax<unsigned int>/17                8.24 ns         10.6 ns
BM_std_minmax<unsigned int>/18                8.00 ns         11.2 ns
BM_std_minmax<unsigned int>/19                8.94 ns         12.0 ns
BM_std_minmax<unsigned int>/20                8.91 ns         12.6 ns
BM_std_minmax<unsigned int>/21                9.73 ns         17.2 ns
BM_std_minmax<unsigned int>/22                9.75 ns         13.8 ns
BM_std_minmax<unsigned int>/23                10.6 ns         14.5 ns
BM_std_minmax<unsigned int>/24                10.6 ns         15.1 ns
BM_std_minmax<unsigned int>/25                11.5 ns         15.7 ns
BM_std_minmax<unsigned int>/26                11.4 ns         16.3 ns
BM_std_minmax<unsigned int>/27                12.3 ns         17.0 ns
BM_std_minmax<unsigned int>/28                12.3 ns         17.6 ns
BM_std_minmax<unsigned int>/29                13.2 ns         18.3 ns
BM_std_minmax<unsigned int>/30                13.2 ns         19.0 ns
BM_std_minmax<unsigned int>/31                14.0 ns         19.6 ns
BM_std_minmax<unsigned int>/32                14.0 ns         3.39 ns
BM_std_minmax<unsigned int>/64                27.6 ns         4.05 ns
BM_std_minmax<unsigned int>/512                221 ns         14.2 ns
BM_std_minmax<unsigned int>/1024               439 ns         25.5 ns
BM_std_minmax<unsigned int>/4000              1720 ns         96.3 ns
BM_std_minmax<unsigned int>/4096              1762 ns         97.8 ns
BM_std_minmax<unsigned int>/5500              2364 ns          146 ns
BM_std_minmax<unsigned int>/64000            27874 ns         1905 ns
BM_std_minmax<unsigned int>/65536            28012 ns         1961 ns
BM_std_minmax<unsigned int>/70000            29899 ns         2087 ns
BM_std_minmax<unsigned long long>/1          0.707 ns         1.18 ns
BM_std_minmax<unsigned long long>/2          0.909 ns         1.65 ns
BM_std_minmax<unsigned long long>/3           1.65 ns         2.70 ns
BM_std_minmax<unsigned long long>/4           1.93 ns         2.69 ns
BM_std_minmax<unsigned long long>/5           2.45 ns         3.34 ns
BM_std_minmax<unsigned long long>/6           2.78 ns         3.81 ns
BM_std_minmax<unsigned long long>/7           3.28 ns         4.43 ns
BM_std_minmax<unsigned long long>/8           3.70 ns         4.92 ns
BM_std_minmax<unsigned long long>/9           4.12 ns         5.64 ns
BM_std_minmax<unsigned long long>/10          4.44 ns         6.15 ns
BM_std_minmax<unsigned long long>/11          4.91 ns         6.81 ns
BM_std_minmax<unsigned long long>/12          5.31 ns         7.41 ns
BM_std_minmax<unsigned long long>/13          5.72 ns         7.96 ns
BM_std_minmax<unsigned long long>/14          6.05 ns         8.66 ns
BM_std_minmax<unsigned long long>/15          6.55 ns         9.37 ns
BM_std_minmax<unsigned long long>/16          6.89 ns         7.98 ns
BM_std_minmax<unsigned long long>/17          7.34 ns         8.13 ns
BM_std_minmax<unsigned long long>/18          7.73 ns         8.42 ns
BM_std_minmax<unsigned long long>/19          8.26 ns         8.63 ns
BM_std_minmax<unsigned long long>/20          8.54 ns         8.96 ns
BM_std_minmax<unsigned long long>/21          9.14 ns         9.37 ns
BM_std_minmax<unsigned long long>/22          9.39 ns         9.67 ns
BM_std_minmax<unsigned long long>/23          10.1 ns         10.1 ns
BM_std_minmax<unsigned long long>/24          10.4 ns         10.6 ns
BM_std_minmax<unsigned long long>/25          11.0 ns         11.3 ns
BM_std_minmax<unsigned long long>/26          11.3 ns         12.1 ns
BM_std_minmax<unsigned long long>/27          11.8 ns         14.2 ns
BM_std_minmax<unsigned long long>/28          12.1 ns         15.8 ns
BM_std_minmax<unsigned long long>/29          12.6 ns         17.4 ns
BM_std_minmax<unsigned long long>/30          13.1 ns         18.1 ns
BM_std_minmax<unsigned long long>/31          13.4 ns         18.8 ns
BM_std_minmax<unsigned long long>/32          13.8 ns         10.4 ns
BM_std_minmax<unsigned long long>/64          27.3 ns         15.5 ns
BM_std_minmax<unsigned long long>/512          222 ns         80.6 ns
BM_std_minmax<unsigned long long>/1024         443 ns          156 ns
BM_std_minmax<unsigned long long>/4000        1731 ns          591 ns
BM_std_minmax<unsigned long long>/4096        1752 ns          609 ns
BM_std_minmax<unsigned long long>/5500        2340 ns          819 ns
BM_std_minmax<unsigned long long>/64000      27166 ns         9652 ns
BM_std_minmax<unsigned long long>/65536      27869 ns         9876 ns
BM_std_minmax<unsigned long long>/70000      29920 ns        10680 ns
```



>From 62a191cbd33ccc62aa66611a33262b3deff8c8ee Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser at berlin.de>
Date: Mon, 11 Mar 2024 13:18:12 +0100
Subject: [PATCH] [libc++] Optimize ranges::minmax

---
 libcxx/benchmarks/CMakeLists.txt              |  1 +
 libcxx/benchmarks/algorithms/minmax.bench.cpp | 68 +++++++++++++++++++
 libcxx/docs/ReleaseNotes/19.rst               |  2 +
 libcxx/include/__algorithm/comp.h             |  4 ++
 libcxx/include/__algorithm/ranges_minmax.h    | 14 +++-
 libcxx/include/__functional/operations.h      |  7 ++
 .../include/__functional/ranges_operations.h  |  3 +
 .../include/__type_traits/operation_traits.h  |  1 +
 8 files changed, 99 insertions(+), 1 deletion(-)
 create mode 100644 libcxx/benchmarks/algorithms/minmax.bench.cpp

diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt
index 387e013afeb6c4..928238c1ac69ba 100644
--- a/libcxx/benchmarks/CMakeLists.txt
+++ b/libcxx/benchmarks/CMakeLists.txt
@@ -182,6 +182,7 @@ set(BENCHMARK_TESTS
     algorithms/make_heap.bench.cpp
     algorithms/make_heap_then_sort_heap.bench.cpp
     algorithms/min.bench.cpp
+    algorithms/minmax.bench.cpp
     algorithms/min_max_element.bench.cpp
     algorithms/mismatch.bench.cpp
     algorithms/pop_heap.bench.cpp
diff --git a/libcxx/benchmarks/algorithms/minmax.bench.cpp b/libcxx/benchmarks/algorithms/minmax.bench.cpp
new file mode 100644
index 00000000000000..b0ff7f91c19939
--- /dev/null
+++ b/libcxx/benchmarks/algorithms/minmax.bench.cpp
@@ -0,0 +1,68 @@
+#include <algorithm>
+#include <cassert>
+
+#include <benchmark/benchmark.h>
+
+void run_sizes(auto benchmark) {
+  benchmark->Arg(1)
+      ->Arg(2)
+      ->Arg(3)
+      ->Arg(4)
+      ->Arg(5)
+      ->Arg(6)
+      ->Arg(7)
+      ->Arg(8)
+      ->Arg(9)
+      ->Arg(10)
+      ->Arg(11)
+      ->Arg(12)
+      ->Arg(13)
+      ->Arg(14)
+      ->Arg(15)
+      ->Arg(16)
+      ->Arg(17)
+      ->Arg(18)
+      ->Arg(19)
+      ->Arg(20)
+      ->Arg(21)
+      ->Arg(22)
+      ->Arg(23)
+      ->Arg(24)
+      ->Arg(25)
+      ->Arg(26)
+      ->Arg(27)
+      ->Arg(28)
+      ->Arg(29)
+      ->Arg(30)
+      ->Arg(31)
+      ->Arg(32)
+      ->Arg(64)
+      ->Arg(512)
+      ->Arg(1024)
+      ->Arg(4000)
+      ->Arg(4096)
+      ->Arg(5500)
+      ->Arg(64000)
+      ->Arg(65536)
+      ->Arg(70000);
+}
+
+template <class T>
+static void BM_std_minmax(benchmark::State& state) {
+  std::vector<T> vec(state.range(), 3);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(vec);
+    benchmark::DoNotOptimize(std::ranges::minmax(vec));
+  }
+}
+BENCHMARK(BM_std_minmax<char>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<short>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<int>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<long long>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned char>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned short>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned int>)->Apply(run_sizes);
+BENCHMARK(BM_std_minmax<unsigned long long>)->Apply(run_sizes);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index dd39c1bbbc78a3..b46ac1f443f756 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -53,6 +53,8 @@ Improvements and New Features
   resulting in a performance increase of up to 1400x.
 - The ``std::mismatch`` algorithm has been optimized for integral types, which can lead up to 40x performance
   improvements.
+- The ``std::ranges::minmax`` algorithm has been optimized for integral types, resulting in a performance increase of
+  up to 100x.
 
 Deprecations and Removals
 -------------------------
diff --git a/libcxx/include/__algorithm/comp.h b/libcxx/include/__algorithm/comp.h
index 3902f7560304a1..5741eac376dc13 100644
--- a/libcxx/include/__algorithm/comp.h
+++ b/libcxx/include/__algorithm/comp.h
@@ -11,6 +11,7 @@
 
 #include <__config>
 #include <__type_traits/integral_constant.h>
+#include <__type_traits/is_integral.h>
 #include <__type_traits/operation_traits.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -42,6 +43,9 @@ struct __less<void, void> {
   }
 };
 
+template <class _Tp>
+struct __desugars_to<__totally_ordered_less_tag, __less<>, _Tp, _Tp> : is_integral<_Tp> {};
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___ALGORITHM_COMP_H
diff --git a/libcxx/include/__algorithm/ranges_minmax.h b/libcxx/include/__algorithm/ranges_minmax.h
index 22a62b620c936f..30e31c1a455bcd 100644
--- a/libcxx/include/__algorithm/ranges_minmax.h
+++ b/libcxx/include/__algorithm/ranges_minmax.h
@@ -24,6 +24,7 @@
 #include <__ranges/access.h>
 #include <__ranges/concepts.h>
 #include <__type_traits/is_reference.h>
+#include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/remove_cvref.h>
 #include <__utility/forward.h>
 #include <__utility/move.h>
@@ -83,7 +84,18 @@ struct __fn {
 
     _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range has to contain at least one element");
 
-    if constexpr (forward_range<_Range>) {
+    if constexpr (contiguous_range<_Range> && is_integral_v<_ValueT> &&
+                  __is_cheap_to_copy<_ValueT> & __is_identity<_Proj>::value &&
+                  __desugars_to<__totally_ordered_less_tag, _Comp, _ValueT, _ValueT>::value) {
+      minmax_result<_ValueT> __result = {__r[0], __r[0]};
+      for (auto __e : __r) {
+        if (__e < __result.min)
+          __result.min = __e;
+        if (__result.max < __e)
+          __result.max = __e;
+      }
+      return __result;
+    } else if constexpr (forward_range<_Range>) {
       // Special-case the one element case. Avoid repeatedly initializing objects from the result of an iterator
       // dereference when doing so might not be idempotent. The `if constexpr` avoids the extra branch in cases where
       // it's not needed.
diff --git a/libcxx/include/__functional/operations.h b/libcxx/include/__functional/operations.h
index 7ddc00650f162f..ee3daba9bcbcb0 100644
--- a/libcxx/include/__functional/operations.h
+++ b/libcxx/include/__functional/operations.h
@@ -14,6 +14,7 @@
 #include <__functional/binary_function.h>
 #include <__functional/unary_function.h>
 #include <__type_traits/integral_constant.h>
+#include <__type_traits/is_integral.h>
 #include <__type_traits/operation_traits.h>
 #include <__utility/forward.h>
 
@@ -360,6 +361,9 @@ struct _LIBCPP_TEMPLATE_VIS less : __binary_function<_Tp, _Tp, bool> {
 };
 _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(less);
 
+template <class _Tp>
+struct __desugars_to<__totally_ordered_less_tag, less<_Tp>, _Tp, _Tp> : is_integral<_Tp> {};
+
 #if _LIBCPP_STD_VER >= 14
 template <>
 struct _LIBCPP_TEMPLATE_VIS less<void> {
@@ -371,6 +375,9 @@ struct _LIBCPP_TEMPLATE_VIS less<void> {
   }
   typedef void is_transparent;
 };
+
+template <class _Tp>
+struct __desugars_to<__totally_ordered_less_tag, less<>, _Tp, _Tp> : is_integral<_Tp> {};
 #endif
 
 #if _LIBCPP_STD_VER >= 14
diff --git a/libcxx/include/__functional/ranges_operations.h b/libcxx/include/__functional/ranges_operations.h
index 38b28018049eb7..c49913105577ab 100644
--- a/libcxx/include/__functional/ranges_operations.h
+++ b/libcxx/include/__functional/ranges_operations.h
@@ -100,6 +100,9 @@ struct greater_equal {
 template <class _Tp, class _Up>
 struct __desugars_to<__equal_tag, ranges::equal_to, _Tp, _Up> : true_type {};
 
+template <class _Tp, class _Up>
+struct __desugars_to<__totally_ordered_less_tag, ranges::less, _Tp, _Up> : true_type {};
+
 #endif // _LIBCPP_STD_VER >= 20
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__type_traits/operation_traits.h b/libcxx/include/__type_traits/operation_traits.h
index ef6e71693430d0..99ae37d0a5254d 100644
--- a/libcxx/include/__type_traits/operation_traits.h
+++ b/libcxx/include/__type_traits/operation_traits.h
@@ -21,6 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 // Tags to represent the canonical operations
 struct __equal_tag {};
 struct __plus_tag {};
+struct __totally_ordered_less_tag {};
 
 // This class template is used to determine whether an operation "desugars"
 // (or boils down) to a given canonical operation.



More information about the libcxx-commits mailing list