[libc-commits] [libc] 66a0329 - [libc] Implement branchless head-tail comparison for bcmp (#107540)

via libc-commits libc-commits at lists.llvm.org
Fri Sep 6 02:19:06 PDT 2024


Author: Vitaly Goldshteyn
Date: 2024-09-06T11:19:01+02:00
New Revision: 66a03295de26c61a2178bb3a697d355592cb0eb5

URL: https://github.com/llvm/llvm-project/commit/66a03295de26c61a2178bb3a697d355592cb0eb5
DIFF: https://github.com/llvm/llvm-project/commit/66a03295de26c61a2178bb3a697d355592cb0eb5.diff

LOG: [libc] Implement branchless head-tail comparison for bcmp (#107540)

Binary size changes:

| Bytes (cache lines) | before   | after   |
|---------------------|----------|---------|
| sse4                | 419 (7)  | 288 (5) |
| avx                 | 430 (7)  | 308 (5) |
| avx512f             | 589 (10) | 390 (7) |

Benchmarks for different CPUs using
https://github.com/google/fleetbench.

 - indus-cascadelake

```
name                                                       old speed            new speed            delta
BM_LIBC_Bcmp_Fleet_L1                                      1.96GB/s ± 1%        2.19GB/s ± 0%  +11.49%  (p=0.000 n=29+24)
BM_LIBC_Bcmp_Fleet_L2                                      1.90GB/s ± 1%        2.14GB/s ± 1%  +12.68%  (p=0.000 n=29+24)
BM_LIBC_Bcmp_Fleet_LLC                                      513MB/s ± 4%         531MB/s ± 4%   +3.53%  (p=0.000 n=24+24)
BM_LIBC_Bcmp_Fleet_Cold                                     452MB/s ± 3%         456MB/s ± 4%     ~     (p=0.103 n=30+30)
BM_LIBC_Bcmp_0_L1                                [Bcmp_0]  2.98GB/s ± 1%        3.15GB/s ± 1%   +5.59%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_0_L2                                [Bcmp_0]  2.86GB/s ± 1%        3.07GB/s ± 1%   +7.21%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_0_LLC                               [Bcmp_0]   738MB/s ± 7%         751MB/s ± 3%   +1.68%  (p=0.000 n=24+25)
BM_LIBC_Bcmp_0_Cold                              [Bcmp_0]   643MB/s ± 3%         642MB/s ± 4%     ~     (p=0.522 n=29+30)
BM_LIBC_Bcmp_1_L1                                [Bcmp_1]  3.08GB/s ± 0%        3.25GB/s ± 0%   +5.35%  (p=0.000 n=28+30)
BM_LIBC_Bcmp_1_L2                                [Bcmp_1]  2.97GB/s ± 1%        3.17GB/s ± 1%   +6.65%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_1_LLC                               [Bcmp_1]   901MB/s ±59%         871MB/s ±36%     ~     (p=0.676 n=29+27)
BM_LIBC_Bcmp_1_Cold                              [Bcmp_1]   686MB/s ± 4%         686MB/s ± 3%     ~     (p=0.934 n=29+30)
BM_LIBC_Bcmp_2_L1                                [Bcmp_2]  1.63GB/s ± 0%        1.80GB/s ± 1%  +10.19%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_2_L2                                [Bcmp_2]  1.57GB/s ± 1%        1.75GB/s ± 1%  +11.46%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_2_LLC                               [Bcmp_2]   451MB/s ±61%         427MB/s ±28%     ~     (p=0.469 n=29+25)
BM_LIBC_Bcmp_2_Cold                              [Bcmp_2]   353MB/s ± 4%         354MB/s ± 5%     ~     (p=0.467 n=30+30)
BM_LIBC_Bcmp_3_L1                                [Bcmp_3]  1.91GB/s ± 1%        2.10GB/s ± 1%   +9.90%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_3_L2                                [Bcmp_3]  1.84GB/s ± 1%        2.03GB/s ± 1%  +10.63%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_3_LLC                               [Bcmp_3]   491MB/s ±24%         538MB/s ±24%   +9.66%  (p=0.000 n=24+27)
BM_LIBC_Bcmp_3_Cold                              [Bcmp_3]   417MB/s ± 4%         421MB/s ± 3%     ~     (p=0.063 n=30+29)
BM_LIBC_Bcmp_4_L1                                [Bcmp_4]   761MB/s ± 1%         867MB/s ± 1%  +14.02%  (p=0.000 n=28+30)
BM_LIBC_Bcmp_4_L2                                [Bcmp_4]   748MB/s ± 1%         860MB/s ± 1%  +15.04%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_4_LLC                               [Bcmp_4]   227MB/s ±29%         260MB/s ±64%  +14.70%  (p=0.000 n=26+27)
BM_LIBC_Bcmp_4_Cold                              [Bcmp_4]   187MB/s ± 3%         191MB/s ± 5%   +2.26%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_5_L1                                [Bcmp_5]  1.48GB/s ± 1%        1.71GB/s ± 1%  +15.26%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_5_L2                                [Bcmp_5]  1.42GB/s ± 1%        1.67GB/s ± 1%  +17.68%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_5_LLC                               [Bcmp_5]   412MB/s ±34%         519MB/s ±80%  +25.87%  (p=0.000 n=27+30)
BM_LIBC_Bcmp_5_Cold                              [Bcmp_5]   336MB/s ± 4%         343MB/s ± 6%   +2.05%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_6_L1                                [Bcmp_6]  2.87GB/s ± 0%        3.24GB/s ± 1%  +12.88%  (p=0.000 n=26+30)
BM_LIBC_Bcmp_6_L2                                [Bcmp_6]  2.78GB/s ± 1%        3.20GB/s ± 1%  +15.15%  (p=0.000 n=26+30)
BM_LIBC_Bcmp_6_LLC                               [Bcmp_6]   926MB/s ±43%        1227MB/s ±76%  +32.53%  (p=0.000 n=27+30)
BM_LIBC_Bcmp_6_Cold                              [Bcmp_6]   716MB/s ± 4%         737MB/s ± 6%   +3.02%  (p=0.000 n=28+29)
BM_LIBC_Bcmp_7_L1                                [Bcmp_7]  1.54GB/s ± 1%        1.56GB/s ± 0%   +1.40%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_7_L2                                [Bcmp_7]  1.47GB/s ± 1%        1.52GB/s ± 1%   +2.97%  (p=0.000 n=27+30)
BM_LIBC_Bcmp_7_LLC                               [Bcmp_7]   351MB/s ±23%         436MB/s ±83%  +24.04%  (p=0.005 n=24+29)
BM_LIBC_Bcmp_7_Cold                              [Bcmp_7]   283MB/s ± 4%         282MB/s ± 4%     ~     (p=0.644 n=30+30)
BM_LIBC_Bcmp_8_L1                                [Bcmp_8]   824MB/s ± 1%        1048MB/s ± 1%  +27.18%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_8_L2                                [Bcmp_8]   808MB/s ± 1%        1027MB/s ± 1%  +27.12%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_8_LLC                               [Bcmp_8]   317MB/s ±79%         332MB/s ±74%     ~     (p=0.338 n=30+29)
BM_LIBC_Bcmp_8_Cold                              [Bcmp_8]   207MB/s ± 5%         212MB/s ± 5%   +2.27%  (p=0.000 n=30+30)
```

 - indus-skylake

```
name                                                       old speed            new speed            delta
BM_LIBC_Bcmp_Fleet_L1                                      2.06GB/s ± 2%        2.25GB/s ± 3%   +9.66%  (p=0.000 n=27+24)
BM_LIBC_Bcmp_Fleet_L2                                      1.96GB/s ± 2%        2.17GB/s ± 2%  +10.61%  (p=0.000 n=30+24)
BM_LIBC_Bcmp_Fleet_LLC                                     1.18GB/s ± 6%        1.32GB/s ± 5%  +12.27%  (p=0.000 n=28+28)
BM_LIBC_Bcmp_Fleet_Cold                                     456MB/s ± 2%         466MB/s ± 2%   +2.22%  (p=0.000 n=28+28)
BM_LIBC_Bcmp_0_L1                                [Bcmp_0]  3.08GB/s ± 2%        3.20GB/s ± 1%   +3.72%  (p=0.000 n=28+22)
BM_LIBC_Bcmp_0_L2                                [Bcmp_0]  2.92GB/s ± 1%        3.05GB/s ± 2%   +4.49%  (p=0.000 n=23+23)
BM_LIBC_Bcmp_0_LLC                               [Bcmp_0]  1.83GB/s ± 8%        1.94GB/s ± 4%   +6.24%  (p=0.000 n=25+27)
BM_LIBC_Bcmp_0_Cold                              [Bcmp_0]   654MB/s ± 2%         659MB/s ± 2%   +0.76%  (p=0.012 n=30+29)
BM_LIBC_Bcmp_1_L1                                [Bcmp_1]  3.19GB/s ± 2%        3.34GB/s ± 2%   +4.41%  (p=0.000 n=26+23)
BM_LIBC_Bcmp_1_L2                                [Bcmp_1]  3.05GB/s ± 2%        3.21GB/s ± 2%   +5.32%  (p=0.000 n=28+25)
BM_LIBC_Bcmp_1_LLC                               [Bcmp_1]  1.95GB/s ± 4%        2.03GB/s ±10%   +3.61%  (p=0.000 n=27+30)
BM_LIBC_Bcmp_1_Cold                              [Bcmp_1]   700MB/s ± 2%         702MB/s ± 2%     ~     (p=0.150 n=30+30)
BM_LIBC_Bcmp_2_L1                                [Bcmp_2]  1.69GB/s ± 2%        1.85GB/s ± 1%   +9.31%  (p=0.000 n=30+26)
BM_LIBC_Bcmp_2_L2                                [Bcmp_2]  1.60GB/s ± 2%        1.78GB/s ± 2%  +10.90%  (p=0.000 n=26+27)
BM_LIBC_Bcmp_2_LLC                               [Bcmp_2]  1.01GB/s ± 5%        1.12GB/s ± 5%  +11.40%  (p=0.000 n=27+28)
BM_LIBC_Bcmp_2_Cold                              [Bcmp_2]   355MB/s ± 3%         360MB/s ± 3%   +1.46%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_3_L1                                [Bcmp_3]  1.98GB/s ± 2%        2.15GB/s ± 2%   +8.89%  (p=0.000 n=29+27)
BM_LIBC_Bcmp_3_L2                                [Bcmp_3]  1.87GB/s ± 3%        2.05GB/s ± 2%  +10.06%  (p=0.000 n=30+26)
BM_LIBC_Bcmp_3_LLC                               [Bcmp_3]  1.19GB/s ± 4%        1.31GB/s ± 6%   +9.82%  (p=0.000 n=27+29)
BM_LIBC_Bcmp_3_Cold                              [Bcmp_3]   424MB/s ± 3%         431MB/s ± 3%   +1.58%  (p=0.000 n=28+30)
BM_LIBC_Bcmp_4_L1                                [Bcmp_4]   849MB/s ± 2%         949MB/s ± 2%  +11.84%  (p=0.000 n=27+28)
BM_LIBC_Bcmp_4_L2                                [Bcmp_4]   815MB/s ± 3%         913MB/s ± 3%  +12.06%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_4_LLC                               [Bcmp_4]   512MB/s ± 9%         571MB/s ± 7%  +11.40%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_4_Cold                              [Bcmp_4]   187MB/s ± 3%         192MB/s ± 2%   +2.56%  (p=0.000 n=30+28)
BM_LIBC_Bcmp_5_L1                                [Bcmp_5]  1.55GB/s ± 2%        1.77GB/s ± 3%  +13.93%  (p=0.000 n=30+28)
BM_LIBC_Bcmp_5_L2                                [Bcmp_5]  1.47GB/s ± 2%        1.70GB/s ± 2%  +15.96%  (p=0.000 n=27+26)
BM_LIBC_Bcmp_5_LLC                               [Bcmp_5]   939MB/s ± 5%        1084MB/s ± 4%  +15.36%  (p=0.000 n=28+27)
BM_LIBC_Bcmp_5_Cold                              [Bcmp_5]   340MB/s ± 2%         347MB/s ± 3%   +1.93%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_6_L1                                [Bcmp_6]  3.06GB/s ± 3%        3.40GB/s ± 2%  +11.13%  (p=0.000 n=30+28)
BM_LIBC_Bcmp_6_L2                                [Bcmp_6]  2.89GB/s ± 3%        3.24GB/s ± 2%  +12.20%  (p=0.000 n=29+26)
BM_LIBC_Bcmp_6_LLC                               [Bcmp_6]  1.93GB/s ± 4%        2.09GB/s ±11%   +8.16%  (p=0.000 n=26+30)
BM_LIBC_Bcmp_6_Cold                              [Bcmp_6]   746MB/s ± 2%         762MB/s ± 2%   +2.11%  (p=0.000 n=30+28)
BM_LIBC_Bcmp_7_L1                                [Bcmp_7]  1.59GB/s ± 2%        1.62GB/s ± 2%   +1.72%  (p=0.000 n=25+27)
BM_LIBC_Bcmp_7_L2                                [Bcmp_7]  1.49GB/s ± 2%        1.53GB/s ± 2%   +2.62%  (p=0.000 n=27+29)
BM_LIBC_Bcmp_7_LLC                               [Bcmp_7]   852MB/s ±10%         909MB/s ± 6%   +6.71%  (p=0.000 n=30+29)
BM_LIBC_Bcmp_7_Cold                              [Bcmp_7]   283MB/s ± 3%         283MB/s ± 2%     ~     (p=0.617 n=30+27)
BM_LIBC_Bcmp_8_L1                                [Bcmp_8]   891MB/s ± 2%        1083MB/s ± 2%  +21.64%  (p=0.000 n=27+24)
BM_LIBC_Bcmp_8_L2                                [Bcmp_8]   855MB/s ± 2%        1045MB/s ± 1%  +22.31%  (p=0.000 n=25+23)
BM_LIBC_Bcmp_8_LLC                               [Bcmp_8]   568MB/s ± 7%         659MB/s ± 8%  +16.04%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_8_Cold                              [Bcmp_8]   207MB/s ± 2%         212MB/s ± 2%   +2.31%  (p=0.000 n=30+27)
```

 - arcadia-rome

```
name                                                       old speed            new speed            delta
BM_LIBC_Bcmp_Fleet_L1                                      2.16GB/s ± 2%        2.27GB/s ± 2%   +5.13%  (p=0.000 n=26+30)
BM_LIBC_Bcmp_Fleet_L2                                      2.15GB/s ± 2%        2.25GB/s ± 2%   +4.64%  (p=0.000 n=27+30)
BM_LIBC_Bcmp_Fleet_LLC                                     1.73GB/s ± 3%        1.81GB/s ± 3%   +4.66%  (p=0.000 n=25+28)
BM_LIBC_Bcmp_Fleet_Cold                                     494MB/s ± 1%         496MB/s ± 2%   +0.45%  (p=0.023 n=22+24)
BM_LIBC_Bcmp_0_L1                                [Bcmp_0]  3.30GB/s ± 1%        3.24GB/s ± 2%   -1.70%  (p=0.000 n=27+30)
BM_LIBC_Bcmp_0_L2                                [Bcmp_0]  3.23GB/s ± 2%        3.19GB/s ± 2%   -1.28%  (p=0.000 n=28+28)
BM_LIBC_Bcmp_0_LLC                               [Bcmp_0]  2.59GB/s ± 3%        2.58GB/s ± 2%   -0.65%  (p=0.010 n=26+26)
BM_LIBC_Bcmp_0_Cold                              [Bcmp_0]   720MB/s ± 1%         707MB/s ± 3%   -1.75%  (p=0.000 n=22+25)
BM_LIBC_Bcmp_1_L1                                [Bcmp_1]  3.37GB/s ± 1%        3.36GB/s ± 2%     ~     (p=0.102 n=28+29)
BM_LIBC_Bcmp_1_L2                                [Bcmp_1]  3.32GB/s ± 2%        3.30GB/s ± 2%   -0.51%  (p=0.038 n=28+29)
BM_LIBC_Bcmp_1_LLC                               [Bcmp_1]  2.67GB/s ± 4%        2.70GB/s ± 4%   +0.96%  (p=0.009 n=28+27)
BM_LIBC_Bcmp_1_Cold                              [Bcmp_1]   755MB/s ± 1%         751MB/s ± 2%   -0.57%  (p=0.000 n=22+25)
BM_LIBC_Bcmp_2_L1                                [Bcmp_2]  1.79GB/s ± 1%        1.86GB/s ± 2%   +3.92%  (p=0.000 n=27+29)
BM_LIBC_Bcmp_2_L2                                [Bcmp_2]  1.77GB/s ± 2%        1.82GB/s ± 2%   +2.99%  (p=0.000 n=28+29)
BM_LIBC_Bcmp_2_LLC                               [Bcmp_2]  1.41GB/s ± 4%        1.47GB/s ± 3%   +3.97%  (p=0.000 n=28+28)
BM_LIBC_Bcmp_2_Cold                              [Bcmp_2]   386MB/s ± 1%         389MB/s ± 1%   +0.60%  (p=0.000 n=21+23)
BM_LIBC_Bcmp_3_L1                                [Bcmp_3]  2.07GB/s ± 2%        2.17GB/s ± 2%   +4.87%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_3_L2                                [Bcmp_3]  2.07GB/s ± 2%        2.13GB/s ± 2%   +3.02%  (p=0.000 n=28+30)
BM_LIBC_Bcmp_3_LLC                               [Bcmp_3]  1.66GB/s ± 2%        1.73GB/s ± 2%   +4.08%  (p=0.000 n=29+26)
BM_LIBC_Bcmp_3_Cold                              [Bcmp_3]   466MB/s ± 2%         469MB/s ± 3%   +0.66%  (p=0.001 n=22+25)
BM_LIBC_Bcmp_4_L1                                [Bcmp_4]   861MB/s ± 1%         964MB/s ± 2%  +11.98%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_4_L2                                [Bcmp_4]   853MB/s ± 2%         935MB/s ± 2%   +9.54%  (p=0.000 n=28+29)
BM_LIBC_Bcmp_4_LLC                               [Bcmp_4]   707MB/s ± 3%         743MB/s ± 4%   +5.08%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_4_Cold                              [Bcmp_4]   199MB/s ± 3%         199MB/s ± 2%     ~     (p=0.107 n=29+25)
BM_LIBC_Bcmp_5_L1                                [Bcmp_5]  1.65GB/s ± 1%        1.75GB/s ± 2%   +6.15%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_5_L2                                [Bcmp_5]  1.64GB/s ± 3%        1.73GB/s ± 2%   +5.37%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_5_LLC                               [Bcmp_5]  1.32GB/s ± 2%        1.40GB/s ± 2%   +6.21%  (p=0.000 n=28+27)
BM_LIBC_Bcmp_5_Cold                              [Bcmp_5]   370MB/s ± 3%         371MB/s ± 2%   +0.16%  (p=0.008 n=29+25)
BM_LIBC_Bcmp_6_L1                                [Bcmp_6]  3.25GB/s ± 2%        3.47GB/s ± 2%   +6.74%  (p=0.000 n=28+29)
BM_LIBC_Bcmp_6_L2                                [Bcmp_6]  3.26GB/s ± 1%        3.44GB/s ± 1%   +5.43%  (p=0.000 n=28+29)
BM_LIBC_Bcmp_6_LLC                               [Bcmp_6]  2.66GB/s ± 2%        2.79GB/s ± 3%   +4.90%  (p=0.000 n=27+29)
BM_LIBC_Bcmp_6_Cold                              [Bcmp_6]   812MB/s ± 3%         799MB/s ± 2%   -1.57%  (p=0.000 n=29+25)
BM_LIBC_Bcmp_7_L1                                [Bcmp_7]  1.71GB/s ± 2%        1.66GB/s ± 2%   -3.14%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_7_L2                                [Bcmp_7]  1.63GB/s ± 2%        1.59GB/s ± 2%   -2.50%  (p=0.000 n=29+28)
BM_LIBC_Bcmp_7_LLC                               [Bcmp_7]  1.25GB/s ± 4%        1.25GB/s ± 2%     ~     (p=0.530 n=28+26)
BM_LIBC_Bcmp_7_Cold                              [Bcmp_7]   311MB/s ± 3%         308MB/s ± 1%     ~     (p=0.127 n=29+24)
BM_LIBC_Bcmp_8_L1                                [Bcmp_8]   869MB/s ± 2%        1098MB/s ± 2%  +26.28%  (p=0.000 n=27+29)
BM_LIBC_Bcmp_8_L2                                [Bcmp_8]   873MB/s ± 2%        1075MB/s ± 1%  +23.06%  (p=0.000 n=27+29)
BM_LIBC_Bcmp_8_LLC                               [Bcmp_8]   743MB/s ± 4%         859MB/s ± 4%  +15.58%  (p=0.000 n=27+27)
BM_LIBC_Bcmp_8_Cold                              [Bcmp_8]   221MB/s ± 4%         221MB/s ± 3%   +0.14%  (p=0.034 n=29+25)
```

 - ixion-haswell

```
name                                                       old speed            new speed            delta
BM_LIBC_Bcmp_Fleet_L1                                      2.27GB/s ± 5%        2.41GB/s ± 6%   +6.10%  (p=0.000 n=29+28)
BM_LIBC_Bcmp_Fleet_L2                                      2.14GB/s ± 6%        2.33GB/s ± 5%   +9.21%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_Fleet_LLC                                     1.30GB/s ± 9%        1.43GB/s ± 8%   +9.85%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_Fleet_Cold                                     475MB/s ± 6%         475MB/s ± 5%     ~     (p=0.839 n=30+29)
BM_LIBC_Bcmp_0_L1                                [Bcmp_0]  3.38GB/s ± 7%        3.46GB/s ± 6%   +2.35%  (p=0.009 n=30+29)
BM_LIBC_Bcmp_0_L2                                [Bcmp_0]  3.20GB/s ± 5%        3.32GB/s ± 6%   +3.52%  (p=0.000 n=28+30)
BM_LIBC_Bcmp_0_LLC                               [Bcmp_0]  1.88GB/s ± 9%        2.00GB/s ± 6%   +6.63%  (p=0.000 n=30+28)
BM_LIBC_Bcmp_0_Cold                              [Bcmp_0]   664MB/s ± 6%         655MB/s ± 6%   -1.32%  (p=0.025 n=30+30)
BM_LIBC_Bcmp_1_L1                                [Bcmp_1]  3.50GB/s ± 8%        3.61GB/s ±10%   +3.09%  (p=0.001 n=29+30)
BM_LIBC_Bcmp_1_L2                                [Bcmp_1]  3.32GB/s ± 7%        3.48GB/s ± 8%   +4.89%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_1_LLC                               [Bcmp_1]  2.02GB/s ± 7%        2.14GB/s ± 9%   +5.82%  (p=0.000 n=28+29)
BM_LIBC_Bcmp_1_Cold                              [Bcmp_1]   716MB/s ± 6%         709MB/s ± 5%   -0.97%  (p=0.040 n=30+28)
BM_LIBC_Bcmp_2_L1                                [Bcmp_2]  1.83GB/s ± 7%        1.97GB/s ± 8%   +7.90%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_2_L2                                [Bcmp_2]  1.74GB/s ± 6%        1.92GB/s ± 6%  +10.29%  (p=0.000 n=30+29)
BM_LIBC_Bcmp_2_LLC                               [Bcmp_2]  1.05GB/s ± 9%        1.15GB/s ± 9%   +9.73%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_2_Cold                              [Bcmp_2]   379MB/s ± 6%         372MB/s ± 6%   -1.74%  (p=0.012 n=30+30)
BM_LIBC_Bcmp_3_L1                                [Bcmp_3]  2.17GB/s ± 5%        2.29GB/s ± 6%   +5.61%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_3_L2                                [Bcmp_3]  2.02GB/s ± 6%        2.20GB/s ± 6%   +8.75%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_3_LLC                               [Bcmp_3]  1.22GB/s ± 8%        1.34GB/s ± 9%   +9.19%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_3_Cold                              [Bcmp_3]   447MB/s ± 3%         441MB/s ± 7%   -1.40%  (p=0.033 n=30+30)
BM_LIBC_Bcmp_4_L1                                [Bcmp_4]   902MB/s ± 6%         995MB/s ±10%  +10.37%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_4_L2                                [Bcmp_4]   863MB/s ± 5%         945MB/s ±11%   +9.50%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_4_LLC                               [Bcmp_4]   528MB/s ±11%         559MB/s ±12%   +5.75%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_4_Cold                              [Bcmp_4]   183MB/s ± 4%         181MB/s ± 7%     ~     (p=0.088 n=28+30)
BM_LIBC_Bcmp_5_L1                                [Bcmp_5]  1.70GB/s ± 6%        1.87GB/s ± 8%  +10.14%  (p=0.000 n=29+29)
BM_LIBC_Bcmp_5_L2                                [Bcmp_5]  1.60GB/s ± 5%        1.80GB/s ± 9%  +12.61%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_5_LLC                               [Bcmp_5]   994MB/s ±13%        1094MB/s ± 8%  +10.10%  (p=0.000 n=29+30)
BM_LIBC_Bcmp_5_Cold                              [Bcmp_5]   362MB/s ± 6%         358MB/s ± 7%     ~     (p=0.123 n=30+30)
BM_LIBC_Bcmp_6_L1                                [Bcmp_6]  3.31GB/s ± 5%        3.67GB/s ± 6%  +10.90%  (p=0.000 n=28+30)
BM_LIBC_Bcmp_6_L2                                [Bcmp_6]  3.11GB/s ± 5%        3.53GB/s ± 5%  +13.59%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_6_LLC                               [Bcmp_6]  1.98GB/s ± 9%        2.18GB/s ± 8%  +10.34%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_6_Cold                              [Bcmp_6]   754MB/s ± 5%         752MB/s ± 5%     ~     (p=0.592 n=30+30)
BM_LIBC_Bcmp_7_L1                                [Bcmp_7]  1.72GB/s ± 5%        1.72GB/s ± 6%     ~     (p=0.549 n=29+29)
BM_LIBC_Bcmp_7_L2                                [Bcmp_7]  1.61GB/s ± 7%        1.63GB/s ± 8%     ~     (p=0.191 n=30+29)
BM_LIBC_Bcmp_7_LLC                               [Bcmp_7]   913MB/s ± 8%         905MB/s ± 9%     ~     (p=0.423 n=30+30)
BM_LIBC_Bcmp_7_Cold                              [Bcmp_7]   304MB/s ± 6%         287MB/s ± 4%   -5.57%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_8_L1                                [Bcmp_8]   961MB/s ± 5%        1124MB/s ± 6%  +16.94%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_8_L2                                [Bcmp_8]   915MB/s ± 8%        1100MB/s ± 7%  +20.16%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_8_LLC                               [Bcmp_8]   593MB/s ± 8%         669MB/s ± 8%  +12.92%  (p=0.000 n=30+30)
BM_LIBC_Bcmp_8_Cold                              [Bcmp_8]   220MB/s ± 4%         220MB/s ± 6%     ~     (p=0.572 n=30+30)
```

Co-authored-by: goldvitaly at google.com <%username%@google.com>

Added: 
    

Modified: 
    libc/src/string/memory_utils/op_x86.h
    libc/src/string/memory_utils/x86_64/inline_bcmp.h

Removed: 
    


################################################################################
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index cf9667283818d8..ab694e25fe0fe1 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -63,6 +63,15 @@ struct Memcpy {
 namespace LIBC_NAMESPACE_DECL {
 namespace generic {
 
+// Not equals: returns non-zero iff values at head or tail differ.
+// This function typically loads more data than necessary when the two buffer
+// differs.
+template <typename T>
+LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
+  static_assert(cpp::is_integral_v<T>);
+  return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Specializations for uint16_t
 template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
@@ -133,6 +142,11 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
 #if defined(__SSE4_1__)
 template <> struct is_vector<__m128i> : cpp::true_type {};
 template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
+LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
+  const auto a = load<__m128i>(p1, offset);
+  const auto b = load<__m128i>(p2, offset);
+  return _mm_xor_si128(a, b);
+}
 LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
   return _mm_max_epu8(a, b);
 }
@@ -144,17 +158,21 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
   return static_cast<uint16_t>(
       _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
 }
+LIBC_INLINE bool is_zero(__m128i value) {
+  return _mm_testz_si128(value, value) == 1;
+}
 template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
-  const auto a = load<__m128i>(p1, offset);
-  const auto b = load<__m128i>(p2, offset);
-  const auto xored = _mm_xor_si128(a, b);
-  return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
+  return is_zero(load_and_xor_m128i(p1, p2, offset));
 }
 template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
-  const auto a = load<__m128i>(p1, offset);
-  const auto b = load<__m128i>(p2, offset);
-  const auto xored = _mm_xor_si128(a, b);
-  return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
+  return !is_zero(load_and_xor_m128i(p1, p2, offset));
+}
+template <>
+LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
+                                                       size_t count) {
+  const __m128i head = load_and_xor_m128i(p1, p2, 0);
+  const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
+  return !is_zero(_mm_or_si128(head, tail));
 }
 template <>
 LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
@@ -173,19 +191,34 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
 #if defined(__AVX__)
 template <> struct is_vector<__m256i> : cpp::true_type {};
 template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
-template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
-  const auto a = load<__m256i>(p1, offset);
-  const auto b = load<__m256i>(p2, offset);
-  const auto xored = _mm256_castps_si256(
+LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
+  return _mm256_castps_si256(
       _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-  return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
 }
-template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
+LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
+  return _mm256_castps_si256(
+      _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m256i>(p1, offset);
   const auto b = load<__m256i>(p2, offset);
-  const auto xored = _mm256_castps_si256(
-      _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
-  return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
+  return xor_m256i(a, b);
+}
+LIBC_INLINE bool is_zero(__m256i value) {
+  return _mm256_testz_si256(value, value) == 1;
+}
+template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
+  return is_zero(load_and_xor_m256i(p1, p2, offset));
+}
+template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
+  return !is_zero(load_and_xor_m256i(p1, p2, offset));
+}
+template <>
+LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
+                                                       size_t count) {
+  const __m256i head = load_and_xor_m256i(p1, p2, 0);
+  const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
+  return !is_zero(or_m256i(head, tail));
 }
 #endif // __AVX__
 
@@ -300,9 +333,22 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
 template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m512i>(p1, offset);
   const auto b = load<__m512i>(p2, offset);
-  const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
-  return static_cast<uint32_t>(xored >> 32) |
-         static_cast<uint32_t>(xored & 0xFFFFFFFF);
+  return _mm512_cmpneq_epi8_mask(a, b) != 0;
+}
+LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
+  const auto a = load<__m512i>(p1, offset);
+  const auto b = load<__m512i>(p2, offset);
+  return _mm512_xor_epi64(a, b);
+}
+LIBC_INLINE bool is_zero(__m512i value) {
+  return _mm512_test_epi32_mask(value, value) == 0;
+}
+template <>
+LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
+                                                       size_t count) {
+  const __m512i head = load_and_xor_m512i(p1, p2, 0);
+  const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
+  return !is_zero(_mm512_or_epi64(head, tail));
 }
 template <>
 LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {

diff --git a/libc/src/string/memory_utils/x86_64/inline_bcmp.h b/libc/src/string/memory_utils/x86_64/inline_bcmp.h
index 49fe08fb0501b5..cc54c4140ee6e6 100644
--- a/libc/src/string/memory_utils/x86_64/inline_bcmp.h
+++ b/libc/src/string/memory_utils/x86_64/inline_bcmp.h
@@ -27,7 +27,7 @@ inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+    return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
   return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // __SSE4_1__
@@ -36,9 +36,9 @@ inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+    return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
   if (count <= 64)
-    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+    return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
   return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // __AVX__
@@ -47,11 +47,11 @@ inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
   if (count <= 32)
-    return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+    return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
   if (count <= 64)
-    return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+    return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
   if (count <= 128)
-    return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
+    return generic::branchless_head_tail_neq<__m512i>(p1, p2, count);
   return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
 }
 #endif // __AVX512BW__
@@ -62,22 +62,12 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
     return BcmpReturnType::zero();
   if (count == 1)
     return generic::Bcmp<uint8_t>::block(p1, p2);
-  if (count == 2)
-    return generic::Bcmp<uint16_t>::block(p1, p2);
-  if (count == 3)
-    return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
-  if (count == 4)
-    return generic::Bcmp<uint32_t>::block(p1, p2);
-  if (count == 5)
-    return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
-  if (count == 6)
-    return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
-  if (count == 7)
-    return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
-  if (count == 8)
-    return generic::Bcmp<uint64_t>::block(p1, p2);
+  if (count <= 4)
+    return generic::branchless_head_tail_neq<uint16_t>(p1, p2, count);
+  if (count <= 8)
+    return generic::branchless_head_tail_neq<uint32_t>(p1, p2, count);
   if (count <= 16)
-    return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
+    return generic::branchless_head_tail_neq<uint64_t>(p1, p2, count);
 #if defined(__AVX512BW__)
   return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
 #elif defined(__AVX__)


        


More information about the libc-commits mailing list