[libc-commits] [libc] libc: Implement branchless head-tail comparison for bcmp. (PR #107540)
Vitaly Goldshteyn via libc-commits
libc-commits at lists.llvm.org
Fri Sep 6 01:02:25 PDT 2024
https://github.com/goldvitaly created https://github.com/llvm/llvm-project/pull/107540
Binary size changes:
| Bytes (cache lines) | before | after |
|---------------------|----------|---------|
| sse4 | 419 (7) | 288 (5) |
| avx | 430 (7) | 308 (5) |
| avx512f | 589 (10) | 390 (7) |
Benchmarks for different CPUs using https://github.com/google/fleetbench.
- indus-cascadelake
```
name old speed new speed delta
BM_LIBC_Bcmp_Fleet_L1 1.96GB/s ± 1% 2.19GB/s ± 0% +11.49% (p=0.000 n=29+24)
BM_LIBC_Bcmp_Fleet_L2 1.90GB/s ± 1% 2.14GB/s ± 1% +12.68% (p=0.000 n=29+24)
BM_LIBC_Bcmp_Fleet_LLC 513MB/s ± 4% 531MB/s ± 4% +3.53% (p=0.000 n=24+24)
BM_LIBC_Bcmp_Fleet_Cold 452MB/s ± 3% 456MB/s ± 4% ~ (p=0.103 n=30+30)
BM_LIBC_Bcmp_adgroup-server_L1 [Bcmp_0] 2.98GB/s ± 1% 3.15GB/s ± 1% +5.59% (p=0.000 n=29+30)
BM_LIBC_Bcmp_adgroup-server_L2 [Bcmp_0] 2.86GB/s ± 1% 3.07GB/s ± 1% +7.21% (p=0.000 n=29+30)
BM_LIBC_Bcmp_adgroup-server_LLC [Bcmp_0] 738MB/s ± 7% 751MB/s ± 3% +1.68% (p=0.000 n=24+25)
BM_LIBC_Bcmp_adgroup-server_Cold [Bcmp_0] 643MB/s ± 3% 642MB/s ± 4% ~ (p=0.522 n=29+30)
BM_LIBC_Bcmp_bigtable_tablet_server_L1 [Bcmp_1] 3.08GB/s ± 0% 3.25GB/s ± 0% +5.35% (p=0.000 n=28+30)
BM_LIBC_Bcmp_bigtable_tablet_server_L2 [Bcmp_1] 2.97GB/s ± 1% 3.17GB/s ± 1% +6.65% (p=0.000 n=29+30)
BM_LIBC_Bcmp_bigtable_tablet_server_LLC [Bcmp_1] 901MB/s ±59% 871MB/s ±36% ~ (p=0.676 n=29+27)
BM_LIBC_Bcmp_bigtable_tablet_server_Cold [Bcmp_1] 686MB/s ± 4% 686MB/s ± 3% ~ (p=0.934 n=29+30)
BM_LIBC_Bcmp_d_L1 [Bcmp_2] 1.63GB/s ± 0% 1.80GB/s ± 1% +10.19% (p=0.000 n=29+30)
BM_LIBC_Bcmp_d_L2 [Bcmp_2] 1.57GB/s ± 1% 1.75GB/s ± 1% +11.46% (p=0.000 n=29+30)
BM_LIBC_Bcmp_d_LLC [Bcmp_2] 451MB/s ±61% 427MB/s ±28% ~ (p=0.469 n=29+25)
BM_LIBC_Bcmp_d_Cold [Bcmp_2] 353MB/s ± 4% 354MB/s ± 5% ~ (p=0.467 n=30+30)
BM_LIBC_Bcmp_dremel-server_L1 [Bcmp_3] 1.91GB/s ± 1% 2.10GB/s ± 1% +9.90% (p=0.000 n=29+29)
BM_LIBC_Bcmp_dremel-server_L2 [Bcmp_3] 1.84GB/s ± 1% 2.03GB/s ± 1% +10.63% (p=0.000 n=29+30)
BM_LIBC_Bcmp_dremel-server_LLC [Bcmp_3] 491MB/s ±24% 538MB/s ±24% +9.66% (p=0.000 n=24+27)
BM_LIBC_Bcmp_dremel-server_Cold [Bcmp_3] 417MB/s ± 4% 421MB/s ± 3% ~ (p=0.063 n=30+29)
BM_LIBC_Bcmp_logs_proxy_L1 [Bcmp_4] 761MB/s ± 1% 867MB/s ± 1% +14.02% (p=0.000 n=28+30)
BM_LIBC_Bcmp_logs_proxy_L2 [Bcmp_4] 748MB/s ± 1% 860MB/s ± 1% +15.04% (p=0.000 n=30+30)
BM_LIBC_Bcmp_logs_proxy_LLC [Bcmp_4] 227MB/s ±29% 260MB/s ±64% +14.70% (p=0.000 n=26+27)
BM_LIBC_Bcmp_logs_proxy_Cold [Bcmp_4] 187MB/s ± 3% 191MB/s ± 5% +2.26% (p=0.000 n=30+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L1 [Bcmp_5] 1.48GB/s ± 1% 1.71GB/s ± 1% +15.26% (p=0.000 n=29+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L2 [Bcmp_5] 1.42GB/s ± 1% 1.67GB/s ± 1% +17.68% (p=0.000 n=29+29)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_LLC [Bcmp_5] 412MB/s ±34% 519MB/s ±80% +25.87% (p=0.000 n=27+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_Cold [Bcmp_5] 336MB/s ± 4% 343MB/s ± 6% +2.05% (p=0.000 n=30+30)
BM_LIBC_Bcmp_span_server_L1 [Bcmp_6] 2.87GB/s ± 0% 3.24GB/s ± 1% +12.88% (p=0.000 n=26+30)
BM_LIBC_Bcmp_span_server_L2 [Bcmp_6] 2.78GB/s ± 1% 3.20GB/s ± 1% +15.15% (p=0.000 n=26+30)
BM_LIBC_Bcmp_span_server_LLC [Bcmp_6] 926MB/s ±43% 1227MB/s ±76% +32.53% (p=0.000 n=27+30)
BM_LIBC_Bcmp_span_server_Cold [Bcmp_6] 716MB/s ± 4% 737MB/s ± 6% +3.02% (p=0.000 n=28+29)
BM_LIBC_Bcmp_sr_www_L1 [Bcmp_7] 1.54GB/s ± 1% 1.56GB/s ± 0% +1.40% (p=0.000 n=29+30)
BM_LIBC_Bcmp_sr_www_L2 [Bcmp_7] 1.47GB/s ± 1% 1.52GB/s ± 1% +2.97% (p=0.000 n=27+30)
BM_LIBC_Bcmp_sr_www_LLC [Bcmp_7] 351MB/s ±23% 436MB/s ±83% +24.04% (p=0.005 n=24+29)
BM_LIBC_Bcmp_sr_www_Cold [Bcmp_7] 283MB/s ± 4% 282MB/s ± 4% ~ (p=0.644 n=30+30)
BM_LIBC_Bcmp_union_worker_L1 [Bcmp_8] 824MB/s ± 1% 1048MB/s ± 1% +27.18% (p=0.000 n=29+30)
BM_LIBC_Bcmp_union_worker_L2 [Bcmp_8] 808MB/s ± 1% 1027MB/s ± 1% +27.12% (p=0.000 n=29+29)
BM_LIBC_Bcmp_union_worker_LLC [Bcmp_8] 317MB/s ±79% 332MB/s ±74% ~ (p=0.338 n=30+29)
BM_LIBC_Bcmp_union_worker_Cold [Bcmp_8] 207MB/s ± 5% 212MB/s ± 5% +2.27% (p=0.000 n=30+30)
```
- indus-skylake
```
name old speed new speed delta
BM_LIBC_Bcmp_Fleet_L1 2.06GB/s ± 2% 2.25GB/s ± 3% +9.66% (p=0.000 n=27+24)
BM_LIBC_Bcmp_Fleet_L2 1.96GB/s ± 2% 2.17GB/s ± 2% +10.61% (p=0.000 n=30+24)
BM_LIBC_Bcmp_Fleet_LLC 1.18GB/s ± 6% 1.32GB/s ± 5% +12.27% (p=0.000 n=28+28)
BM_LIBC_Bcmp_Fleet_Cold 456MB/s ± 2% 466MB/s ± 2% +2.22% (p=0.000 n=28+28)
BM_LIBC_Bcmp_adgroup-server_L1 [Bcmp_0] 3.08GB/s ± 2% 3.20GB/s ± 1% +3.72% (p=0.000 n=28+22)
BM_LIBC_Bcmp_adgroup-server_L2 [Bcmp_0] 2.92GB/s ± 1% 3.05GB/s ± 2% +4.49% (p=0.000 n=23+23)
BM_LIBC_Bcmp_adgroup-server_LLC [Bcmp_0] 1.83GB/s ± 8% 1.94GB/s ± 4% +6.24% (p=0.000 n=25+27)
BM_LIBC_Bcmp_adgroup-server_Cold [Bcmp_0] 654MB/s ± 2% 659MB/s ± 2% +0.76% (p=0.012 n=30+29)
BM_LIBC_Bcmp_bigtable_tablet_server_L1 [Bcmp_1] 3.19GB/s ± 2% 3.34GB/s ± 2% +4.41% (p=0.000 n=26+23)
BM_LIBC_Bcmp_bigtable_tablet_server_L2 [Bcmp_1] 3.05GB/s ± 2% 3.21GB/s ± 2% +5.32% (p=0.000 n=28+25)
BM_LIBC_Bcmp_bigtable_tablet_server_LLC [Bcmp_1] 1.95GB/s ± 4% 2.03GB/s ±10% +3.61% (p=0.000 n=27+30)
BM_LIBC_Bcmp_bigtable_tablet_server_Cold [Bcmp_1] 700MB/s ± 2% 702MB/s ± 2% ~ (p=0.150 n=30+30)
BM_LIBC_Bcmp_d_L1 [Bcmp_2] 1.69GB/s ± 2% 1.85GB/s ± 1% +9.31% (p=0.000 n=30+26)
BM_LIBC_Bcmp_d_L2 [Bcmp_2] 1.60GB/s ± 2% 1.78GB/s ± 2% +10.90% (p=0.000 n=26+27)
BM_LIBC_Bcmp_d_LLC [Bcmp_2] 1.01GB/s ± 5% 1.12GB/s ± 5% +11.40% (p=0.000 n=27+28)
BM_LIBC_Bcmp_d_Cold [Bcmp_2] 355MB/s ± 3% 360MB/s ± 3% +1.46% (p=0.000 n=30+30)
BM_LIBC_Bcmp_dremel-server_L1 [Bcmp_3] 1.98GB/s ± 2% 2.15GB/s ± 2% +8.89% (p=0.000 n=29+27)
BM_LIBC_Bcmp_dremel-server_L2 [Bcmp_3] 1.87GB/s ± 3% 2.05GB/s ± 2% +10.06% (p=0.000 n=30+26)
BM_LIBC_Bcmp_dremel-server_LLC [Bcmp_3] 1.19GB/s ± 4% 1.31GB/s ± 6% +9.82% (p=0.000 n=27+29)
BM_LIBC_Bcmp_dremel-server_Cold [Bcmp_3] 424MB/s ± 3% 431MB/s ± 3% +1.58% (p=0.000 n=28+30)
BM_LIBC_Bcmp_logs_proxy_L1 [Bcmp_4] 849MB/s ± 2% 949MB/s ± 2% +11.84% (p=0.000 n=27+28)
BM_LIBC_Bcmp_logs_proxy_L2 [Bcmp_4] 815MB/s ± 3% 913MB/s ± 3% +12.06% (p=0.000 n=29+30)
BM_LIBC_Bcmp_logs_proxy_LLC [Bcmp_4] 512MB/s ± 9% 571MB/s ± 7% +11.40% (p=0.000 n=30+30)
BM_LIBC_Bcmp_logs_proxy_Cold [Bcmp_4] 187MB/s ± 3% 192MB/s ± 2% +2.56% (p=0.000 n=30+28)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L1 [Bcmp_5] 1.55GB/s ± 2% 1.77GB/s ± 3% +13.93% (p=0.000 n=30+28)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L2 [Bcmp_5] 1.47GB/s ± 2% 1.70GB/s ± 2% +15.96% (p=0.000 n=27+26)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_LLC [Bcmp_5] 939MB/s ± 5% 1084MB/s ± 4% +15.36% (p=0.000 n=28+27)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_Cold [Bcmp_5] 340MB/s ± 2% 347MB/s ± 3% +1.93% (p=0.000 n=30+30)
BM_LIBC_Bcmp_span_server_L1 [Bcmp_6] 3.06GB/s ± 3% 3.40GB/s ± 2% +11.13% (p=0.000 n=30+28)
BM_LIBC_Bcmp_span_server_L2 [Bcmp_6] 2.89GB/s ± 3% 3.24GB/s ± 2% +12.20% (p=0.000 n=29+26)
BM_LIBC_Bcmp_span_server_LLC [Bcmp_6] 1.93GB/s ± 4% 2.09GB/s ±11% +8.16% (p=0.000 n=26+30)
BM_LIBC_Bcmp_span_server_Cold [Bcmp_6] 746MB/s ± 2% 762MB/s ± 2% +2.11% (p=0.000 n=30+28)
BM_LIBC_Bcmp_sr_www_L1 [Bcmp_7] 1.59GB/s ± 2% 1.62GB/s ± 2% +1.72% (p=0.000 n=25+27)
BM_LIBC_Bcmp_sr_www_L2 [Bcmp_7] 1.49GB/s ± 2% 1.53GB/s ± 2% +2.62% (p=0.000 n=27+29)
BM_LIBC_Bcmp_sr_www_LLC [Bcmp_7] 852MB/s ±10% 909MB/s ± 6% +6.71% (p=0.000 n=30+29)
BM_LIBC_Bcmp_sr_www_Cold [Bcmp_7] 283MB/s ± 3% 283MB/s ± 2% ~ (p=0.617 n=30+27)
BM_LIBC_Bcmp_union_worker_L1 [Bcmp_8] 891MB/s ± 2% 1083MB/s ± 2% +21.64% (p=0.000 n=27+24)
BM_LIBC_Bcmp_union_worker_L2 [Bcmp_8] 855MB/s ± 2% 1045MB/s ± 1% +22.31% (p=0.000 n=25+23)
BM_LIBC_Bcmp_union_worker_LLC [Bcmp_8] 568MB/s ± 7% 659MB/s ± 8% +16.04% (p=0.000 n=29+30)
BM_LIBC_Bcmp_union_worker_Cold [Bcmp_8] 207MB/s ± 2% 212MB/s ± 2% +2.31% (p=0.000 n=30+27)
```
- arcadia-rome
```
name old speed new speed delta
BM_LIBC_Bcmp_Fleet_L1 2.16GB/s ± 2% 2.27GB/s ± 2% +5.13% (p=0.000 n=26+30)
BM_LIBC_Bcmp_Fleet_L2 2.15GB/s ± 2% 2.25GB/s ± 2% +4.64% (p=0.000 n=27+30)
BM_LIBC_Bcmp_Fleet_LLC 1.73GB/s ± 3% 1.81GB/s ± 3% +4.66% (p=0.000 n=25+28)
BM_LIBC_Bcmp_Fleet_Cold 494MB/s ± 1% 496MB/s ± 2% +0.45% (p=0.023 n=22+24)
BM_LIBC_Bcmp_adgroup-server_L1 [Bcmp_0] 3.30GB/s ± 1% 3.24GB/s ± 2% -1.70% (p=0.000 n=27+30)
BM_LIBC_Bcmp_adgroup-server_L2 [Bcmp_0] 3.23GB/s ± 2% 3.19GB/s ± 2% -1.28% (p=0.000 n=28+28)
BM_LIBC_Bcmp_adgroup-server_LLC [Bcmp_0] 2.59GB/s ± 3% 2.58GB/s ± 2% -0.65% (p=0.010 n=26+26)
BM_LIBC_Bcmp_adgroup-server_Cold [Bcmp_0] 720MB/s ± 1% 707MB/s ± 3% -1.75% (p=0.000 n=22+25)
BM_LIBC_Bcmp_bigtable_tablet_server_L1 [Bcmp_1] 3.37GB/s ± 1% 3.36GB/s ± 2% ~ (p=0.102 n=28+29)
BM_LIBC_Bcmp_bigtable_tablet_server_L2 [Bcmp_1] 3.32GB/s ± 2% 3.30GB/s ± 2% -0.51% (p=0.038 n=28+29)
BM_LIBC_Bcmp_bigtable_tablet_server_LLC [Bcmp_1] 2.67GB/s ± 4% 2.70GB/s ± 4% +0.96% (p=0.009 n=28+27)
BM_LIBC_Bcmp_bigtable_tablet_server_Cold [Bcmp_1] 755MB/s ± 1% 751MB/s ± 2% -0.57% (p=0.000 n=22+25)
BM_LIBC_Bcmp_d_L1 [Bcmp_2] 1.79GB/s ± 1% 1.86GB/s ± 2% +3.92% (p=0.000 n=27+29)
BM_LIBC_Bcmp_d_L2 [Bcmp_2] 1.77GB/s ± 2% 1.82GB/s ± 2% +2.99% (p=0.000 n=28+29)
BM_LIBC_Bcmp_d_LLC [Bcmp_2] 1.41GB/s ± 4% 1.47GB/s ± 3% +3.97% (p=0.000 n=28+28)
BM_LIBC_Bcmp_d_Cold [Bcmp_2] 386MB/s ± 1% 389MB/s ± 1% +0.60% (p=0.000 n=21+23)
BM_LIBC_Bcmp_dremel-server_L1 [Bcmp_3] 2.07GB/s ± 2% 2.17GB/s ± 2% +4.87% (p=0.000 n=29+30)
BM_LIBC_Bcmp_dremel-server_L2 [Bcmp_3] 2.07GB/s ± 2% 2.13GB/s ± 2% +3.02% (p=0.000 n=28+30)
BM_LIBC_Bcmp_dremel-server_LLC [Bcmp_3] 1.66GB/s ± 2% 1.73GB/s ± 2% +4.08% (p=0.000 n=29+26)
BM_LIBC_Bcmp_dremel-server_Cold [Bcmp_3] 466MB/s ± 2% 469MB/s ± 3% +0.66% (p=0.001 n=22+25)
BM_LIBC_Bcmp_logs_proxy_L1 [Bcmp_4] 861MB/s ± 1% 964MB/s ± 2% +11.98% (p=0.000 n=29+29)
BM_LIBC_Bcmp_logs_proxy_L2 [Bcmp_4] 853MB/s ± 2% 935MB/s ± 2% +9.54% (p=0.000 n=28+29)
BM_LIBC_Bcmp_logs_proxy_LLC [Bcmp_4] 707MB/s ± 3% 743MB/s ± 4% +5.08% (p=0.000 n=29+29)
BM_LIBC_Bcmp_logs_proxy_Cold [Bcmp_4] 199MB/s ± 3% 199MB/s ± 2% ~ (p=0.107 n=29+25)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L1 [Bcmp_5] 1.65GB/s ± 1% 1.75GB/s ± 2% +6.15% (p=0.000 n=29+29)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L2 [Bcmp_5] 1.64GB/s ± 3% 1.73GB/s ± 2% +5.37% (p=0.000 n=29+29)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_LLC [Bcmp_5] 1.32GB/s ± 2% 1.40GB/s ± 2% +6.21% (p=0.000 n=28+27)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_Cold [Bcmp_5] 370MB/s ± 3% 371MB/s ± 2% +0.16% (p=0.008 n=29+25)
BM_LIBC_Bcmp_span_server_L1 [Bcmp_6] 3.25GB/s ± 2% 3.47GB/s ± 2% +6.74% (p=0.000 n=28+29)
BM_LIBC_Bcmp_span_server_L2 [Bcmp_6] 3.26GB/s ± 1% 3.44GB/s ± 1% +5.43% (p=0.000 n=28+29)
BM_LIBC_Bcmp_span_server_LLC [Bcmp_6] 2.66GB/s ± 2% 2.79GB/s ± 3% +4.90% (p=0.000 n=27+29)
BM_LIBC_Bcmp_span_server_Cold [Bcmp_6] 812MB/s ± 3% 799MB/s ± 2% -1.57% (p=0.000 n=29+25)
BM_LIBC_Bcmp_sr_www_L1 [Bcmp_7] 1.71GB/s ± 2% 1.66GB/s ± 2% -3.14% (p=0.000 n=29+29)
BM_LIBC_Bcmp_sr_www_L2 [Bcmp_7] 1.63GB/s ± 2% 1.59GB/s ± 2% -2.50% (p=0.000 n=29+28)
BM_LIBC_Bcmp_sr_www_LLC [Bcmp_7] 1.25GB/s ± 4% 1.25GB/s ± 2% ~ (p=0.530 n=28+26)
BM_LIBC_Bcmp_sr_www_Cold [Bcmp_7] 311MB/s ± 3% 308MB/s ± 1% ~ (p=0.127 n=29+24)
BM_LIBC_Bcmp_union_worker_L1 [Bcmp_8] 869MB/s ± 2% 1098MB/s ± 2% +26.28% (p=0.000 n=27+29)
BM_LIBC_Bcmp_union_worker_L2 [Bcmp_8] 873MB/s ± 2% 1075MB/s ± 1% +23.06% (p=0.000 n=27+29)
BM_LIBC_Bcmp_union_worker_LLC [Bcmp_8] 743MB/s ± 4% 859MB/s ± 4% +15.58% (p=0.000 n=27+27)
BM_LIBC_Bcmp_union_worker_Cold [Bcmp_8] 221MB/s ± 4% 221MB/s ± 3% +0.14% (p=0.034 n=29+25)
```
- ixion-haswell
```
name old speed new speed delta
BM_LIBC_Bcmp_Fleet_L1 2.27GB/s ± 5% 2.41GB/s ± 6% +6.10% (p=0.000 n=29+28)
BM_LIBC_Bcmp_Fleet_L2 2.14GB/s ± 6% 2.33GB/s ± 5% +9.21% (p=0.000 n=29+30)
BM_LIBC_Bcmp_Fleet_LLC 1.30GB/s ± 9% 1.43GB/s ± 8% +9.85% (p=0.000 n=30+30)
BM_LIBC_Bcmp_Fleet_Cold 475MB/s ± 6% 475MB/s ± 5% ~ (p=0.839 n=30+29)
BM_LIBC_Bcmp_adgroup-server_L1 [Bcmp_0] 3.38GB/s ± 7% 3.46GB/s ± 6% +2.35% (p=0.009 n=30+29)
BM_LIBC_Bcmp_adgroup-server_L2 [Bcmp_0] 3.20GB/s ± 5% 3.32GB/s ± 6% +3.52% (p=0.000 n=28+30)
BM_LIBC_Bcmp_adgroup-server_LLC [Bcmp_0] 1.88GB/s ± 9% 2.00GB/s ± 6% +6.63% (p=0.000 n=30+28)
BM_LIBC_Bcmp_adgroup-server_Cold [Bcmp_0] 664MB/s ± 6% 655MB/s ± 6% -1.32% (p=0.025 n=30+30)
BM_LIBC_Bcmp_bigtable_tablet_server_L1 [Bcmp_1] 3.50GB/s ± 8% 3.61GB/s ±10% +3.09% (p=0.001 n=29+30)
BM_LIBC_Bcmp_bigtable_tablet_server_L2 [Bcmp_1] 3.32GB/s ± 7% 3.48GB/s ± 8% +4.89% (p=0.000 n=29+30)
BM_LIBC_Bcmp_bigtable_tablet_server_LLC [Bcmp_1] 2.02GB/s ± 7% 2.14GB/s ± 9% +5.82% (p=0.000 n=28+29)
BM_LIBC_Bcmp_bigtable_tablet_server_Cold [Bcmp_1] 716MB/s ± 6% 709MB/s ± 5% -0.97% (p=0.040 n=30+28)
BM_LIBC_Bcmp_d_L1 [Bcmp_2] 1.83GB/s ± 7% 1.97GB/s ± 8% +7.90% (p=0.000 n=30+30)
BM_LIBC_Bcmp_d_L2 [Bcmp_2] 1.74GB/s ± 6% 1.92GB/s ± 6% +10.29% (p=0.000 n=30+29)
BM_LIBC_Bcmp_d_LLC [Bcmp_2] 1.05GB/s ± 9% 1.15GB/s ± 9% +9.73% (p=0.000 n=30+30)
BM_LIBC_Bcmp_d_Cold [Bcmp_2] 379MB/s ± 6% 372MB/s ± 6% -1.74% (p=0.012 n=30+30)
BM_LIBC_Bcmp_dremel-server_L1 [Bcmp_3] 2.17GB/s ± 5% 2.29GB/s ± 6% +5.61% (p=0.000 n=29+30)
BM_LIBC_Bcmp_dremel-server_L2 [Bcmp_3] 2.02GB/s ± 6% 2.20GB/s ± 6% +8.75% (p=0.000 n=29+30)
BM_LIBC_Bcmp_dremel-server_LLC [Bcmp_3] 1.22GB/s ± 8% 1.34GB/s ± 9% +9.19% (p=0.000 n=30+30)
BM_LIBC_Bcmp_dremel-server_Cold [Bcmp_3] 447MB/s ± 3% 441MB/s ± 7% -1.40% (p=0.033 n=30+30)
BM_LIBC_Bcmp_logs_proxy_L1 [Bcmp_4] 902MB/s ± 6% 995MB/s ±10% +10.37% (p=0.000 n=30+30)
BM_LIBC_Bcmp_logs_proxy_L2 [Bcmp_4] 863MB/s ± 5% 945MB/s ±11% +9.50% (p=0.000 n=29+30)
BM_LIBC_Bcmp_logs_proxy_LLC [Bcmp_4] 528MB/s ±11% 559MB/s ±12% +5.75% (p=0.000 n=30+30)
BM_LIBC_Bcmp_logs_proxy_Cold [Bcmp_4] 183MB/s ± 4% 181MB/s ± 7% ~ (p=0.088 n=28+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L1 [Bcmp_5] 1.70GB/s ± 6% 1.87GB/s ± 8% +10.14% (p=0.000 n=29+29)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L2 [Bcmp_5] 1.60GB/s ± 5% 1.80GB/s ± 9% +12.61% (p=0.000 n=29+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_LLC [Bcmp_5] 994MB/s ±13% 1094MB/s ± 8% +10.10% (p=0.000 n=29+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_Cold [Bcmp_5] 362MB/s ± 6% 358MB/s ± 7% ~ (p=0.123 n=30+30)
BM_LIBC_Bcmp_span_server_L1 [Bcmp_6] 3.31GB/s ± 5% 3.67GB/s ± 6% +10.90% (p=0.000 n=28+30)
BM_LIBC_Bcmp_span_server_L2 [Bcmp_6] 3.11GB/s ± 5% 3.53GB/s ± 5% +13.59% (p=0.000 n=30+30)
BM_LIBC_Bcmp_span_server_LLC [Bcmp_6] 1.98GB/s ± 9% 2.18GB/s ± 8% +10.34% (p=0.000 n=30+30)
BM_LIBC_Bcmp_span_server_Cold [Bcmp_6] 754MB/s ± 5% 752MB/s ± 5% ~ (p=0.592 n=30+30)
BM_LIBC_Bcmp_sr_www_L1 [Bcmp_7] 1.72GB/s ± 5% 1.72GB/s ± 6% ~ (p=0.549 n=29+29)
BM_LIBC_Bcmp_sr_www_L2 [Bcmp_7] 1.61GB/s ± 7% 1.63GB/s ± 8% ~ (p=0.191 n=30+29)
BM_LIBC_Bcmp_sr_www_LLC [Bcmp_7] 913MB/s ± 8% 905MB/s ± 9% ~ (p=0.423 n=30+30)
BM_LIBC_Bcmp_sr_www_Cold [Bcmp_7] 304MB/s ± 6% 287MB/s ± 4% -5.57% (p=0.000 n=30+30)
BM_LIBC_Bcmp_union_worker_L1 [Bcmp_8] 961MB/s ± 5% 1124MB/s ± 6% +16.94% (p=0.000 n=30+30)
BM_LIBC_Bcmp_union_worker_L2 [Bcmp_8] 915MB/s ± 8% 1100MB/s ± 7% +20.16% (p=0.000 n=30+30)
BM_LIBC_Bcmp_union_worker_LLC [Bcmp_8] 593MB/s ± 8% 669MB/s ± 8% +12.92% (p=0.000 n=30+30)
BM_LIBC_Bcmp_union_worker_Cold [Bcmp_8] 220MB/s ± 4% 220MB/s ± 6% ~ (p=0.572 n=30+30)
```
>From 48346ffff52579106abc306fd92c86beefce7613 Mon Sep 17 00:00:00 2001
From: "goldvitaly at google.com" <%username%@google.com>
Date: Fri, 6 Sep 2024 09:38:04 +0200
Subject: [PATCH] libc: Implement branchless head-tail comparison for bcmp.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Binary size changes:
| Bytes (cache lines) | before | after |
|---------------------|----------|---------|
| sse4 | 419 (7) | 288 (5) |
| avx | 430 (7) | 308 (5) |
| avx512f | 589 (10) | 390 (7) |
Benchmarks for different CPUs using https://github.com/google/fleetbench.
- indus-cascadelake
```
name old speed new speed delta
BM_LIBC_Bcmp_Fleet_L1 1.96GB/s ± 1% 2.19GB/s ± 0% +11.49% (p=0.000 n=29+24)
BM_LIBC_Bcmp_Fleet_L2 1.90GB/s ± 1% 2.14GB/s ± 1% +12.68% (p=0.000 n=29+24)
BM_LIBC_Bcmp_Fleet_LLC 513MB/s ± 4% 531MB/s ± 4% +3.53% (p=0.000 n=24+24)
BM_LIBC_Bcmp_Fleet_Cold 452MB/s ± 3% 456MB/s ± 4% ~ (p=0.103 n=30+30)
BM_LIBC_Bcmp_adgroup-server_L1 [Bcmp_0] 2.98GB/s ± 1% 3.15GB/s ± 1% +5.59% (p=0.000 n=29+30)
BM_LIBC_Bcmp_adgroup-server_L2 [Bcmp_0] 2.86GB/s ± 1% 3.07GB/s ± 1% +7.21% (p=0.000 n=29+30)
BM_LIBC_Bcmp_adgroup-server_LLC [Bcmp_0] 738MB/s ± 7% 751MB/s ± 3% +1.68% (p=0.000 n=24+25)
BM_LIBC_Bcmp_adgroup-server_Cold [Bcmp_0] 643MB/s ± 3% 642MB/s ± 4% ~ (p=0.522 n=29+30)
BM_LIBC_Bcmp_bigtable_tablet_server_L1 [Bcmp_1] 3.08GB/s ± 0% 3.25GB/s ± 0% +5.35% (p=0.000 n=28+30)
BM_LIBC_Bcmp_bigtable_tablet_server_L2 [Bcmp_1] 2.97GB/s ± 1% 3.17GB/s ± 1% +6.65% (p=0.000 n=29+30)
BM_LIBC_Bcmp_bigtable_tablet_server_LLC [Bcmp_1] 901MB/s ±59% 871MB/s ±36% ~ (p=0.676 n=29+27)
BM_LIBC_Bcmp_bigtable_tablet_server_Cold [Bcmp_1] 686MB/s ± 4% 686MB/s ± 3% ~ (p=0.934 n=29+30)
BM_LIBC_Bcmp_d_L1 [Bcmp_2] 1.63GB/s ± 0% 1.80GB/s ± 1% +10.19% (p=0.000 n=29+30)
BM_LIBC_Bcmp_d_L2 [Bcmp_2] 1.57GB/s ± 1% 1.75GB/s ± 1% +11.46% (p=0.000 n=29+30)
BM_LIBC_Bcmp_d_LLC [Bcmp_2] 451MB/s ±61% 427MB/s ±28% ~ (p=0.469 n=29+25)
BM_LIBC_Bcmp_d_Cold [Bcmp_2] 353MB/s ± 4% 354MB/s ± 5% ~ (p=0.467 n=30+30)
BM_LIBC_Bcmp_dremel-server_L1 [Bcmp_3] 1.91GB/s ± 1% 2.10GB/s ± 1% +9.90% (p=0.000 n=29+29)
BM_LIBC_Bcmp_dremel-server_L2 [Bcmp_3] 1.84GB/s ± 1% 2.03GB/s ± 1% +10.63% (p=0.000 n=29+30)
BM_LIBC_Bcmp_dremel-server_LLC [Bcmp_3] 491MB/s ±24% 538MB/s ±24% +9.66% (p=0.000 n=24+27)
BM_LIBC_Bcmp_dremel-server_Cold [Bcmp_3] 417MB/s ± 4% 421MB/s ± 3% ~ (p=0.063 n=30+29)
BM_LIBC_Bcmp_logs_proxy_L1 [Bcmp_4] 761MB/s ± 1% 867MB/s ± 1% +14.02% (p=0.000 n=28+30)
BM_LIBC_Bcmp_logs_proxy_L2 [Bcmp_4] 748MB/s ± 1% 860MB/s ± 1% +15.04% (p=0.000 n=30+30)
BM_LIBC_Bcmp_logs_proxy_LLC [Bcmp_4] 227MB/s ±29% 260MB/s ±64% +14.70% (p=0.000 n=26+27)
BM_LIBC_Bcmp_logs_proxy_Cold [Bcmp_4] 187MB/s ± 3% 191MB/s ± 5% +2.26% (p=0.000 n=30+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L1 [Bcmp_5] 1.48GB/s ± 1% 1.71GB/s ± 1% +15.26% (p=0.000 n=29+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L2 [Bcmp_5] 1.42GB/s ± 1% 1.67GB/s ± 1% +17.68% (p=0.000 n=29+29)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_LLC [Bcmp_5] 412MB/s ±34% 519MB/s ±80% +25.87% (p=0.000 n=27+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_Cold [Bcmp_5] 336MB/s ± 4% 343MB/s ± 6% +2.05% (p=0.000 n=30+30)
BM_LIBC_Bcmp_span_server_L1 [Bcmp_6] 2.87GB/s ± 0% 3.24GB/s ± 1% +12.88% (p=0.000 n=26+30)
BM_LIBC_Bcmp_span_server_L2 [Bcmp_6] 2.78GB/s ± 1% 3.20GB/s ± 1% +15.15% (p=0.000 n=26+30)
BM_LIBC_Bcmp_span_server_LLC [Bcmp_6] 926MB/s ±43% 1227MB/s ±76% +32.53% (p=0.000 n=27+30)
BM_LIBC_Bcmp_span_server_Cold [Bcmp_6] 716MB/s ± 4% 737MB/s ± 6% +3.02% (p=0.000 n=28+29)
BM_LIBC_Bcmp_sr_www_L1 [Bcmp_7] 1.54GB/s ± 1% 1.56GB/s ± 0% +1.40% (p=0.000 n=29+30)
BM_LIBC_Bcmp_sr_www_L2 [Bcmp_7] 1.47GB/s ± 1% 1.52GB/s ± 1% +2.97% (p=0.000 n=27+30)
BM_LIBC_Bcmp_sr_www_LLC [Bcmp_7] 351MB/s ±23% 436MB/s ±83% +24.04% (p=0.005 n=24+29)
BM_LIBC_Bcmp_sr_www_Cold [Bcmp_7] 283MB/s ± 4% 282MB/s ± 4% ~ (p=0.644 n=30+30)
BM_LIBC_Bcmp_union_worker_L1 [Bcmp_8] 824MB/s ± 1% 1048MB/s ± 1% +27.18% (p=0.000 n=29+30)
BM_LIBC_Bcmp_union_worker_L2 [Bcmp_8] 808MB/s ± 1% 1027MB/s ± 1% +27.12% (p=0.000 n=29+29)
BM_LIBC_Bcmp_union_worker_LLC [Bcmp_8] 317MB/s ±79% 332MB/s ±74% ~ (p=0.338 n=30+29)
BM_LIBC_Bcmp_union_worker_Cold [Bcmp_8] 207MB/s ± 5% 212MB/s ± 5% +2.27% (p=0.000 n=30+30)
```
- indus-skylake
```
name old speed new speed delta
BM_LIBC_Bcmp_Fleet_L1 2.06GB/s ± 2% 2.25GB/s ± 3% +9.66% (p=0.000 n=27+24)
BM_LIBC_Bcmp_Fleet_L2 1.96GB/s ± 2% 2.17GB/s ± 2% +10.61% (p=0.000 n=30+24)
BM_LIBC_Bcmp_Fleet_LLC 1.18GB/s ± 6% 1.32GB/s ± 5% +12.27% (p=0.000 n=28+28)
BM_LIBC_Bcmp_Fleet_Cold 456MB/s ± 2% 466MB/s ± 2% +2.22% (p=0.000 n=28+28)
BM_LIBC_Bcmp_adgroup-server_L1 [Bcmp_0] 3.08GB/s ± 2% 3.20GB/s ± 1% +3.72% (p=0.000 n=28+22)
BM_LIBC_Bcmp_adgroup-server_L2 [Bcmp_0] 2.92GB/s ± 1% 3.05GB/s ± 2% +4.49% (p=0.000 n=23+23)
BM_LIBC_Bcmp_adgroup-server_LLC [Bcmp_0] 1.83GB/s ± 8% 1.94GB/s ± 4% +6.24% (p=0.000 n=25+27)
BM_LIBC_Bcmp_adgroup-server_Cold [Bcmp_0] 654MB/s ± 2% 659MB/s ± 2% +0.76% (p=0.012 n=30+29)
BM_LIBC_Bcmp_bigtable_tablet_server_L1 [Bcmp_1] 3.19GB/s ± 2% 3.34GB/s ± 2% +4.41% (p=0.000 n=26+23)
BM_LIBC_Bcmp_bigtable_tablet_server_L2 [Bcmp_1] 3.05GB/s ± 2% 3.21GB/s ± 2% +5.32% (p=0.000 n=28+25)
BM_LIBC_Bcmp_bigtable_tablet_server_LLC [Bcmp_1] 1.95GB/s ± 4% 2.03GB/s ±10% +3.61% (p=0.000 n=27+30)
BM_LIBC_Bcmp_bigtable_tablet_server_Cold [Bcmp_1] 700MB/s ± 2% 702MB/s ± 2% ~ (p=0.150 n=30+30)
BM_LIBC_Bcmp_d_L1 [Bcmp_2] 1.69GB/s ± 2% 1.85GB/s ± 1% +9.31% (p=0.000 n=30+26)
BM_LIBC_Bcmp_d_L2 [Bcmp_2] 1.60GB/s ± 2% 1.78GB/s ± 2% +10.90% (p=0.000 n=26+27)
BM_LIBC_Bcmp_d_LLC [Bcmp_2] 1.01GB/s ± 5% 1.12GB/s ± 5% +11.40% (p=0.000 n=27+28)
BM_LIBC_Bcmp_d_Cold [Bcmp_2] 355MB/s ± 3% 360MB/s ± 3% +1.46% (p=0.000 n=30+30)
BM_LIBC_Bcmp_dremel-server_L1 [Bcmp_3] 1.98GB/s ± 2% 2.15GB/s ± 2% +8.89% (p=0.000 n=29+27)
BM_LIBC_Bcmp_dremel-server_L2 [Bcmp_3] 1.87GB/s ± 3% 2.05GB/s ± 2% +10.06% (p=0.000 n=30+26)
BM_LIBC_Bcmp_dremel-server_LLC [Bcmp_3] 1.19GB/s ± 4% 1.31GB/s ± 6% +9.82% (p=0.000 n=27+29)
BM_LIBC_Bcmp_dremel-server_Cold [Bcmp_3] 424MB/s ± 3% 431MB/s ± 3% +1.58% (p=0.000 n=28+30)
BM_LIBC_Bcmp_logs_proxy_L1 [Bcmp_4] 849MB/s ± 2% 949MB/s ± 2% +11.84% (p=0.000 n=27+28)
BM_LIBC_Bcmp_logs_proxy_L2 [Bcmp_4] 815MB/s ± 3% 913MB/s ± 3% +12.06% (p=0.000 n=29+30)
BM_LIBC_Bcmp_logs_proxy_LLC [Bcmp_4] 512MB/s ± 9% 571MB/s ± 7% +11.40% (p=0.000 n=30+30)
BM_LIBC_Bcmp_logs_proxy_Cold [Bcmp_4] 187MB/s ± 3% 192MB/s ± 2% +2.56% (p=0.000 n=30+28)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L1 [Bcmp_5] 1.55GB/s ± 2% 1.77GB/s ± 3% +13.93% (p=0.000 n=30+28)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L2 [Bcmp_5] 1.47GB/s ± 2% 1.70GB/s ± 2% +15.96% (p=0.000 n=27+26)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_LLC [Bcmp_5] 939MB/s ± 5% 1084MB/s ± 4% +15.36% (p=0.000 n=28+27)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_Cold [Bcmp_5] 340MB/s ± 2% 347MB/s ± 3% +1.93% (p=0.000 n=30+30)
BM_LIBC_Bcmp_span_server_L1 [Bcmp_6] 3.06GB/s ± 3% 3.40GB/s ± 2% +11.13% (p=0.000 n=30+28)
BM_LIBC_Bcmp_span_server_L2 [Bcmp_6] 2.89GB/s ± 3% 3.24GB/s ± 2% +12.20% (p=0.000 n=29+26)
BM_LIBC_Bcmp_span_server_LLC [Bcmp_6] 1.93GB/s ± 4% 2.09GB/s ±11% +8.16% (p=0.000 n=26+30)
BM_LIBC_Bcmp_span_server_Cold [Bcmp_6] 746MB/s ± 2% 762MB/s ± 2% +2.11% (p=0.000 n=30+28)
BM_LIBC_Bcmp_sr_www_L1 [Bcmp_7] 1.59GB/s ± 2% 1.62GB/s ± 2% +1.72% (p=0.000 n=25+27)
BM_LIBC_Bcmp_sr_www_L2 [Bcmp_7] 1.49GB/s ± 2% 1.53GB/s ± 2% +2.62% (p=0.000 n=27+29)
BM_LIBC_Bcmp_sr_www_LLC [Bcmp_7] 852MB/s ±10% 909MB/s ± 6% +6.71% (p=0.000 n=30+29)
BM_LIBC_Bcmp_sr_www_Cold [Bcmp_7] 283MB/s ± 3% 283MB/s ± 2% ~ (p=0.617 n=30+27)
BM_LIBC_Bcmp_union_worker_L1 [Bcmp_8] 891MB/s ± 2% 1083MB/s ± 2% +21.64% (p=0.000 n=27+24)
BM_LIBC_Bcmp_union_worker_L2 [Bcmp_8] 855MB/s ± 2% 1045MB/s ± 1% +22.31% (p=0.000 n=25+23)
BM_LIBC_Bcmp_union_worker_LLC [Bcmp_8] 568MB/s ± 7% 659MB/s ± 8% +16.04% (p=0.000 n=29+30)
BM_LIBC_Bcmp_union_worker_Cold [Bcmp_8] 207MB/s ± 2% 212MB/s ± 2% +2.31% (p=0.000 n=30+27)
```
- arcadia-rome
```
name old speed new speed delta
BM_LIBC_Bcmp_Fleet_L1 2.16GB/s ± 2% 2.27GB/s ± 2% +5.13% (p=0.000 n=26+30)
BM_LIBC_Bcmp_Fleet_L2 2.15GB/s ± 2% 2.25GB/s ± 2% +4.64% (p=0.000 n=27+30)
BM_LIBC_Bcmp_Fleet_LLC 1.73GB/s ± 3% 1.81GB/s ± 3% +4.66% (p=0.000 n=25+28)
BM_LIBC_Bcmp_Fleet_Cold 494MB/s ± 1% 496MB/s ± 2% +0.45% (p=0.023 n=22+24)
BM_LIBC_Bcmp_adgroup-server_L1 [Bcmp_0] 3.30GB/s ± 1% 3.24GB/s ± 2% -1.70% (p=0.000 n=27+30)
BM_LIBC_Bcmp_adgroup-server_L2 [Bcmp_0] 3.23GB/s ± 2% 3.19GB/s ± 2% -1.28% (p=0.000 n=28+28)
BM_LIBC_Bcmp_adgroup-server_LLC [Bcmp_0] 2.59GB/s ± 3% 2.58GB/s ± 2% -0.65% (p=0.010 n=26+26)
BM_LIBC_Bcmp_adgroup-server_Cold [Bcmp_0] 720MB/s ± 1% 707MB/s ± 3% -1.75% (p=0.000 n=22+25)
BM_LIBC_Bcmp_bigtable_tablet_server_L1 [Bcmp_1] 3.37GB/s ± 1% 3.36GB/s ± 2% ~ (p=0.102 n=28+29)
BM_LIBC_Bcmp_bigtable_tablet_server_L2 [Bcmp_1] 3.32GB/s ± 2% 3.30GB/s ± 2% -0.51% (p=0.038 n=28+29)
BM_LIBC_Bcmp_bigtable_tablet_server_LLC [Bcmp_1] 2.67GB/s ± 4% 2.70GB/s ± 4% +0.96% (p=0.009 n=28+27)
BM_LIBC_Bcmp_bigtable_tablet_server_Cold [Bcmp_1] 755MB/s ± 1% 751MB/s ± 2% -0.57% (p=0.000 n=22+25)
BM_LIBC_Bcmp_d_L1 [Bcmp_2] 1.79GB/s ± 1% 1.86GB/s ± 2% +3.92% (p=0.000 n=27+29)
BM_LIBC_Bcmp_d_L2 [Bcmp_2] 1.77GB/s ± 2% 1.82GB/s ± 2% +2.99% (p=0.000 n=28+29)
BM_LIBC_Bcmp_d_LLC [Bcmp_2] 1.41GB/s ± 4% 1.47GB/s ± 3% +3.97% (p=0.000 n=28+28)
BM_LIBC_Bcmp_d_Cold [Bcmp_2] 386MB/s ± 1% 389MB/s ± 1% +0.60% (p=0.000 n=21+23)
BM_LIBC_Bcmp_dremel-server_L1 [Bcmp_3] 2.07GB/s ± 2% 2.17GB/s ± 2% +4.87% (p=0.000 n=29+30)
BM_LIBC_Bcmp_dremel-server_L2 [Bcmp_3] 2.07GB/s ± 2% 2.13GB/s ± 2% +3.02% (p=0.000 n=28+30)
BM_LIBC_Bcmp_dremel-server_LLC [Bcmp_3] 1.66GB/s ± 2% 1.73GB/s ± 2% +4.08% (p=0.000 n=29+26)
BM_LIBC_Bcmp_dremel-server_Cold [Bcmp_3] 466MB/s ± 2% 469MB/s ± 3% +0.66% (p=0.001 n=22+25)
BM_LIBC_Bcmp_logs_proxy_L1 [Bcmp_4] 861MB/s ± 1% 964MB/s ± 2% +11.98% (p=0.000 n=29+29)
BM_LIBC_Bcmp_logs_proxy_L2 [Bcmp_4] 853MB/s ± 2% 935MB/s ± 2% +9.54% (p=0.000 n=28+29)
BM_LIBC_Bcmp_logs_proxy_LLC [Bcmp_4] 707MB/s ± 3% 743MB/s ± 4% +5.08% (p=0.000 n=29+29)
BM_LIBC_Bcmp_logs_proxy_Cold [Bcmp_4] 199MB/s ± 3% 199MB/s ± 2% ~ (p=0.107 n=29+25)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L1 [Bcmp_5] 1.65GB/s ± 1% 1.75GB/s ± 2% +6.15% (p=0.000 n=29+29)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L2 [Bcmp_5] 1.64GB/s ± 3% 1.73GB/s ± 2% +5.37% (p=0.000 n=29+29)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_LLC [Bcmp_5] 1.32GB/s ± 2% 1.40GB/s ± 2% +6.21% (p=0.000 n=28+27)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_Cold [Bcmp_5] 370MB/s ± 3% 371MB/s ± 2% +0.16% (p=0.008 n=29+25)
BM_LIBC_Bcmp_span_server_L1 [Bcmp_6] 3.25GB/s ± 2% 3.47GB/s ± 2% +6.74% (p=0.000 n=28+29)
BM_LIBC_Bcmp_span_server_L2 [Bcmp_6] 3.26GB/s ± 1% 3.44GB/s ± 1% +5.43% (p=0.000 n=28+29)
BM_LIBC_Bcmp_span_server_LLC [Bcmp_6] 2.66GB/s ± 2% 2.79GB/s ± 3% +4.90% (p=0.000 n=27+29)
BM_LIBC_Bcmp_span_server_Cold [Bcmp_6] 812MB/s ± 3% 799MB/s ± 2% -1.57% (p=0.000 n=29+25)
BM_LIBC_Bcmp_sr_www_L1 [Bcmp_7] 1.71GB/s ± 2% 1.66GB/s ± 2% -3.14% (p=0.000 n=29+29)
BM_LIBC_Bcmp_sr_www_L2 [Bcmp_7] 1.63GB/s ± 2% 1.59GB/s ± 2% -2.50% (p=0.000 n=29+28)
BM_LIBC_Bcmp_sr_www_LLC [Bcmp_7] 1.25GB/s ± 4% 1.25GB/s ± 2% ~ (p=0.530 n=28+26)
BM_LIBC_Bcmp_sr_www_Cold [Bcmp_7] 311MB/s ± 3% 308MB/s ± 1% ~ (p=0.127 n=29+24)
BM_LIBC_Bcmp_union_worker_L1 [Bcmp_8] 869MB/s ± 2% 1098MB/s ± 2% +26.28% (p=0.000 n=27+29)
BM_LIBC_Bcmp_union_worker_L2 [Bcmp_8] 873MB/s ± 2% 1075MB/s ± 1% +23.06% (p=0.000 n=27+29)
BM_LIBC_Bcmp_union_worker_LLC [Bcmp_8] 743MB/s ± 4% 859MB/s ± 4% +15.58% (p=0.000 n=27+27)
BM_LIBC_Bcmp_union_worker_Cold [Bcmp_8] 221MB/s ± 4% 221MB/s ± 3% +0.14% (p=0.034 n=29+25)
```
- ixion-haswell
```
name old speed new speed delta
BM_LIBC_Bcmp_Fleet_L1 2.27GB/s ± 5% 2.41GB/s ± 6% +6.10% (p=0.000 n=29+28)
BM_LIBC_Bcmp_Fleet_L2 2.14GB/s ± 6% 2.33GB/s ± 5% +9.21% (p=0.000 n=29+30)
BM_LIBC_Bcmp_Fleet_LLC 1.30GB/s ± 9% 1.43GB/s ± 8% +9.85% (p=0.000 n=30+30)
BM_LIBC_Bcmp_Fleet_Cold 475MB/s ± 6% 475MB/s ± 5% ~ (p=0.839 n=30+29)
BM_LIBC_Bcmp_adgroup-server_L1 [Bcmp_0] 3.38GB/s ± 7% 3.46GB/s ± 6% +2.35% (p=0.009 n=30+29)
BM_LIBC_Bcmp_adgroup-server_L2 [Bcmp_0] 3.20GB/s ± 5% 3.32GB/s ± 6% +3.52% (p=0.000 n=28+30)
BM_LIBC_Bcmp_adgroup-server_LLC [Bcmp_0] 1.88GB/s ± 9% 2.00GB/s ± 6% +6.63% (p=0.000 n=30+28)
BM_LIBC_Bcmp_adgroup-server_Cold [Bcmp_0] 664MB/s ± 6% 655MB/s ± 6% -1.32% (p=0.025 n=30+30)
BM_LIBC_Bcmp_bigtable_tablet_server_L1 [Bcmp_1] 3.50GB/s ± 8% 3.61GB/s ±10% +3.09% (p=0.001 n=29+30)
BM_LIBC_Bcmp_bigtable_tablet_server_L2 [Bcmp_1] 3.32GB/s ± 7% 3.48GB/s ± 8% +4.89% (p=0.000 n=29+30)
BM_LIBC_Bcmp_bigtable_tablet_server_LLC [Bcmp_1] 2.02GB/s ± 7% 2.14GB/s ± 9% +5.82% (p=0.000 n=28+29)
BM_LIBC_Bcmp_bigtable_tablet_server_Cold [Bcmp_1] 716MB/s ± 6% 709MB/s ± 5% -0.97% (p=0.040 n=30+28)
BM_LIBC_Bcmp_d_L1 [Bcmp_2] 1.83GB/s ± 7% 1.97GB/s ± 8% +7.90% (p=0.000 n=30+30)
BM_LIBC_Bcmp_d_L2 [Bcmp_2] 1.74GB/s ± 6% 1.92GB/s ± 6% +10.29% (p=0.000 n=30+29)
BM_LIBC_Bcmp_d_LLC [Bcmp_2] 1.05GB/s ± 9% 1.15GB/s ± 9% +9.73% (p=0.000 n=30+30)
BM_LIBC_Bcmp_d_Cold [Bcmp_2] 379MB/s ± 6% 372MB/s ± 6% -1.74% (p=0.012 n=30+30)
BM_LIBC_Bcmp_dremel-server_L1 [Bcmp_3] 2.17GB/s ± 5% 2.29GB/s ± 6% +5.61% (p=0.000 n=29+30)
BM_LIBC_Bcmp_dremel-server_L2 [Bcmp_3] 2.02GB/s ± 6% 2.20GB/s ± 6% +8.75% (p=0.000 n=29+30)
BM_LIBC_Bcmp_dremel-server_LLC [Bcmp_3] 1.22GB/s ± 8% 1.34GB/s ± 9% +9.19% (p=0.000 n=30+30)
BM_LIBC_Bcmp_dremel-server_Cold [Bcmp_3] 447MB/s ± 3% 441MB/s ± 7% -1.40% (p=0.033 n=30+30)
BM_LIBC_Bcmp_logs_proxy_L1 [Bcmp_4] 902MB/s ± 6% 995MB/s ±10% +10.37% (p=0.000 n=30+30)
BM_LIBC_Bcmp_logs_proxy_L2 [Bcmp_4] 863MB/s ± 5% 945MB/s ±11% +9.50% (p=0.000 n=29+30)
BM_LIBC_Bcmp_logs_proxy_LLC [Bcmp_4] 528MB/s ±11% 559MB/s ±12% +5.75% (p=0.000 n=30+30)
BM_LIBC_Bcmp_logs_proxy_Cold [Bcmp_4] 183MB/s ± 4% 181MB/s ± 7% ~ (p=0.088 n=28+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L1 [Bcmp_5] 1.70GB/s ± 6% 1.87GB/s ± 8% +10.14% (p=0.000 n=29+29)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_L2 [Bcmp_5] 1.60GB/s ± 5% 1.80GB/s ± 9% +12.61% (p=0.000 n=29+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_LLC [Bcmp_5] 994MB/s ±13% 1094MB/s ± 8% +10.10% (p=0.000 n=29+30)
BM_LIBC_Bcmp_muppet-unified-leaf-plo-merced_Cold [Bcmp_5] 362MB/s ± 6% 358MB/s ± 7% ~ (p=0.123 n=30+30)
BM_LIBC_Bcmp_span_server_L1 [Bcmp_6] 3.31GB/s ± 5% 3.67GB/s ± 6% +10.90% (p=0.000 n=28+30)
BM_LIBC_Bcmp_span_server_L2 [Bcmp_6] 3.11GB/s ± 5% 3.53GB/s ± 5% +13.59% (p=0.000 n=30+30)
BM_LIBC_Bcmp_span_server_LLC [Bcmp_6] 1.98GB/s ± 9% 2.18GB/s ± 8% +10.34% (p=0.000 n=30+30)
BM_LIBC_Bcmp_span_server_Cold [Bcmp_6] 754MB/s ± 5% 752MB/s ± 5% ~ (p=0.592 n=30+30)
BM_LIBC_Bcmp_sr_www_L1 [Bcmp_7] 1.72GB/s ± 5% 1.72GB/s ± 6% ~ (p=0.549 n=29+29)
BM_LIBC_Bcmp_sr_www_L2 [Bcmp_7] 1.61GB/s ± 7% 1.63GB/s ± 8% ~ (p=0.191 n=30+29)
BM_LIBC_Bcmp_sr_www_LLC [Bcmp_7] 913MB/s ± 8% 905MB/s ± 9% ~ (p=0.423 n=30+30)
BM_LIBC_Bcmp_sr_www_Cold [Bcmp_7] 304MB/s ± 6% 287MB/s ± 4% -5.57% (p=0.000 n=30+30)
BM_LIBC_Bcmp_union_worker_L1 [Bcmp_8] 961MB/s ± 5% 1124MB/s ± 6% +16.94% (p=0.000 n=30+30)
BM_LIBC_Bcmp_union_worker_L2 [Bcmp_8] 915MB/s ± 8% 1100MB/s ± 7% +20.16% (p=0.000 n=30+30)
BM_LIBC_Bcmp_union_worker_LLC [Bcmp_8] 593MB/s ± 8% 669MB/s ± 8% +12.92% (p=0.000 n=30+30)
BM_LIBC_Bcmp_union_worker_Cold [Bcmp_8] 220MB/s ± 4% 220MB/s ± 6% ~ (p=0.572 n=30+30)
```
---
libc/src/string/memory_utils/op_x86.h | 86 ++++++++++++++-----
.../string/memory_utils/x86_64/inline_bcmp.h | 32 +++----
2 files changed, 77 insertions(+), 41 deletions(-)
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index cf9667283818d8..ab694e25fe0fe1 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -63,6 +63,15 @@ struct Memcpy {
namespace LIBC_NAMESPACE_DECL {
namespace generic {
+// Not equals: returns non-zero iff values at head or tail differ.
+// This function typically loads more data than necessary when the two buffer
+// differs.
+template <typename T>
+LIBC_INLINE uint32_t branchless_head_tail_neq(CPtr p1, CPtr p2, size_t count) {
+ static_assert(cpp::is_integral_v<T>);
+ return neq<T>(p1, p2, 0) | neq<T>(p1, p2, count - sizeof(T));
+}
+
///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
@@ -133,6 +142,11 @@ LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
#if defined(__SSE4_1__)
template <> struct is_vector<__m128i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
+LIBC_INLINE __m128i load_and_xor_m128i(CPtr p1, CPtr p2, size_t offset) {
+ const auto a = load<__m128i>(p1, offset);
+ const auto b = load<__m128i>(p2, offset);
+ return _mm_xor_si128(a, b);
+}
LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
return _mm_max_epu8(a, b);
}
@@ -144,17 +158,21 @@ LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
return static_cast<uint16_t>(
_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
+LIBC_INLINE bool is_zero(__m128i value) {
+ return _mm_testz_si128(value, value) == 1;
+}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m128i>(p1, offset);
- const auto b = load<__m128i>(p2, offset);
- const auto xored = _mm_xor_si128(a, b);
- return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
+ return is_zero(load_and_xor_m128i(p1, p2, offset));
}
template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m128i>(p1, offset);
- const auto b = load<__m128i>(p2, offset);
- const auto xored = _mm_xor_si128(a, b);
- return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
+ return !is_zero(load_and_xor_m128i(p1, p2, offset));
+}
+template <>
+LIBC_INLINE uint32_t branchless_head_tail_neq<__m128i>(CPtr p1, CPtr p2,
+ size_t count) {
+ const __m128i head = load_and_xor_m128i(p1, p2, 0);
+ const __m128i tail = load_and_xor_m128i(p1, p2, count - sizeof(__m128i));
+ return !is_zero(_mm_or_si128(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
@@ -173,19 +191,34 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
#if defined(__AVX__)
template <> struct is_vector<__m256i> : cpp::true_type {};
template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
-template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m256i>(p1, offset);
- const auto b = load<__m256i>(p2, offset);
- const auto xored = _mm256_castps_si256(
+LIBC_INLINE __m256i xor_m256i(__m256i a, __m256i b) {
+ return _mm256_castps_si256(
_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
- return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
}
-template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
+LIBC_INLINE __m256i or_m256i(__m256i a, __m256i b) {
+ return _mm256_castps_si256(
+ _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+}
+LIBC_INLINE __m256i load_and_xor_m256i(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m256i>(p1, offset);
const auto b = load<__m256i>(p2, offset);
- const auto xored = _mm256_castps_si256(
- _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
- return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
+ return xor_m256i(a, b);
+}
+LIBC_INLINE bool is_zero(__m256i value) {
+ return _mm256_testz_si256(value, value) == 1;
+}
+template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
+ return is_zero(load_and_xor_m256i(p1, p2, offset));
+}
+template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
+ return !is_zero(load_and_xor_m256i(p1, p2, offset));
+}
+template <>
+LIBC_INLINE uint32_t branchless_head_tail_neq<__m256i>(CPtr p1, CPtr p2,
+ size_t count) {
+ const __m256i head = load_and_xor_m256i(p1, p2, 0);
+ const __m256i tail = load_and_xor_m256i(p1, p2, count - sizeof(__m256i));
+ return !is_zero(or_m256i(head, tail));
}
#endif // __AVX__
@@ -300,9 +333,22 @@ template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
const auto b = load<__m512i>(p2, offset);
- const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
- return static_cast<uint32_t>(xored >> 32) |
- static_cast<uint32_t>(xored & 0xFFFFFFFF);
+ return _mm512_cmpneq_epi8_mask(a, b) != 0;
+}
+LIBC_INLINE __m512i load_and_xor_m512i(CPtr p1, CPtr p2, size_t offset) {
+ const auto a = load<__m512i>(p1, offset);
+ const auto b = load<__m512i>(p2, offset);
+ return _mm512_xor_epi64(a, b);
+}
+LIBC_INLINE bool is_zero(__m512i value) {
+ return _mm512_test_epi32_mask(value, value) == 0;
+}
+template <>
+LIBC_INLINE uint32_t branchless_head_tail_neq<__m512i>(CPtr p1, CPtr p2,
+ size_t count) {
+ const __m512i head = load_and_xor_m512i(p1, p2, 0);
+ const __m512i tail = load_and_xor_m512i(p1, p2, count - sizeof(__m512i));
+ return !is_zero(_mm512_or_epi64(head, tail));
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
diff --git a/libc/src/string/memory_utils/x86_64/inline_bcmp.h b/libc/src/string/memory_utils/x86_64/inline_bcmp.h
index 49fe08fb0501b5..cc54c4140ee6e6 100644
--- a/libc/src/string/memory_utils/x86_64/inline_bcmp.h
+++ b/libc/src/string/memory_utils/x86_64/inline_bcmp.h
@@ -27,7 +27,7 @@ inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+ return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __SSE4_1__
@@ -36,9 +36,9 @@ inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+ return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
if (count <= 64)
- return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+ return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __AVX__
@@ -47,11 +47,11 @@ inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+ return generic::branchless_head_tail_neq<__m128i>(p1, p2, count);
if (count <= 64)
- return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+ return generic::branchless_head_tail_neq<__m256i>(p1, p2, count);
if (count <= 128)
- return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
+ return generic::branchless_head_tail_neq<__m512i>(p1, p2, count);
return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
}
#endif // __AVX512BW__
@@ -62,22 +62,12 @@ inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
return BcmpReturnType::zero();
if (count == 1)
return generic::Bcmp<uint8_t>::block(p1, p2);
- if (count == 2)
- return generic::Bcmp<uint16_t>::block(p1, p2);
- if (count == 3)
- return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
- if (count == 4)
- return generic::Bcmp<uint32_t>::block(p1, p2);
- if (count == 5)
- return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
- if (count == 6)
- return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
- if (count == 7)
- return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
- if (count == 8)
- return generic::Bcmp<uint64_t>::block(p1, p2);
+ if (count <= 4)
+ return generic::branchless_head_tail_neq<uint16_t>(p1, p2, count);
+ if (count <= 8)
+ return generic::branchless_head_tail_neq<uint32_t>(p1, p2, count);
if (count <= 16)
- return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
+ return generic::branchless_head_tail_neq<uint64_t>(p1, p2, count);
#if defined(__AVX512BW__)
return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
#elif defined(__AVX__)
More information about the libc-commits
mailing list