[libc-commits] [PATCH] D150567: [libc] Add optimized bcmp for RISCV

Guillaume Chatelet via Phabricator via libc-commits libc-commits at lists.llvm.org
Mon May 15 06:14:57 PDT 2023


gchatelet created this revision.
gchatelet added a reviewer: sivachandra.
Herald added subscribers: libc-commits, VincentWu, vkmr, ecnelises, luismarques, sameer.abuasal, tschuett, s.egerton, Jim, benna, psnobl, PkmX, rogfer01, shiva0217, kito-cheng, simoncook, asb, kristof.beyls, arichardson.
Herald added projects: libc-project, All.
gchatelet requested review of this revision.
Herald added subscribers: pcwang-thead, eopXD.

[libc] Add optimized bcmp for RISCV

This patch adds two versions of bcmp optimized for architectures where unaligned accesses are either illegal or extremely slow.
It is currently enabled for RISC-V 64 and RISC-V 32, but it could be used for 32-bit ARM architectures as well.

Here is the before/after output of libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_Bcmp on a quad-core StarFive RISC-V 64 board running Linux at 1.5 GHz.

Before

  Run on (4 X 1500 MHz CPU s)
  CPU Caches:
    L1 Instruction 32 KiB (x4)
    L1 Data 32 KiB (x4)
    L2 Unified 2048 KiB (x1)
  Load Average: 7.03, 5.98, 3.71
  ----------------------------------------------------------------------
  Benchmark            Time             CPU   Iterations UserCounters...
  ----------------------------------------------------------------------
  BM_Bcmp/0/0        102 ns         60.5 ns     11662336 bytes_per_cycle=0.122696/s bytes_per_second=175.518M/s items_per_second=16.5258M/s __llvm_libc::bcmp,memcmp Google A
  BM_Bcmp/1/0        328 ns          172 ns      3737600 bytes_per_cycle=0.15256/s bytes_per_second=218.238M/s items_per_second=5.80575M/s __llvm_libc::bcmp,memcmp Google B
  BM_Bcmp/2/0        199 ns         99.7 ns      7019520 bytes_per_cycle=0.141897/s bytes_per_second=202.986M/s items_per_second=10.032M/s __llvm_libc::bcmp,memcmp Google D
  BM_Bcmp/3/0        173 ns         86.5 ns      8361984 bytes_per_cycle=0.13863/s bytes_per_second=198.312M/s items_per_second=11.5669M/s __llvm_libc::bcmp,memcmp Google L
  BM_Bcmp/4/0        105 ns         51.8 ns     13213696 bytes_per_cycle=0.116399/s bytes_per_second=166.51M/s items_per_second=19.2931M/s __llvm_libc::bcmp,memcmp Google M
  BM_Bcmp/5/0        167 ns         93.9 ns      7853056 bytes_per_cycle=0.139432/s bytes_per_second=199.459M/s items_per_second=10.6503M/s __llvm_libc::bcmp,memcmp Google Q
  BM_Bcmp/6/0        262 ns          165 ns      3931136 bytes_per_cycle=0.151516/s bytes_per_second=216.745M/s items_per_second=6.07091M/s __llvm_libc::bcmp,memcmp Google S
  BM_Bcmp/7/0        168 ns          105 ns      6665216 bytes_per_cycle=0.143159/s bytes_per_second=204.791M/s items_per_second=9.52163M/s __llvm_libc::bcmp,memcmp Google U
  BM_Bcmp/8/0        108 ns         68.0 ns     10175488 bytes_per_cycle=0.125504/s bytes_per_second=179.535M/s items_per_second=14.701M/s __llvm_libc::bcmp,memcmp Google W
  BM_Bcmp/9/0      15371 ns         9007 ns        78848 bytes_per_cycle=0.166128/s bytes_per_second=237.648M/s items_per_second=111.031k/s __llvm_libc::bcmp,uniform 384 to 4096

After

  BM_Bcmp/0/0       31.2 ns         18.4 ns     36732928 bytes_per_cycle=0.415372/s bytes_per_second=594.195M/s items_per_second=54.322M/s __llvm_libc::bcmp,memcmp Google A
  BM_Bcmp/1/0       38.7 ns         20.5 ns     33854464 bytes_per_cycle=1.36738/s bytes_per_second=1.91021G/s items_per_second=48.8896M/s __llvm_libc::bcmp,memcmp Google B
  BM_Bcmp/2/0       30.1 ns         21.7 ns     34470912 bytes_per_cycle=0.662537/s bytes_per_second=947.767M/s items_per_second=46.1837M/s __llvm_libc::bcmp,memcmp Google D
  BM_Bcmp/3/0       37.4 ns         21.2 ns     34407424 bytes_per_cycle=0.55713/s bytes_per_second=796.981M/s items_per_second=47.2556M/s __llvm_libc::bcmp,memcmp Google L
  BM_Bcmp/4/0       27.6 ns         17.7 ns     41377792 bytes_per_cycle=0.335887/s bytes_per_second=480.49M/s items_per_second=56.589M/s __llvm_libc::bcmp,memcmp Google M
  BM_Bcmp/5/0       30.6 ns         18.5 ns     37997568 bytes_per_cycle=0.632823/s bytes_per_second=905.261M/s items_per_second=53.947M/s __llvm_libc::bcmp,memcmp Google Q
  BM_Bcmp/6/0       30.3 ns         20.5 ns     33495040 bytes_per_cycle=1.22539/s bytes_per_second=1.71185G/s items_per_second=48.8604M/s __llvm_libc::bcmp,memcmp Google S
  BM_Bcmp/7/0       37.0 ns         20.0 ns     35427328 bytes_per_cycle=0.762227/s bytes_per_second=1090.37M/s items_per_second=49.9523M/s __llvm_libc::bcmp,memcmp Google U
  BM_Bcmp/8/0       27.4 ns         18.6 ns     37340160 bytes_per_cycle=0.477747/s bytes_per_second=683.423M/s items_per_second=53.7991M/s __llvm_libc::bcmp,memcmp Google W
  BM_Bcmp/9/0        195 ns          120 ns      6651904 bytes_per_cycle=12.353/s bytes_per_second=17.2569G/s items_per_second=8.35025M/s __llvm_libc::bcmp,uniform 384 to 4096

For comparison with glibc

  BM_Bcmp/0/0        106 ns         52.6 ns     12906496 bytes_per_cycle=0.142072/s bytes_per_second=203.235M/s items_per_second=19.0271M/s glibc::bcmp,memcmp Google A
  BM_Bcmp/1/0        132 ns         77.1 ns      8905728 bytes_per_cycle=0.365072/s bytes_per_second=522.239M/s items_per_second=12.9782M/s glibc::bcmp,memcmp Google B
  BM_Bcmp/2/0        122 ns         62.3 ns     10909696 bytes_per_cycle=0.222667/s bytes_per_second=318.527M/s items_per_second=16.0563M/s glibc::bcmp,memcmp Google D
  BM_Bcmp/3/0       99.5 ns         64.2 ns     11074560 bytes_per_cycle=0.185126/s bytes_per_second=264.825M/s items_per_second=15.5674M/s glibc::bcmp,memcmp Google L
  BM_Bcmp/4/0       86.6 ns         50.2 ns     13488128 bytes_per_cycle=0.117941/s bytes_per_second=168.717M/s items_per_second=19.9053M/s glibc::bcmp,memcmp Google M
  BM_Bcmp/5/0        106 ns         61.4 ns     11344896 bytes_per_cycle=0.248968/s bytes_per_second=356.151M/s items_per_second=16.284M/s glibc::bcmp,memcmp Google Q
  BM_Bcmp/6/0        145 ns         71.9 ns     10046464 bytes_per_cycle=0.389814/s bytes_per_second=557.633M/s items_per_second=13.9019M/s glibc::bcmp,memcmp Google S
  BM_Bcmp/7/0        119 ns         65.6 ns     10718208 bytes_per_cycle=0.243756/s bytes_per_second=348.696M/s items_per_second=15.2329M/s glibc::bcmp,memcmp Google U
  BM_Bcmp/8/0       86.4 ns         54.5 ns     13250560 bytes_per_cycle=0.154831/s bytes_per_second=221.488M/s items_per_second=18.3532M/s glibc::bcmp,memcmp Google W
  BM_Bcmp/9/0       1090 ns          604 ns      1186816 bytes_per_cycle=2.53848/s bytes_per_second=3.54622G/s items_per_second=1.65598M/s glibc::bcmp,uniform 384 to 4096


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D150567

Files:
  libc/src/string/memory_utils/bcmp_implementations.h


Index: libc/src/string/memory_utils/bcmp_implementations.h
===================================================================
--- libc/src/string/memory_utils/bcmp_implementations.h
+++ libc/src/string/memory_utils/bcmp_implementations.h
@@ -22,14 +22,67 @@
 namespace __llvm_libc {
 
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) { // compares bytes [offset, count) of p1 and p2
   LIBC_LOOP_NOUNROLL
-  for (size_t offset = 0; offset < count; ++offset)
-    if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
+  for (; offset < count; ++offset) // caller chooses the starting offset
+    if (uint32_t value = (p1[offset] != p2[offset])) // 1 as soon as a byte pair differs
       return value;
   return BcmpReturnType::ZERO();
 }
 
+[[maybe_unused]] LIBC_INLINE BcmpReturnType
+inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) { // bcmp over count bytes, compared in 64-bit words
+  constexpr size_t kAlign = sizeof(uint64_t); // size of one compared word
+  if (count <= 2 * kAlign) // short input: byte loop only
+    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+  size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1); // presumably bytes until p1 is kAlign-aligned; confirm in helper
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+    return value; // mismatch already found in the leading bytes
+  size_t offset = bytes_to_p1_align;
+  size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset); // computed once, before the loop
+  for (; offset < count - kAlign; offset += kAlign) { // count > 2 * kAlign here, so no underflow
+    uint64_t a; // word taken from p2
+    if (p2_alignment == 0)
+      a = load64_aligned<uint64_t>(p2, offset);
+    else if (p2_alignment == 4)
+      a = load64_aligned<uint32_t, uint32_t>(p2, offset);
+    else if (p2_alignment == 2)
+      a = load64_aligned<uint16_t, uint16_t, uint16_t, uint16_t>(p2, offset);
+    else // NOTE(review): p2_alignment == 6 also lands here; confirm the uint16_t pieces stay aligned
+      a = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
+          p2, offset);
+    uint64_t b = load64_aligned<uint64_t>(p1, offset); // word taken from p1
+    if (a != b)
+      return 1U; // nonzero result: the words differ
+  }
+  return inline_bcmp_byte_per_byte(p1, p2, offset, count); // remaining bytes from offset up to count
+}
+
+[[maybe_unused]] LIBC_INLINE BcmpReturnType
+inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) { // bcmp over count bytes, compared in 32-bit words
+  constexpr size_t kAlign = sizeof(uint32_t); // size of one compared word
+  if (count <= 2 * kAlign) // short input: byte loop only
+    return inline_bcmp_byte_per_byte(p1, p2, 0, count);
+  size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1); // presumably bytes until p1 is kAlign-aligned; confirm in helper
+  if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+    return value; // mismatch already found in the leading bytes
+  size_t offset = bytes_to_p1_align;
+  size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset); // computed once, before the loop
+  for (; offset < count - kAlign; offset += kAlign) { // count > 2 * kAlign here, so no underflow
+    uint32_t a; // word taken from p2
+    if (p2_alignment == 0)
+      a = load32_aligned<uint32_t>(p2, offset);
+    else if (p2_alignment == 2)
+      a = load32_aligned<uint16_t, uint16_t>(p2, offset);
+    else // any other p2_alignment value
+      a = load32_aligned<uint8_t, uint16_t, uint8_t>(p2, offset);
+    uint32_t b = load32_aligned<uint32_t>(p1, offset); // word taken from p1
+    if (a != b)
+      return 1U; // nonzero result: the words differ
+  }
+  return inline_bcmp_byte_per_byte(p1, p2, offset, count); // remaining bytes from offset up to count
+}
+
 #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
 [[maybe_unused]] LIBC_INLINE BcmpReturnType
 inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
@@ -167,8 +220,12 @@
   return inline_bcmp_x86(p1, p2, count);
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
   return inline_bcmp_aarch64(p1, p2, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV64)
+  return inline_bcmp_aligned_access_64bit(p1, p2, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
+  return inline_bcmp_aligned_access_32bit(p1, p2, count);
 #else
-  return inline_bcmp_embedded_tiny(p1, p2, count);
+  return inline_bcmp_byte_per_byte(p1, p2, 0, count);
 #endif
 }
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D150567.522154.patch
Type: text/x-patch
Size: 3563 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/libc-commits/attachments/20230515/aacdb6f8/attachment-0001.bin>


More information about the libc-commits mailing list