[libc-commits] [PATCH] D150663: [libc] Add optimized memcmp for RISCV

Guillaume Chatelet via Phabricator via libc-commits libc-commits at lists.llvm.org
Tue May 16 05:46:03 PDT 2023


gchatelet created this revision.
gchatelet added a reviewer: sivachandra.
Herald added subscribers: libc-commits, VincentWu, vkmr, ecnelises, luismarques, sameer.abuasal, tschuett, s.egerton, Jim, benna, psnobl, PkmX, rogfer01, shiva0217, kito-cheng, simoncook, asb, kristof.beyls, arichardson.
Herald added projects: libc-project, All.
gchatelet requested review of this revision.
Herald added subscribers: pcwang-thead, eopXD.

This patch adds two versions of `memcmp` optimized for architectures where unaligned accesses are either illegal or extremely slow.
It is currently enabled for RISCV 64 and RISCV 32 but it could be used for ARM 32 architectures as well.

Here is the before / after output of `libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_memcmp` on a quad core Linux starfive RISCV 64 board running at 1.5GHz.

Before

  Run on (4 X 1500 MHz CPU s)
  CPU Caches:
    L1 Instruction 32 KiB (x4)
    L1 Data 32 KiB (x4)
    L2 Unified 2048 KiB (x1)
  ----------------------------------------------------------------------
  Benchmark            Time             CPU   Iterations UserCounters...
  ----------------------------------------------------------------------
  BM_Memcmp/0/0        110 ns         66.4 ns     10404864 bytes_per_cycle=0.107646/s bytes_per_second=153.989M/s items_per_second=15.071M/s __llvm_libc::memcmp,memcmp Google A
  BM_Memcmp/1/0        318 ns          211 ns      3026944 bytes_per_cycle=0.131539/s bytes_per_second=188.167M/s items_per_second=4.73691M/s __llvm_libc::memcmp,memcmp Google B
  BM_Memcmp/2/0        204 ns          115 ns      6118400 bytes_per_cycle=0.121675/s bytes_per_second=174.058M/s items_per_second=8.70241M/s __llvm_libc::memcmp,memcmp Google D
  BM_Memcmp/3/0        143 ns         99.6 ns      7013376 bytes_per_cycle=0.117974/s bytes_per_second=168.763M/s items_per_second=10.0437M/s __llvm_libc::memcmp,memcmp Google L
  BM_Memcmp/4/0       81.3 ns         58.2 ns     11426816 bytes_per_cycle=0.101125/s bytes_per_second=144.661M/s items_per_second=17.1805M/s __llvm_libc::memcmp,memcmp Google M
  BM_Memcmp/5/0        177 ns          118 ns      5952512 bytes_per_cycle=0.120612/s bytes_per_second=172.537M/s items_per_second=8.45549M/s __llvm_libc::memcmp,memcmp Google Q
  BM_Memcmp/6/0        342 ns          220 ns      3483648 bytes_per_cycle=0.132004/s bytes_per_second=188.834M/s items_per_second=4.54739M/s __llvm_libc::memcmp,memcmp Google S
  BM_Memcmp/7/0        208 ns          130 ns      5681152 bytes_per_cycle=0.12468/s bytes_per_second=178.356M/s items_per_second=7.6674M/s __llvm_libc::memcmp,memcmp Google U
  BM_Memcmp/8/0        123 ns         79.1 ns      8387584 bytes_per_cycle=0.110593/s bytes_per_second=158.204M/s items_per_second=12.6439M/s __llvm_libc::memcmp,memcmp Google W
  BM_Memcmp/9/0      20707 ns        10643 ns        67584 bytes_per_cycle=0.142401/s bytes_per_second=203.707M/s items_per_second=93.9559k/s __llvm_libc::memcmp,uniform 384 to 4096

After

  BM_Memcmp/0/0       90.3 ns         55.3 ns     12578816 bytes_per_cycle=0.133409/s bytes_per_second=190.843M/s items_per_second=18.0749M/s __llvm_libc::memcmp,memcmp Google A
  BM_Memcmp/1/0        152 ns         77.7 ns      8671232 bytes_per_cycle=0.344299/s bytes_per_second=492.524M/s items_per_second=12.8726M/s __llvm_libc::memcmp,memcmp Google B
  BM_Memcmp/2/0        104 ns         63.6 ns     11100160 bytes_per_cycle=0.226234/s bytes_per_second=323.631M/s items_per_second=15.7338M/s __llvm_libc::memcmp,memcmp Google D
  BM_Memcmp/3/0        114 ns         63.4 ns     11161600 bytes_per_cycle=0.186642/s bytes_per_second=266.993M/s items_per_second=15.7665M/s __llvm_libc::memcmp,memcmp Google L
  BM_Memcmp/4/0       86.0 ns         52.3 ns     13200384 bytes_per_cycle=0.11276/s bytes_per_second=161.305M/s items_per_second=19.1127M/s __llvm_libc::memcmp,memcmp Google M
  BM_Memcmp/5/0        107 ns         63.5 ns     10646528 bytes_per_cycle=0.197951/s bytes_per_second=283.171M/s items_per_second=15.7565M/s __llvm_libc::memcmp,memcmp Google Q
  BM_Memcmp/6/0        131 ns         74.7 ns      9230336 bytes_per_cycle=0.358615/s bytes_per_second=513.003M/s items_per_second=13.3941M/s __llvm_libc::memcmp,memcmp Google S
  BM_Memcmp/7/0        137 ns         68.8 ns     10306560 bytes_per_cycle=0.229953/s bytes_per_second=328.95M/s items_per_second=14.5448M/s __llvm_libc::memcmp,memcmp Google U
  BM_Memcmp/8/0       98.5 ns         55.3 ns     12645376 bytes_per_cycle=0.156614/s bytes_per_second=224.038M/s items_per_second=18.0858M/s __llvm_libc::memcmp,memcmp Google W
  BM_Memcmp/9/0       1572 ns         1002 ns       719872 bytes_per_cycle=1.49528/s bytes_per_second=2.08888G/s items_per_second=997.737k/s __llvm_libc::memcmp,uniform 384 to 4096

glibc

  BM_Memcmp/0/0       72.6 ns         51.7 ns     12963840 bytes_per_cycle=0.141261/s bytes_per_second=202.075M/s items_per_second=19.3246M/s glibc::memcmp,memcmp Google A
  BM_Memcmp/1/0        118 ns         75.2 ns      9280512 bytes_per_cycle=0.354054/s bytes_per_second=506.478M/s items_per_second=13.3046M/s glibc::memcmp,memcmp Google B
  BM_Memcmp/2/0        114 ns         62.9 ns     11152384 bytes_per_cycle=0.222675/s bytes_per_second=318.539M/s items_per_second=15.8943M/s glibc::memcmp,memcmp Google D
  BM_Memcmp/3/0       84.0 ns         63.5 ns     11030528 bytes_per_cycle=0.186353/s bytes_per_second=266.581M/s items_per_second=15.7378M/s glibc::memcmp,memcmp Google L
  BM_Memcmp/4/0       93.5 ns         51.2 ns     13462528 bytes_per_cycle=0.119215/s bytes_per_second=170.539M/s items_per_second=19.5384M/s glibc::memcmp,memcmp Google M
  BM_Memcmp/5/0        123 ns         61.7 ns     11376640 bytes_per_cycle=0.225262/s bytes_per_second=322.239M/s items_per_second=16.1993M/s glibc::memcmp,memcmp Google Q
  BM_Memcmp/6/0        122 ns         71.6 ns      9967616 bytes_per_cycle=0.380844/s bytes_per_second=544.802M/s items_per_second=13.9579M/s glibc::memcmp,memcmp Google S
  BM_Memcmp/7/0        118 ns         65.6 ns     10555392 bytes_per_cycle=0.238677/s bytes_per_second=341.43M/s items_per_second=15.2334M/s glibc::memcmp,memcmp Google U
  BM_Memcmp/8/0       90.4 ns         54.0 ns     12920832 bytes_per_cycle=0.161987/s bytes_per_second=231.724M/s items_per_second=18.5169M/s glibc::memcmp,memcmp Google W
  BM_Memcmp/9/0       1045 ns          601 ns      1195008 bytes_per_cycle=2.53677/s bytes_per_second=3.54383G/s items_per_second=1.66423M/s glibc::memcmp,uniform 384 to 4096


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D150663

Files:
  libc/src/string/memory_utils/memcmp_implementations.h


Index: libc/src/string/memory_utils/memcmp_implementations.h
===================================================================
--- libc/src/string/memory_utils/memcmp_implementations.h
+++ libc/src/string/memory_utils/memcmp_implementations.h
@@ -26,21 +26,86 @@
 namespace __llvm_libc {
 
 [[maybe_unused]] LIBC_INLINE MemcmpReturnType
-inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
   LIBC_LOOP_NOUNROLL
-  for (size_t offset = 0; offset < count; ++offset)
-    if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
+  for (; offset < count; ++offset) // Compares the bytes in [offset, count).
+    if (auto value = (uint8_t)p1[offset] - (uint8_t)p2[offset]) // p1 - p2
       return value;
   return MemcmpReturnType::ZERO();
 }
 
+[[maybe_unused]] LIBC_INLINE MemcmpReturnType
+inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
+  constexpr size_t kAlign = sizeof(uint64_t);
+  if (count <= 2 * kAlign) // Small inputs: plain byte loop.
+    return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+  size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
+  if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+    return value;
+  size_t offset = bytes_to_p1_align;
+  size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
+  for (; offset < count - kAlign; offset += kAlign) {
+    uint64_t b; // Word from p2, built from loads that match its alignment.
+    if (p2_alignment == 0)
+      b = load64_aligned<uint64_t>(p2, offset);
+    else if (p2_alignment == 4)
+      b = load64_aligned<uint32_t, uint32_t>(p2, offset);
+    else if (p2_alignment % 2 == 0) // 2 or 6: 16-bit aligned only.
+      b = load64_aligned<uint16_t, uint16_t, uint16_t, uint16_t>(p2, offset);
+    else
+      b = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
+          p2, offset);
+    uint64_t a = load64_aligned<uint64_t>(p1, offset); // Word from p1.
+    if (a != b) {
+      // Sign follows p1 - p2. TODO: use cmp_neq_uint64_t (D148717) once landed.
+      return Endian::to_big_endian(a) < Endian::to_big_endian(b) ? -1 : 1;
+    }
+  }
+  return inline_memcmp_byte_per_byte(p1, p2, offset, count);
+}
+
+[[maybe_unused]] LIBC_INLINE MemcmpReturnType
+inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
+  constexpr size_t kAlign = sizeof(uint32_t);
+  if (count <= 2 * kAlign) // Small inputs: plain byte loop.
+    return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+  size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
+  if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+    return value;
+  size_t offset = bytes_to_p1_align;
+  size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
+  for (; offset < count - kAlign; offset += kAlign) {
+    uint32_t b; // Word from p2, built from loads that match its alignment.
+    if (p2_alignment == 0)
+      b = load32_aligned<uint32_t>(p2, offset);
+    else if (p2_alignment == 2)
+      b = load32_aligned<uint16_t, uint16_t>(p2, offset);
+    else
+      b = load32_aligned<uint8_t, uint16_t, uint8_t>(p2, offset);
+    uint32_t a = load32_aligned<uint32_t>(p1, offset); // Word from p1.
+    if (a != b) {
+      // TODO use cmp_uint32_t from D148717 once it's submitted.
+      // We perform the difference (p1 word minus p2 word) as an int64_t.
+      const int64_t diff = static_cast<int64_t>(Endian::to_big_endian(a)) -
+                           static_cast<int64_t>(Endian::to_big_endian(b));
+      // Reduce the int64_t to an int32_t with the same sign (never zero).
+      return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
+    }
+  }
+  return inline_memcmp_byte_per_byte(p1, p2, offset, count);
+}
+
 LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
 #if defined(LIBC_TARGET_ARCH_IS_X86)
   return inline_memcmp_x86(p1, p2, count);
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
   return inline_memcmp_aarch64(p1, p2, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV64) // Aligned-access 64-bit variant.
+  return inline_memcmp_aligned_access_64bit(p1, p2, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV32) // Aligned-access 32-bit variant.
+  return inline_memcmp_aligned_access_32bit(p1, p2, count);
 #else
-  return inline_memcmp_embedded_tiny(p1, p2, count);
+  return inline_memcmp_byte_per_byte(p1, p2, 0, count); // Other targets.
 #endif
 }
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D150663.522561.patch
Type: text/x-patch
Size: 4052 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/libc-commits/attachments/20230516/89575e5f/attachment-0001.bin>


More information about the libc-commits mailing list