[libc-commits] [PATCH] D150663: [libc] Add optimized memcmp for RISCV
Guillaume Chatelet via Phabricator via libc-commits
libc-commits at lists.llvm.org
Tue May 16 05:46:03 PDT 2023
gchatelet created this revision.
gchatelet added a reviewer: sivachandra.
Herald added subscribers: libc-commits, VincentWu, vkmr, ecnelises, luismarques, sameer.abuasal, tschuett, s.egerton, Jim, benna, psnobl, PkmX, rogfer01, shiva0217, kito-cheng, simoncook, asb, kristof.beyls, arichardson.
Herald added projects: libc-project, All.
gchatelet requested review of this revision.
Herald added subscribers: pcwang-thead, eopXD.
This patch adds two versions of `memcmp` optimized for architectures where unaligned accesses are either illegal or extremely slow.
It is currently enabled for RISCV 64 and RISCV 32, but it could be used for ARM 32 architectures as well.
Here is the before / after output of `libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_memcmp` on a quad core Linux starfive RISCV 64 board running at 1.5GHz.
Before
Run on (4 X 1500 MHz CPU s)
CPU Caches:
L1 Instruction 32 KiB (x4)
L1 Data 32 KiB (x4)
L2 Unified 2048 KiB (x1)
----------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
----------------------------------------------------------------------
BM_Memcmp/0/0 110 ns 66.4 ns 10404864 bytes_per_cycle=0.107646/s bytes_per_second=153.989M/s items_per_second=15.071M/s __llvm_libc::memcmp,memcmp Google A
BM_Memcmp/1/0 318 ns 211 ns 3026944 bytes_per_cycle=0.131539/s bytes_per_second=188.167M/s items_per_second=4.73691M/s __llvm_libc::memcmp,memcmp Google B
BM_Memcmp/2/0 204 ns 115 ns 6118400 bytes_per_cycle=0.121675/s bytes_per_second=174.058M/s items_per_second=8.70241M/s __llvm_libc::memcmp,memcmp Google D
BM_Memcmp/3/0 143 ns 99.6 ns 7013376 bytes_per_cycle=0.117974/s bytes_per_second=168.763M/s items_per_second=10.0437M/s __llvm_libc::memcmp,memcmp Google L
BM_Memcmp/4/0 81.3 ns 58.2 ns 11426816 bytes_per_cycle=0.101125/s bytes_per_second=144.661M/s items_per_second=17.1805M/s __llvm_libc::memcmp,memcmp Google M
BM_Memcmp/5/0 177 ns 118 ns 5952512 bytes_per_cycle=0.120612/s bytes_per_second=172.537M/s items_per_second=8.45549M/s __llvm_libc::memcmp,memcmp Google Q
BM_Memcmp/6/0 342 ns 220 ns 3483648 bytes_per_cycle=0.132004/s bytes_per_second=188.834M/s items_per_second=4.54739M/s __llvm_libc::memcmp,memcmp Google S
BM_Memcmp/7/0 208 ns 130 ns 5681152 bytes_per_cycle=0.12468/s bytes_per_second=178.356M/s items_per_second=7.6674M/s __llvm_libc::memcmp,memcmp Google U
BM_Memcmp/8/0 123 ns 79.1 ns 8387584 bytes_per_cycle=0.110593/s bytes_per_second=158.204M/s items_per_second=12.6439M/s __llvm_libc::memcmp,memcmp Google W
BM_Memcmp/9/0 20707 ns 10643 ns 67584 bytes_per_cycle=0.142401/s bytes_per_second=203.707M/s items_per_second=93.9559k/s __llvm_libc::memcmp,uniform 384 to 4096
After
BM_Memcmp/0/0 90.3 ns 55.3 ns 12578816 bytes_per_cycle=0.133409/s bytes_per_second=190.843M/s items_per_second=18.0749M/s __llvm_libc::memcmp,memcmp Google A
BM_Memcmp/1/0 152 ns 77.7 ns 8671232 bytes_per_cycle=0.344299/s bytes_per_second=492.524M/s items_per_second=12.8726M/s __llvm_libc::memcmp,memcmp Google B
BM_Memcmp/2/0 104 ns 63.6 ns 11100160 bytes_per_cycle=0.226234/s bytes_per_second=323.631M/s items_per_second=15.7338M/s __llvm_libc::memcmp,memcmp Google D
BM_Memcmp/3/0 114 ns 63.4 ns 11161600 bytes_per_cycle=0.186642/s bytes_per_second=266.993M/s items_per_second=15.7665M/s __llvm_libc::memcmp,memcmp Google L
BM_Memcmp/4/0 86.0 ns 52.3 ns 13200384 bytes_per_cycle=0.11276/s bytes_per_second=161.305M/s items_per_second=19.1127M/s __llvm_libc::memcmp,memcmp Google M
BM_Memcmp/5/0 107 ns 63.5 ns 10646528 bytes_per_cycle=0.197951/s bytes_per_second=283.171M/s items_per_second=15.7565M/s __llvm_libc::memcmp,memcmp Google Q
BM_Memcmp/6/0 131 ns 74.7 ns 9230336 bytes_per_cycle=0.358615/s bytes_per_second=513.003M/s items_per_second=13.3941M/s __llvm_libc::memcmp,memcmp Google S
BM_Memcmp/7/0 137 ns 68.8 ns 10306560 bytes_per_cycle=0.229953/s bytes_per_second=328.95M/s items_per_second=14.5448M/s __llvm_libc::memcmp,memcmp Google U
BM_Memcmp/8/0 98.5 ns 55.3 ns 12645376 bytes_per_cycle=0.156614/s bytes_per_second=224.038M/s items_per_second=18.0858M/s __llvm_libc::memcmp,memcmp Google W
BM_Memcmp/9/0 1572 ns 1002 ns 719872 bytes_per_cycle=1.49528/s bytes_per_second=2.08888G/s items_per_second=997.737k/s __llvm_libc::memcmp,uniform 384 to 4096
glibc
BM_Memcmp/0/0 72.6 ns 51.7 ns 12963840 bytes_per_cycle=0.141261/s bytes_per_second=202.075M/s items_per_second=19.3246M/s glibc::memcmp,memcmp Google A
BM_Memcmp/1/0 118 ns 75.2 ns 9280512 bytes_per_cycle=0.354054/s bytes_per_second=506.478M/s items_per_second=13.3046M/s glibc::memcmp,memcmp Google B
BM_Memcmp/2/0 114 ns 62.9 ns 11152384 bytes_per_cycle=0.222675/s bytes_per_second=318.539M/s items_per_second=15.8943M/s glibc::memcmp,memcmp Google D
BM_Memcmp/3/0 84.0 ns 63.5 ns 11030528 bytes_per_cycle=0.186353/s bytes_per_second=266.581M/s items_per_second=15.7378M/s glibc::memcmp,memcmp Google L
BM_Memcmp/4/0 93.5 ns 51.2 ns 13462528 bytes_per_cycle=0.119215/s bytes_per_second=170.539M/s items_per_second=19.5384M/s glibc::memcmp,memcmp Google M
BM_Memcmp/5/0 123 ns 61.7 ns 11376640 bytes_per_cycle=0.225262/s bytes_per_second=322.239M/s items_per_second=16.1993M/s glibc::memcmp,memcmp Google Q
BM_Memcmp/6/0 122 ns 71.6 ns 9967616 bytes_per_cycle=0.380844/s bytes_per_second=544.802M/s items_per_second=13.9579M/s glibc::memcmp,memcmp Google S
BM_Memcmp/7/0 118 ns 65.6 ns 10555392 bytes_per_cycle=0.238677/s bytes_per_second=341.43M/s items_per_second=15.2334M/s glibc::memcmp,memcmp Google U
BM_Memcmp/8/0 90.4 ns 54.0 ns 12920832 bytes_per_cycle=0.161987/s bytes_per_second=231.724M/s items_per_second=18.5169M/s glibc::memcmp,memcmp Google W
BM_Memcmp/9/0 1045 ns 601 ns 1195008 bytes_per_cycle=2.53677/s bytes_per_second=3.54383G/s items_per_second=1.66423M/s glibc::memcmp,uniform 384 to 4096
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D150663
Files:
libc/src/string/memory_utils/memcmp_implementations.h
Index: libc/src/string/memory_utils/memcmp_implementations.h
===================================================================
--- libc/src/string/memory_utils/memcmp_implementations.h
+++ libc/src/string/memory_utils/memcmp_implementations.h
@@ -26,21 +26,86 @@
namespace __llvm_libc {
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
-inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
LIBC_LOOP_NOUNROLL
- for (size_t offset = 0; offset < count; ++offset)
- if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
+ for (; offset < count; ++offset)
+ if (auto value = (uint8_t)p1[offset] - (uint8_t)p2[offset])
return value;
return MemcmpReturnType::ZERO();
}
+[[maybe_unused]] LIBC_INLINE MemcmpReturnType
+inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
+ constexpr size_t kAlign = sizeof(uint64_t);
+ if (count <= 2 * kAlign)
+ return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+ size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
+ if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+ return value;
+ size_t offset = bytes_to_p1_align;
+ size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
+ for (; offset < count - kAlign; offset += kAlign) {
+ uint64_t a;
+ if (p2_alignment == 0)
+ a = load64_aligned<uint64_t>(p2, offset);
+ else if (p2_alignment == 4)
+ a = load64_aligned<uint32_t, uint32_t>(p2, offset);
+ else if (p2_alignment == 2)
+ a = load64_aligned<uint16_t, uint16_t, uint16_t, uint16_t>(p2, offset);
+ else
+ a = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
+ p2, offset);
+ uint64_t b = load64_aligned<uint64_t>(p1, offset);
+ if (a != b) {
+ // TODO use cmp_neq_uint64_t from D148717 once it's submitted.
+ return Endian::to_big_endian(a) < Endian::to_big_endian(b) ? -1 : 1;
+ }
+ }
+ return inline_memcmp_byte_per_byte(p1, p2, offset, count);
+}
+
+[[maybe_unused]] LIBC_INLINE MemcmpReturnType
+inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
+ constexpr size_t kAlign = sizeof(uint32_t);
+ if (count <= 2 * kAlign)
+ return inline_memcmp_byte_per_byte(p1, p2, 0, count);
+ size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
+ if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
+ return value;
+ size_t offset = bytes_to_p1_align;
+ size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
+ for (; offset < count - kAlign; offset += kAlign) {
+ uint32_t a;
+ if (p2_alignment == 0)
+ a = load32_aligned<uint32_t>(p2, offset);
+ else if (p2_alignment == 2)
+ a = load32_aligned<uint16_t, uint16_t>(p2, offset);
+ else
+ a = load32_aligned<uint8_t, uint16_t, uint8_t>(p2, offset);
+ uint32_t b = load32_aligned<uint32_t>(p1, offset);
+ if (a != b) {
+ // TODO use cmp_uint32_t from D148717 once it's submitted.
+ // We perform the difference as an int64_t so it cannot overflow.
+ const int64_t diff = static_cast<int64_t>(Endian::to_big_endian(a)) -
+ static_cast<int64_t>(Endian::to_big_endian(b));
+ // And reduce the int64_t into an int32_t, keeping its sign and non-zeroness.
+ return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
+ }
+ }
+ return inline_memcmp_byte_per_byte(p1, p2, offset, count);
+}
+
LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
#if defined(LIBC_TARGET_ARCH_IS_X86)
return inline_memcmp_x86(p1, p2, count);
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
return inline_memcmp_aarch64(p1, p2, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV64)
+ return inline_memcmp_aligned_access_64bit(p1, p2, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
+ return inline_memcmp_aligned_access_32bit(p1, p2, count);
#else
- return inline_memcmp_embedded_tiny(p1, p2, count);
+ return inline_memcmp_byte_per_byte(p1, p2, 0, count);
#endif
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D150663.522561.patch
Type: text/x-patch
Size: 4052 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/libc-commits/attachments/20230516/89575e5f/attachment-0001.bin>
More information about the libc-commits
mailing list