[libc-commits] [PATCH] D150433: [libc] Add optimized memset for RISCV
Guillaume Chatelet via Phabricator via libc-commits
libc-commits at lists.llvm.org
Fri May 12 02:16:05 PDT 2023
gchatelet created this revision.
gchatelet added a reviewer: sivachandra.
Herald added subscribers: libc-commits, VincentWu, vkmr, ecnelises, evandro, luismarques, sameer.abuasal, tschuett, s.egerton, Jim, benna, psnobl, PkmX, rogfer01, shiva0217, kito-cheng, simoncook, asb, kristof.beyls, arichardson.
Herald added projects: libc-project, All.
gchatelet requested review of this revision.
Herald added subscribers: pcwang-thead, eopXD.
This patch adds two versions of `memset` optimized for architectures where unaligned accesses are either illegal or extremely slow.
It is currently enabled for RISCV 64 and RISCV 32, but it could be used for 32-bit ARM architectures as well.
Here is the before / after output of `libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_Memset` on a quad-core StarFive RISCV 64 board running Linux at 1.5 GHz.
Before
Run on (4 X 1500 MHz CPU s)
CPU Caches:
L1 Instruction 32 KiB (x4)
L1 Data 32 KiB (x4)
L2 Unified 2048 KiB (x1)
------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
------------------------------------------------------------------------
BM_Memset/0/0 506 ns 252 ns 2883584 bytes_per_cycle=0.238312/s bytes_per_second=340.908M/s items_per_second=3.96043M/s __llvm_libc::memset,memset Google A
BM_Memset/1/0 296 ns 189 ns 2900992 bytes_per_cycle=0.234589/s bytes_per_second=335.583M/s items_per_second=5.29382M/s __llvm_libc::memset,memset Google B
BM_Memset/2/0 2110 ns 1049 ns 678912 bytes_per_cycle=0.24687/s bytes_per_second=353.151M/s items_per_second=953.527k/s __llvm_libc::memset,memset Google D
BM_Memset/3/0 397 ns 254 ns 3055616 bytes_per_cycle=0.238479/s bytes_per_second=341.147M/s items_per_second=3.93224M/s __llvm_libc::memset,memset Google L
BM_Memset/4/0 1119 ns 621 ns 1079296 bytes_per_cycle=0.244925/s bytes_per_second=350.368M/s items_per_second=1.61047M/s __llvm_libc::memset,memset Google M
BM_Memset/5/0 605 ns 349 ns 1644544 bytes_per_cycle=0.241364/s bytes_per_second=345.274M/s items_per_second=2.8614M/s __llvm_libc::memset,memset Google Q
BM_Memset/6/0 472 ns 271 ns 2310144 bytes_per_cycle=0.238615/s bytes_per_second=341.341M/s items_per_second=3.68799M/s __llvm_libc::memset,memset Google S
BM_Memset/7/0 262 ns 143 ns 3956736 bytes_per_cycle=0.225812/s bytes_per_second=323.026M/s items_per_second=7.0087M/s __llvm_libc::memset,memset Google U
BM_Memset/8/0 454 ns 261 ns 2940928 bytes_per_cycle=0.238883/s bytes_per_second=341.725M/s items_per_second=3.82716M/s __llvm_libc::memset,memset Google W
BM_Memset/9/0 8768 ns 5998 ns 115712 bytes_per_cycle=0.249196/s bytes_per_second=356.478M/s items_per_second=166.724k/s __llvm_libc::memset,uniform 384 to 4096
After
BM_Memset/0/0 117 ns 69.5 ns 9761792 bytes_per_cycle=0.935152/s bytes_per_second=1.30639G/s items_per_second=14.3834M/s __llvm_libc::memset,memset Google A
BM_Memset/1/0 97.8 ns 58.5 ns 13002752 bytes_per_cycle=0.892814/s bytes_per_second=1.24725G/s items_per_second=17.0848M/s __llvm_libc::memset,memset Google B
BM_Memset/2/0 326 ns 163 ns 5156864 bytes_per_cycle=1.54408/s bytes_per_second=2.15706G/s items_per_second=6.1192M/s __llvm_libc::memset,memset Google D
BM_Memset/3/0 132 ns 65.4 ns 11455488 bytes_per_cycle=0.876411/s bytes_per_second=1.22433G/s items_per_second=15.2803M/s __llvm_libc::memset,memset Google L
BM_Memset/4/0 222 ns 120 ns 6405120 bytes_per_cycle=1.44398/s bytes_per_second=2.01722G/s items_per_second=8.30758M/s __llvm_libc::memset,memset Google M
BM_Memset/5/0 119 ns 79.2 ns 8930304 bytes_per_cycle=1.13327/s bytes_per_second=1.58317G/s items_per_second=12.6189M/s __llvm_libc::memset,memset Google Q
BM_Memset/6/0 123 ns 64.0 ns 11609088 bytes_per_cycle=1.008/s bytes_per_second=1.40817G/s items_per_second=15.6365M/s __llvm_libc::memset,memset Google S
BM_Memset/7/0 85.9 ns 52.1 ns 12423168 bytes_per_cycle=0.641164/s bytes_per_second=917.192M/s items_per_second=19.1937M/s __llvm_libc::memset,memset Google U
BM_Memset/8/0 114 ns 67.1 ns 10347520 bytes_per_cycle=0.911968/s bytes_per_second=1.274G/s items_per_second=14.9015M/s __llvm_libc::memset,memset Google W
BM_Memset/9/0 1326 ns 785 ns 907264 bytes_per_cycle=1.89716/s bytes_per_second=2.6503G/s items_per_second=1.27348M/s __llvm_libc::memset,uniform 384 to 4096
Again, this is not as good as the current glibc implementation (results below), but it's a first step in the right direction.
BM_Memset/0/0 108 ns 53.6 ns 12894208 bytes_per_cycle=1.02858/s bytes_per_second=1.4369G/s items_per_second=18.668M/s glibc::memset,memset Google A
BM_Memset/1/0 84.6 ns 47.6 ns 14284800 bytes_per_cycle=1.00197/s bytes_per_second=1.39974G/s items_per_second=21.0256M/s glibc::memset,memset Google B
BM_Memset/2/0 160 ns 85.8 ns 8927232 bytes_per_cycle=3.30805/s bytes_per_second=4.62129G/s items_per_second=11.6596M/s glibc::memset,memset Google D
BM_Memset/3/0 78.9 ns 53.6 ns 13326336 bytes_per_cycle=1.14058/s bytes_per_second=1.59338G/s items_per_second=18.674M/s glibc::memset,memset Google L
BM_Memset/4/0 99.2 ns 60.8 ns 11460608 bytes_per_cycle=2.54751/s bytes_per_second=3.55884G/s items_per_second=16.4587M/s glibc::memset,memset Google M
BM_Memset/5/0 93.0 ns 56.1 ns 12219392 bytes_per_cycle=1.73379/s bytes_per_second=2.42207G/s items_per_second=17.8157M/s glibc::memset,memset Google Q
BM_Memset/6/0 89.4 ns 47.2 ns 14692352 bytes_per_cycle=1.34846/s bytes_per_second=1.88377G/s items_per_second=21.1795M/s glibc::memset,memset Google S
BM_Memset/7/0 84.0 ns 50.0 ns 14468096 bytes_per_cycle=0.911198/s bytes_per_second=1.27293G/s items_per_second=19.994M/s glibc::memset,memset Google U
BM_Memset/8/0 93.4 ns 52.8 ns 13063168 bytes_per_cycle=1.06642/s bytes_per_second=1.48977G/s items_per_second=18.9524M/s glibc::memset,memset Google W
BM_Memset/9/0 438 ns 241 ns 2853888 bytes_per_cycle=6.1185/s bytes_per_second=8.54744G/s items_per_second=4.15064M/s glibc::memset,uniform 384 to 4096
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D150433
Files:
libc/src/string/memory_utils/memset_implementations.h
Index: libc/src/string/memory_utils/memset_implementations.h
===================================================================
--- libc/src/string/memory_utils/memset_implementations.h
+++ libc/src/string/memory_utils/memset_implementations.h
@@ -23,12 +23,39 @@
namespace __llvm_libc {
[[maybe_unused]] LIBC_INLINE static void
-inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
+inline_memset_byte_per_byte(Ptr dst, size_t offset, uint8_t value,
+ size_t count) {
LIBC_LOOP_NOUNROLL
- for (size_t offset = 0; offset < count; ++offset)
+ for (; offset < count; ++offset)
generic::Memset<uint8_t>::block(dst + offset, value);
}
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_aligned_access_32bit(Ptr dst, uint8_t value, size_t count) {
+ constexpr size_t kAlign = sizeof(uint32_t);
+ if (count <= 2 * kAlign)
+ return inline_memset_byte_per_byte(dst, 0, value, count);
+ size_t bytes_to_dst_align = distance_to_align_up<kAlign>(dst);
+ inline_memset_byte_per_byte(dst, 0, value, bytes_to_dst_align);
+ size_t offset = bytes_to_dst_align;
+ for (; offset < count - kAlign; offset += kAlign)
+ store32_aligned<uint32_t>(generic::splat<uint32_t>(value), dst, offset);
+ inline_memset_byte_per_byte(dst, offset, value, count);
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_aligned_access_64bit(Ptr dst, uint8_t value, size_t count) {
+ constexpr size_t kAlign = sizeof(uint64_t);
+ if (count <= 2 * kAlign)
+ return inline_memset_byte_per_byte(dst, 0, value, count);
+ size_t bytes_to_dst_align = distance_to_align_up<kAlign>(dst);
+ inline_memset_byte_per_byte(dst, 0, value, bytes_to_dst_align);
+ size_t offset = bytes_to_dst_align;
+ for (; offset < count - kAlign; offset += kAlign)
+ store64_aligned<uint64_t>(generic::splat<uint64_t>(value), dst, offset);
+ inline_memset_byte_per_byte(dst, offset, value, count);
+}
+
#if defined(LIBC_TARGET_ARCH_IS_X86)
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
@@ -121,8 +148,12 @@
return inline_memset_x86(dst, value, count);
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
return inline_memset_aarch64(dst, value, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV64)
+ return inline_memset_aligned_access_64bit(dst, value, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
+ return inline_memset_aligned_access_32bit(dst, value, count);
#else
- return inline_memset_embedded_tiny(dst, value, count);
+ return inline_memset_byte_per_byte(dst, 0, value, count);
#endif
}
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D150433.521587.patch
Type: text/x-patch
Size: 2598 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/libc-commits/attachments/20230512/d8e73c22/attachment.bin>
More information about the libc-commits
mailing list