[libc-commits] [libc] memmove optimizations (PR #70043)

Dmitry Vyukov via libc-commits libc-commits at lists.llvm.org
Tue Oct 24 06:55:49 PDT 2023


https://github.com/dvyukov created https://github.com/llvm/llvm-project/pull/70043

See individual commits for description.
```
                     │  baseline   │          small-size-check           │
                     │   sec/op    │   sec/op     vs base                │
    memmove/Google_A   3.208n ± 0%   2.909n ± 0%   -9.31% (n=100)
    memmove/0          2.982n ± 0%   2.168n ± 0%  -27.27% (n=50)
    memmove/1          3.253n ± 0%   2.169n ± 0%  -33.34% (n=50)
    memmove/2          3.255n ± 0%   2.168n ± 6%  -33.40% (n=50)
    memmove/3          3.259n ± 2%   2.175n ± 0%  -33.26% (n=50)
    memmove/4          3.259n ± 0%   2.168n ± 0%  -33.45% (p=0.000 n=50)
    memmove/5          2.488n ± 0%   1.926n ± 0%  -22.57% (n=50)
    memmove/6          2.490n ± 0%   1.928n ± 0%  -22.58% (p=0.000 n=50)
    memmove/7          2.492n ± 0%   1.928n ± 0%  -22.63% (n=50)
    memmove/8          2.737n ± 0%   2.711n ± 0%   -0.97% (p=0.000 n=50)
    memmove/9          2.736n ± 0%   2.711n ± 0%   -0.94% (p=0.000 n=50)
    memmove/10         2.739n ± 0%   2.711n ± 0%   -1.04% (p=0.000 n=50)
    memmove/11         2.740n ± 0%   2.711n ± 0%   -1.07% (p=0.000 n=50)
    memmove/12         2.740n ± 0%   2.711n ± 0%   -1.09% (p=0.000 n=50)
    memmove/13         2.744n ± 0%   2.711n ± 0%   -1.22% (p=0.000 n=50)
    memmove/14         2.742n ± 0%   2.711n ± 0%   -1.14% (p=0.000 n=50)
    memmove/15         2.742n ± 0%   2.711n ± 0%   -1.15% (p=0.000 n=50)
    memmove/16         2.997n ± 0%   2.982n ± 0%   -0.52% (p=0.000 n=50)
    memmove/17         2.998n ± 0%   2.982n ± 0%   -0.55% (p=0.000 n=50)
    memmove/18         2.998n ± 0%   2.982n ± 0%   -0.54% (p=0.000 n=50)
    memmove/19         2.999n ± 0%   2.981n ± 0%   -0.59% (p=0.000 n=50)
    memmove/20         2.998n ± 0%   2.982n ± 0%   -0.55% (p=0.000 n=50)
    memmove/21         3.000n ± 0%   2.982n ± 0%   -0.61% (p=0.000 n=50)
    memmove/22         3.002n ± 0%   2.982n ± 0%   -0.68% (p=0.000 n=50)
    memmove/23         3.002n ± 0%   2.981n ± 0%   -0.67% (p=0.000 n=50)
    memmove/24         3.002n ± 0%   2.981n ± 0%   -0.70% (p=0.000 n=50)
    memmove/25         3.002n ± 0%   2.982n ± 0%   -0.68% (p=0.000 n=50)
    memmove/26         3.004n ± 0%   2.982n ± 0%   -0.74% (n=50)
    memmove/27         3.005n ± 0%   2.982n ± 0%   -0.79% (p=0.000 n=50)
    memmove/28         3.005n ± 0%   2.982n ± 0%   -0.77% (p=0.000 n=50)
    memmove/29         3.009n ± 0%   2.982n ± 0%   -0.92% (n=50)
    memmove/30         3.008n ± 0%   2.982n ± 0%   -0.89% (n=50)
    memmove/31         3.007n ± 0%   2.981n ± 0%   -0.86% (n=50)
    memmove/32         3.540n ± 0%   2.999n ± 0%  -15.30% (p=0.000 n=50)
    memmove/33         3.544n ± 0%   2.998n ± 0%  -15.41% (p=0.000 n=50)
    memmove/34         3.546n ± 0%   2.999n ± 0%  -15.42% (n=50)
    memmove/35         3.545n ± 0%   2.999n ± 0%  -15.41% (p=0.000 n=50)
    memmove/36         3.548n ± 0%   2.998n ± 0%  -15.51% (p=0.000 n=50)
    memmove/37         3.546n ± 0%   2.999n ± 0%  -15.44% (n=50)
    memmove/38         3.549n ± 0%   2.999n ± 0%  -15.50% (p=0.000 n=50)
    memmove/39         3.549n ± 0%   2.999n ± 0%  -15.48% (p=0.000 n=50)
    memmove/40         3.549n ± 0%   3.000n ± 0%  -15.48% (p=0.000 n=50)
    memmove/41         3.550n ± 0%   3.000n ± 0%  -15.49% (p=0.000 n=50)
    memmove/42         3.549n ± 0%   3.001n ± 0%  -15.45% (n=50)
    memmove/43         3.552n ± 0%   3.000n ± 0%  -15.54% (p=0.000 n=50)
    memmove/44         3.552n ± 0%   3.002n ± 0%  -15.49% (n=50)
    memmove/45         3.552n ± 0%   3.001n ± 0%  -15.49% (p=0.000 n=50)
    memmove/46         3.554n ± 0%   3.002n ± 0%  -15.52% (p=0.000 n=50)
    memmove/47         3.556n ± 0%   3.003n ± 0%  -15.56% (p=0.000 n=50)
    memmove/48         3.555n ± 0%   3.002n ± 0%  -15.56% (p=0.000 n=50)
    memmove/49         3.557n ± 0%   3.002n ± 0%  -15.58% (p=0.000 n=50)
    memmove/50         3.557n ± 0%   3.003n ± 0%  -15.58% (p=0.000 n=50)
    memmove/51         3.556n ± 0%   3.004n ± 0%  -15.52% (p=0.000 n=50)
    memmove/52         3.561n ± 0%   3.004n ± 0%  -15.63% (p=0.000 n=50)
    memmove/53         3.558n ± 0%   3.004n ± 0%  -15.57% (p=0.000 n=50)
    memmove/54         3.561n ± 0%   3.005n ± 0%  -15.61% (p=0.000 n=50)
    memmove/55         3.560n ± 0%   3.006n ± 0%  -15.58% (n=50)
    memmove/56         3.562n ± 0%   3.006n ± 0%  -15.60% (p=0.000 n=50)
    memmove/57         3.563n ± 0%   3.010n ± 0%  -15.52% (p=0.000 n=50)
    memmove/58         3.565n ± 0%   3.006n ± 0%  -15.66% (p=0.000 n=50)
    memmove/59         3.564n ± 0%   3.006n ± 0%  -15.66% (p=0.000 n=50)
    memmove/60         3.570n ± 0%   3.008n ± 0%  -15.75% (p=0.000 n=50)
    memmove/61         3.566n ± 0%   3.008n ± 0%  -15.67% (p=0.000 n=50)
    memmove/62         3.567n ± 0%   3.008n ± 0%  -15.68% (p=0.000 n=50)
    memmove/63         3.568n ± 0%   3.008n ± 0%  -15.69% (p=0.000 n=50)
    memmove/64         4.104n ± 0%   3.008n ± 0%  -26.70% (p=0.000 n=50)
    memmove/65         4.126n ± 0%   3.662n ± 0%  -11.25% (p=0.000 n=50)
    memmove/66         4.128n ± 0%   3.662n ± 0%  -11.28% (n=50)
    memmove/67         4.129n ± 0%   3.662n ± 0%  -11.31% (p=0.000 n=50)
    memmove/68         4.129n ± 0%   3.661n ± 0%  -11.32% (p=0.000 n=50)
    memmove/69         4.130n ± 0%   3.662n ± 0%  -11.35% (n=50)
    memmove/70         4.130n ± 0%   3.662n ± 0%  -11.34% (p=0.000 n=50)
    memmove/71         4.132n ± 0%   3.662n ± 0%  -11.37% (p=0.000 n=50)
    memmove/72         4.131n ± 0%   3.662n ± 0%  -11.37% (p=0.000 n=50)
    memmove/73         4.135n ± 0%   3.662n ± 0%  -11.43% (p=0.000 n=50)
    memmove/74         4.137n ± 0%   3.662n ± 0%  -11.49% (n=50)
    memmove/75         4.138n ± 0%   3.662n ± 0%  -11.51% (n=50)
    memmove/76         4.139n ± 0%   3.661n ± 0%  -11.55% (p=0.000 n=50)
    memmove/77         4.136n ± 0%   3.661n ± 0%  -11.48% (n=50)
    memmove/78         4.143n ± 0%   3.661n ± 0%  -11.62% (n=50)
    memmove/79         4.142n ± 0%   3.661n ± 0%  -11.60% (n=50)
    memmove/80         4.142n ± 0%   3.661n ± 0%  -11.61% (p=0.000 n=50)
    memmove/81         4.140n ± 0%   3.661n ± 0%  -11.56% (n=50)
    memmove/82         4.146n ± 0%   3.661n ± 0%  -11.68% (p=0.000 n=50)
    memmove/83         4.143n ± 0%   3.661n ± 0%  -11.63% (p=0.000 n=50)
    memmove/84         4.143n ± 0%   3.661n ± 0%  -11.62% (n=50)
    memmove/85         4.147n ± 0%   3.661n ± 0%  -11.72% (n=50)
    memmove/86         4.142n ± 0%   3.661n ± 0%  -11.62% (p=0.000 n=50)
    memmove/87         4.147n ± 0%   3.661n ± 0%  -11.73% (p=0.000 n=50)
    memmove/88         4.148n ± 0%   3.661n ± 0%  -11.75% (p=0.000 n=50)
    memmove/89         4.152n ± 0%   3.661n ± 0%  -11.83% (n=50)
    memmove/90         4.151n ± 0%   3.661n ± 0%  -11.82% (n=50)
    memmove/91         4.150n ± 0%   3.661n ± 0%  -11.78% (p=0.000 n=50)
    memmove/92         4.153n ± 0%   3.660n ± 0%  -11.87% (p=0.000 n=50)
    memmove/93         4.158n ± 0%   3.661n ± 0%  -11.95% (n=50)
    memmove/94         4.157n ± 0%   3.661n ± 0%  -11.95% (p=0.000 n=50)
    memmove/95         4.155n ± 0%   3.661n ± 0%  -11.90% (p=0.000 n=50)
    memmove/96         4.149n ± 0%   3.660n ± 0%  -11.80% (p=0.000 n=50)
    memmove/97         4.157n ± 0%   3.661n ± 0%  -11.94% (n=50)
    memmove/98         4.157n ± 0%   3.661n ± 0%  -11.94% (p=0.000 n=50)
    memmove/99         4.168n ± 0%   3.661n ± 0%  -12.17% (n=50)
    memmove/100        4.159n ± 0%   3.660n ± 0%  -12.00% (n=50)
    memmove/101        4.161n ± 0%   3.661n ± 0%  -12.03% (n=50)
    memmove/102        4.165n ± 0%   3.660n ± 0%  -12.12% (n=50)
    memmove/103        4.164n ± 0%   3.661n ± 0%  -12.08% (p=0.000 n=50)
    memmove/104        4.164n ± 0%   3.660n ± 0%  -12.11% (p=0.000 n=50)
    memmove/105        4.165n ± 0%   3.660n ± 0%  -12.12% (n=50)
    memmove/106        4.166n ± 0%   3.660n ± 0%  -12.15% (n=50)
    memmove/107        4.171n ± 0%   3.660n ± 0%  -12.25% (p=0.000 n=50)
    memmove/108        4.173n ± 0%   3.660n ± 0%  -12.29% (p=0.000 n=50)
    memmove/109        4.170n ± 0%   3.660n ± 0%  -12.24% (p=0.000 n=50)
    memmove/110        4.174n ± 0%   3.660n ± 0%  -12.31% (p=0.000 n=50)
    memmove/111        4.176n ± 0%   3.660n ± 0%  -12.35% (n=50)
    memmove/112        4.174n ± 0%   3.660n ± 0%  -12.33% (p=0.000 n=50)
    memmove/113        4.176n ± 0%   3.660n ± 0%  -12.35% (p=0.000 n=50)
    memmove/114        4.182n ± 0%   3.660n ± 0%  -12.48% (p=0.000 n=50)
    memmove/115        4.185n ± 0%   3.660n ± 0%  -12.55% (n=50)
    memmove/116        4.184n ± 0%   3.660n ± 0%  -12.54% (n=50)
    memmove/117        4.182n ± 0%   3.660n ± 0%  -12.49% (n=50)
    memmove/118        4.188n ± 0%   3.660n ± 0%  -12.60% (n=50)
    memmove/119        4.186n ± 0%   3.660n ± 0%  -12.56% (p=0.000 n=50)
    memmove/120        4.189n ± 0%   3.660n ± 0%  -12.62% (n=50)
    memmove/121        4.187n ± 0%   3.668n ± 0%  -12.40% (n=50)
    memmove/122        4.186n ± 0%   3.667n ± 0%  -12.39% (p=0.000 n=50)
    memmove/123        4.187n ± 0%   3.668n ± 0%  -12.41% (p=0.000 n=50)
    memmove/124        4.189n ± 0%   3.667n ± 0%  -12.46% (n=50)
    memmove/125        4.195n ± 0%   3.662n ± 1%  -12.72% (p=0.000 n=50)
    memmove/126        4.197n ± 0%   3.669n ± 0%  -12.59% (n=50)
    memmove/127        4.194n ± 0%   3.668n ± 0%  -12.53% (p=0.000 n=50)
    memmove/128        5.035n ± 0%   3.656n ± 2%  -27.38% (p=0.000 n=50)
```

From 3687bd585b8e81afb74eea9b4a85574a828427c8 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Tue, 24 Oct 2023 15:27:06 +0200
Subject: [PATCH 1/3] [libc] Add --sweep-min-size flag for benchmarks

We have --sweep-max-size; it's reasonable to have --sweep-min-size as well.
It can be used when working on the logic for larger sizes,
or to collect a profile for larger sizes only.
---
 libc/benchmarks/LibcMemoryBenchmarkMain.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
index acd7c30717597a1..bc6fd8b38cb6ddc 100644
--- a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
+++ b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
@@ -42,9 +42,15 @@ static cl::opt<std::string>
     SizeDistributionName("size-distribution-name",
                          cl::desc("The name of the distribution to use"));
 
-static cl::opt<bool>
-    SweepMode("sweep-mode",
-              cl::desc("If set, benchmark all sizes from 0 to sweep-max-size"));
+static cl::opt<bool> SweepMode(
+    "sweep-mode",
+    cl::desc(
+        "If set, benchmark all sizes from sweep-min-size to sweep-max-size"));
+
+static cl::opt<uint32_t>
+    SweepMinSize("sweep-min-size",
+                 cl::desc("The minimum size to use in sweep-mode"),
+                 cl::init(0));
 
 static cl::opt<uint32_t>
     SweepMaxSize("sweep-max-size",
@@ -185,7 +191,7 @@ struct MemfunctionBenchmarkSweep final : public MemfunctionBenchmarkBase {
     BO.InitialIterations = 100;
     auto &Measurements = Study.Measurements;
     Measurements.reserve(NumTrials * SweepMaxSize);
-    for (size_t Size = 0; Size <= SweepMaxSize; ++Size) {
+    for (size_t Size = SweepMinSize; Size <= SweepMaxSize; ++Size) {
       CurrentSweepSize = Size;
       runTrials(BO, Measurements);
     }

From 49e4f79378f9e3226865e72f3da7b0fb1336d115 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Tue, 24 Oct 2023 13:00:22 +0200
Subject: [PATCH 2/3] [libc] Optimize memmove for small sizes

Remove is_disjoint check for smaller sizes and reduce code bloat.

inline_memmove may handle some small sizes as efficiently
as inline_memcpy. For these sizes we can skip the is_disjoint check.
This both avoids additional code for the most frequent smaller sizes
and removes code bloat (we don't need the memcpy logic for small sizes).
Here we heavily rely on inlining and dead code elimination: from the first
inline_memmove we should get only handling of small sizes, and from
the second inline_memmove and inline_memcpy we should get only handling
of larger sizes.
---
 libc/src/string/memmove.cpp                   | 19 +++++++++++++++----
 .../memory_utils/aarch64/inline_memmove.h     |  5 ++++-
 .../src/string/memory_utils/generic/builtin.h |  4 ++--
 .../memory_utils/generic/byte_per_byte.h      |  2 +-
 libc/src/string/memory_utils/inline_memmove.h | 13 +++++++++++--
 .../memory_utils/riscv/inline_memmove.h       |  8 +++++---
 .../memory_utils/x86_64/inline_memmove.h      |  5 ++++-
 7 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index 7d473afc0b42ee7..a6478629d514027 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -15,10 +15,21 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(void *, memmove,
                    (void *dst, const void *src, size_t count)) {
-  if (is_disjoint(dst, src, count))
-    inline_memcpy(dst, src, count);
-  else
-    inline_memmove(dst, src, count);
+  // inline_memmove may handle some small sizes as efficiently
+  // as inline_memcpy. For these sizes we can skip the is_disjoint check.
+  // This both avoids additional code for the most frequent smaller sizes
+  // and removes code bloat (we don't need the memcpy logic for small sizes).
+  // Here we heavily rely on inlining and dead code elimination: from the first
+  // inline_memmove we should get only handling of small sizes, and from
+  // the second inline_memmove and inline_memcpy we should get only handling
+  // of larger sizes.
+  inline_memmove(dst, src, count, true);
+  if (count >= LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE) {
+    if (is_disjoint(dst, src, count))
+      inline_memcpy(dst, src, count);
+    else
+      inline_memmove(dst, src, count);
+  }
   return dst;
 }
 
diff --git a/libc/src/string/memory_utils/aarch64/inline_memmove.h b/libc/src/string/memory_utils/aarch64/inline_memmove.h
index ca28655c916820c..5e5c23e34be62d1 100644
--- a/libc/src/string/memory_utils/aarch64/inline_memmove.h
+++ b/libc/src/string/memory_utils/aarch64/inline_memmove.h
@@ -18,7 +18,8 @@
 
 namespace LIBC_NAMESPACE {
 
-LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count) {
+LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count,
+                                        bool fast_only) {
   static_assert(aarch64::kNeon, "aarch64 supports vector types");
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
@@ -39,6 +40,8 @@ LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count) {
     return generic::Memmove<uint256_t>::head_tail(dst, src, count);
   if (count <= 128)
     return generic::Memmove<uint512_t>::head_tail(dst, src, count);
+  if (fast_only)
+    return;
   if (dst < src) {
     generic::Memmove<uint256_t>::align_forward<Arg::Src>(dst, src, count);
     return generic::Memmove<uint512_t>::loop_and_tail_forward(dst, src, count);
diff --git a/libc/src/string/memory_utils/generic/builtin.h b/libc/src/string/memory_utils/generic/builtin.h
index 5239329f653b341..1dabc856053d191 100644
--- a/libc/src/string/memory_utils/generic/builtin.h
+++ b/libc/src/string/memory_utils/generic/builtin.h
@@ -26,8 +26,8 @@ inline_memcpy_builtin(Ptr dst, CPtr src, size_t count, size_t offset = 0) {
   __builtin_memcpy(dst + offset, src + offset, count);
 }
 
-[[maybe_unused]] LIBC_INLINE void inline_memmove_builtin(Ptr dst, CPtr src,
-                                                         size_t count) {
+[[maybe_unused]] LIBC_INLINE void
+inline_memmove_builtin(Ptr dst, CPtr src, size_t count, bool fast_only) {
   __builtin_memmove(dst, src, count);
 }
 
diff --git a/libc/src/string/memory_utils/generic/byte_per_byte.h b/libc/src/string/memory_utils/generic/byte_per_byte.h
index a666c5da3136041..89497382aede338 100644
--- a/libc/src/string/memory_utils/generic/byte_per_byte.h
+++ b/libc/src/string/memory_utils/generic/byte_per_byte.h
@@ -29,7 +29,7 @@ inline_memcpy_byte_per_byte(Ptr dst, CPtr src, size_t count,
 }
 
 [[maybe_unused]] LIBC_INLINE void
-inline_memmove_byte_per_byte(Ptr dst, CPtr src, size_t count) {
+inline_memmove_byte_per_byte(Ptr dst, CPtr src, size_t count, bool fast_only) {
   if (count == 0 || dst == src)
     return;
   if (dst < src) {
diff --git a/libc/src/string/memory_utils/inline_memmove.h b/libc/src/string/memory_utils/inline_memmove.h
index f72ea24ab538d69..0440bbe94d542e9 100644
--- a/libc/src/string/memory_utils/inline_memmove.h
+++ b/libc/src/string/memory_utils/inline_memmove.h
@@ -14,27 +14,36 @@
 #if defined(LIBC_TARGET_ARCH_IS_X86)
 #include "src/string/memory_utils/x86_64/inline_memmove.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_x86
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 129
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
 #include "src/string/memory_utils/aarch64/inline_memmove.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_aarch64
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 129
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "src/string/memory_utils/riscv/inline_memmove.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_riscv
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
 #elif defined(LIBC_TARGET_ARCH_IS_ARM)
 #include "src/string/memory_utils/generic/byte_per_byte.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_byte_per_byte
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
 #elif defined(LIBC_TARGET_ARCH_IS_GPU)
 #include "src/string/memory_utils/generic/builtin.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_builtin
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
 #else
 #error "Unsupported architecture"
 #endif
 
 namespace LIBC_NAMESPACE {
 
-LIBC_INLINE void inline_memmove(void *dst, const void *src, size_t count) {
+LIBC_INLINE void inline_memmove(void *dst, const void *src, size_t count,
+                                bool fast_only = false) {
+  if (LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE == 0 && fast_only)
+    return;
   LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE(reinterpret_cast<Ptr>(dst),
-                                       reinterpret_cast<CPtr>(src), count);
+                                       reinterpret_cast<CPtr>(src), count,
+                                       fast_only);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/riscv/inline_memmove.h b/libc/src/string/memory_utils/riscv/inline_memmove.h
index 1c26917a96d9d18..5e34b2817729972 100644
--- a/libc/src/string/memory_utils/riscv/inline_memmove.h
+++ b/libc/src/string/memory_utils/riscv/inline_memmove.h
@@ -17,9 +17,11 @@
 
 namespace LIBC_NAMESPACE {
 
-[[maybe_unused]] LIBC_INLINE void
-inline_memmove_riscv(Ptr __restrict dst, CPtr __restrict src, size_t count) {
-  return inline_memmove_byte_per_byte(dst, src, count);
+[[maybe_unused]] LIBC_INLINE void inline_memmove_riscv(Ptr __restrict dst,
+                                                       CPtr __restrict src,
+                                                       size_t count,
+                                                       bool fast_only) {
+  return inline_memmove_byte_per_byte(dst, src, count, fast_only);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/x86_64/inline_memmove.h b/libc/src/string/memory_utils/x86_64/inline_memmove.h
index 95ad07f75219581..229e0dbc4d17d49 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memmove.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memmove.h
@@ -18,7 +18,8 @@
 
 namespace LIBC_NAMESPACE {
 
-LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count) {
+LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count,
+                                    bool fast_only) {
 #if defined(__AVX512F__)
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
@@ -52,6 +53,8 @@ LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count) {
     return generic::Memmove<uint256_t>::head_tail(dst, src, count);
   if (count <= 128)
     return generic::Memmove<uint512_t>::head_tail(dst, src, count);
+  if (fast_only)
+    return;
   if (dst < src) {
     generic::Memmove<uint256_t>::align_forward<Arg::Src>(dst, src, count);
     return generic::Memmove<uint512_t>::loop_and_tail_forward(dst, src, count);

From 9914fbe2efb5a9cc5d024d220a72eb3301389c48 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Tue, 24 Oct 2023 13:23:34 +0200
Subject: [PATCH 3/3] [libc] Optimize memmove size thresholds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use the memcpy thresholds for memmove.
Memcpy thresholds were more carefully tuned.
This becomes more important now that we always use
inline_memmove for all small sizes.

Also fix boundary conditions for sizes = 16/32/64.
See the added comment for explanations.

For this and the previous commit combined:

                 │  baseline   │          small-size-check           │
                 │   sec/op    │   sec/op     vs base                │
memmove/Google_A   3.208n ± 0%   2.909n ± 0%   -9.31% (n=100)
memmove/0          2.982n ± 0%   2.168n ± 0%  -27.27% (n=50)
memmove/1          3.253n ± 0%   2.169n ± 0%  -33.34% (n=50)
memmove/2          3.255n ± 0%   2.168n ± 6%  -33.40% (n=50)
memmove/3          3.259n ± 2%   2.175n ± 0%  -33.26% (n=50)
memmove/4          3.259n ± 0%   2.168n ± 0%  -33.45% (p=0.000 n=50)
memmove/5          2.488n ± 0%   1.926n ± 0%  -22.57% (n=50)
memmove/6          2.490n ± 0%   1.928n ± 0%  -22.58% (p=0.000 n=50)
memmove/7          2.492n ± 0%   1.928n ± 0%  -22.63% (n=50)
memmove/8          2.737n ± 0%   2.711n ± 0%   -0.97% (p=0.000 n=50)
memmove/9          2.736n ± 0%   2.711n ± 0%   -0.94% (p=0.000 n=50)
memmove/10         2.739n ± 0%   2.711n ± 0%   -1.04% (p=0.000 n=50)
memmove/11         2.740n ± 0%   2.711n ± 0%   -1.07% (p=0.000 n=50)
memmove/12         2.740n ± 0%   2.711n ± 0%   -1.09% (p=0.000 n=50)
memmove/13         2.744n ± 0%   2.711n ± 0%   -1.22% (p=0.000 n=50)
memmove/14         2.742n ± 0%   2.711n ± 0%   -1.14% (p=0.000 n=50)
memmove/15         2.742n ± 0%   2.711n ± 0%   -1.15% (p=0.000 n=50)
memmove/16         2.997n ± 0%   2.982n ± 0%   -0.52% (p=0.000 n=50)
memmove/17         2.998n ± 0%   2.982n ± 0%   -0.55% (p=0.000 n=50)
memmove/18         2.998n ± 0%   2.982n ± 0%   -0.54% (p=0.000 n=50)
memmove/19         2.999n ± 0%   2.981n ± 0%   -0.59% (p=0.000 n=50)
memmove/20         2.998n ± 0%   2.982n ± 0%   -0.55% (p=0.000 n=50)
memmove/21         3.000n ± 0%   2.982n ± 0%   -0.61% (p=0.000 n=50)
memmove/22         3.002n ± 0%   2.982n ± 0%   -0.68% (p=0.000 n=50)
memmove/23         3.002n ± 0%   2.981n ± 0%   -0.67% (p=0.000 n=50)
memmove/24         3.002n ± 0%   2.981n ± 0%   -0.70% (p=0.000 n=50)
memmove/25         3.002n ± 0%   2.982n ± 0%   -0.68% (p=0.000 n=50)
memmove/26         3.004n ± 0%   2.982n ± 0%   -0.74% (n=50)
memmove/27         3.005n ± 0%   2.982n ± 0%   -0.79% (p=0.000 n=50)
memmove/28         3.005n ± 0%   2.982n ± 0%   -0.77% (p=0.000 n=50)
memmove/29         3.009n ± 0%   2.982n ± 0%   -0.92% (n=50)
memmove/30         3.008n ± 0%   2.982n ± 0%   -0.89% (n=50)
memmove/31         3.007n ± 0%   2.981n ± 0%   -0.86% (n=50)
memmove/32         3.540n ± 0%   2.999n ± 0%  -15.30% (p=0.000 n=50)
memmove/33         3.544n ± 0%   2.998n ± 0%  -15.41% (p=0.000 n=50)
memmove/34         3.546n ± 0%   2.999n ± 0%  -15.42% (n=50)
memmove/35         3.545n ± 0%   2.999n ± 0%  -15.41% (p=0.000 n=50)
memmove/36         3.548n ± 0%   2.998n ± 0%  -15.51% (p=0.000 n=50)
memmove/37         3.546n ± 0%   2.999n ± 0%  -15.44% (n=50)
memmove/38         3.549n ± 0%   2.999n ± 0%  -15.50% (p=0.000 n=50)
memmove/39         3.549n ± 0%   2.999n ± 0%  -15.48% (p=0.000 n=50)
memmove/40         3.549n ± 0%   3.000n ± 0%  -15.48% (p=0.000 n=50)
memmove/41         3.550n ± 0%   3.000n ± 0%  -15.49% (p=0.000 n=50)
memmove/42         3.549n ± 0%   3.001n ± 0%  -15.45% (n=50)
memmove/43         3.552n ± 0%   3.000n ± 0%  -15.54% (p=0.000 n=50)
memmove/44         3.552n ± 0%   3.002n ± 0%  -15.49% (n=50)
memmove/45         3.552n ± 0%   3.001n ± 0%  -15.49% (p=0.000 n=50)
memmove/46         3.554n ± 0%   3.002n ± 0%  -15.52% (p=0.000 n=50)
memmove/47         3.556n ± 0%   3.003n ± 0%  -15.56% (p=0.000 n=50)
memmove/48         3.555n ± 0%   3.002n ± 0%  -15.56% (p=0.000 n=50)
memmove/49         3.557n ± 0%   3.002n ± 0%  -15.58% (p=0.000 n=50)
memmove/50         3.557n ± 0%   3.003n ± 0%  -15.58% (p=0.000 n=50)
memmove/51         3.556n ± 0%   3.004n ± 0%  -15.52% (p=0.000 n=50)
memmove/52         3.561n ± 0%   3.004n ± 0%  -15.63% (p=0.000 n=50)
memmove/53         3.558n ± 0%   3.004n ± 0%  -15.57% (p=0.000 n=50)
memmove/54         3.561n ± 0%   3.005n ± 0%  -15.61% (p=0.000 n=50)
memmove/55         3.560n ± 0%   3.006n ± 0%  -15.58% (n=50)
memmove/56         3.562n ± 0%   3.006n ± 0%  -15.60% (p=0.000 n=50)
memmove/57         3.563n ± 0%   3.010n ± 0%  -15.52% (p=0.000 n=50)
memmove/58         3.565n ± 0%   3.006n ± 0%  -15.66% (p=0.000 n=50)
memmove/59         3.564n ± 0%   3.006n ± 0%  -15.66% (p=0.000 n=50)
memmove/60         3.570n ± 0%   3.008n ± 0%  -15.75% (p=0.000 n=50)
memmove/61         3.566n ± 0%   3.008n ± 0%  -15.67% (p=0.000 n=50)
memmove/62         3.567n ± 0%   3.008n ± 0%  -15.68% (p=0.000 n=50)
memmove/63         3.568n ± 0%   3.008n ± 0%  -15.69% (p=0.000 n=50)
memmove/64         4.104n ± 0%   3.008n ± 0%  -26.70% (p=0.000 n=50)
memmove/65         4.126n ± 0%   3.662n ± 0%  -11.25% (p=0.000 n=50)
memmove/66         4.128n ± 0%   3.662n ± 0%  -11.28% (n=50)
memmove/67         4.129n ± 0%   3.662n ± 0%  -11.31% (p=0.000 n=50)
memmove/68         4.129n ± 0%   3.661n ± 0%  -11.32% (p=0.000 n=50)
memmove/69         4.130n ± 0%   3.662n ± 0%  -11.35% (n=50)
memmove/70         4.130n ± 0%   3.662n ± 0%  -11.34% (p=0.000 n=50)
memmove/71         4.132n ± 0%   3.662n ± 0%  -11.37% (p=0.000 n=50)
memmove/72         4.131n ± 0%   3.662n ± 0%  -11.37% (p=0.000 n=50)
memmove/73         4.135n ± 0%   3.662n ± 0%  -11.43% (p=0.000 n=50)
memmove/74         4.137n ± 0%   3.662n ± 0%  -11.49% (n=50)
memmove/75         4.138n ± 0%   3.662n ± 0%  -11.51% (n=50)
memmove/76         4.139n ± 0%   3.661n ± 0%  -11.55% (p=0.000 n=50)
memmove/77         4.136n ± 0%   3.661n ± 0%  -11.48% (n=50)
memmove/78         4.143n ± 0%   3.661n ± 0%  -11.62% (n=50)
memmove/79         4.142n ± 0%   3.661n ± 0%  -11.60% (n=50)
memmove/80         4.142n ± 0%   3.661n ± 0%  -11.61% (p=0.000 n=50)
memmove/81         4.140n ± 0%   3.661n ± 0%  -11.56% (n=50)
memmove/82         4.146n ± 0%   3.661n ± 0%  -11.68% (p=0.000 n=50)
memmove/83         4.143n ± 0%   3.661n ± 0%  -11.63% (p=0.000 n=50)
memmove/84         4.143n ± 0%   3.661n ± 0%  -11.62% (n=50)
memmove/85         4.147n ± 0%   3.661n ± 0%  -11.72% (n=50)
memmove/86         4.142n ± 0%   3.661n ± 0%  -11.62% (p=0.000 n=50)
memmove/87         4.147n ± 0%   3.661n ± 0%  -11.73% (p=0.000 n=50)
memmove/88         4.148n ± 0%   3.661n ± 0%  -11.75% (p=0.000 n=50)
memmove/89         4.152n ± 0%   3.661n ± 0%  -11.83% (n=50)
memmove/90         4.151n ± 0%   3.661n ± 0%  -11.82% (n=50)
memmove/91         4.150n ± 0%   3.661n ± 0%  -11.78% (p=0.000 n=50)
memmove/92         4.153n ± 0%   3.660n ± 0%  -11.87% (p=0.000 n=50)
memmove/93         4.158n ± 0%   3.661n ± 0%  -11.95% (n=50)
memmove/94         4.157n ± 0%   3.661n ± 0%  -11.95% (p=0.000 n=50)
memmove/95         4.155n ± 0%   3.661n ± 0%  -11.90% (p=0.000 n=50)
memmove/96         4.149n ± 0%   3.660n ± 0%  -11.80% (p=0.000 n=50)
memmove/97         4.157n ± 0%   3.661n ± 0%  -11.94% (n=50)
memmove/98         4.157n ± 0%   3.661n ± 0%  -11.94% (p=0.000 n=50)
memmove/99         4.168n ± 0%   3.661n ± 0%  -12.17% (n=50)
memmove/100        4.159n ± 0%   3.660n ± 0%  -12.00% (n=50)
memmove/101        4.161n ± 0%   3.661n ± 0%  -12.03% (n=50)
memmove/102        4.165n ± 0%   3.660n ± 0%  -12.12% (n=50)
memmove/103        4.164n ± 0%   3.661n ± 0%  -12.08% (p=0.000 n=50)
memmove/104        4.164n ± 0%   3.660n ± 0%  -12.11% (p=0.000 n=50)
memmove/105        4.165n ± 0%   3.660n ± 0%  -12.12% (n=50)
memmove/106        4.166n ± 0%   3.660n ± 0%  -12.15% (n=50)
memmove/107        4.171n ± 0%   3.660n ± 0%  -12.25% (p=0.000 n=50)
memmove/108        4.173n ± 0%   3.660n ± 0%  -12.29% (p=0.000 n=50)
memmove/109        4.170n ± 0%   3.660n ± 0%  -12.24% (p=0.000 n=50)
memmove/110        4.174n ± 0%   3.660n ± 0%  -12.31% (p=0.000 n=50)
memmove/111        4.176n ± 0%   3.660n ± 0%  -12.35% (n=50)
memmove/112        4.174n ± 0%   3.660n ± 0%  -12.33% (p=0.000 n=50)
memmove/113        4.176n ± 0%   3.660n ± 0%  -12.35% (p=0.000 n=50)
memmove/114        4.182n ± 0%   3.660n ± 0%  -12.48% (p=0.000 n=50)
memmove/115        4.185n ± 0%   3.660n ± 0%  -12.55% (n=50)
memmove/116        4.184n ± 0%   3.660n ± 0%  -12.54% (n=50)
memmove/117        4.182n ± 0%   3.660n ± 0%  -12.49% (n=50)
memmove/118        4.188n ± 0%   3.660n ± 0%  -12.60% (n=50)
memmove/119        4.186n ± 0%   3.660n ± 0%  -12.56% (p=0.000 n=50)
memmove/120        4.189n ± 0%   3.660n ± 0%  -12.62% (n=50)
memmove/121        4.187n ± 0%   3.668n ± 0%  -12.40% (n=50)
memmove/122        4.186n ± 0%   3.667n ± 0%  -12.39% (p=0.000 n=50)
memmove/123        4.187n ± 0%   3.668n ± 0%  -12.41% (p=0.000 n=50)
memmove/124        4.189n ± 0%   3.667n ± 0%  -12.46% (n=50)
memmove/125        4.195n ± 0%   3.662n ± 1%  -12.72% (p=0.000 n=50)
memmove/126        4.197n ± 0%   3.669n ± 0%  -12.59% (n=50)
memmove/127        4.194n ± 0%   3.668n ± 0%  -12.53% (p=0.000 n=50)
memmove/128        5.035n ± 0%   3.656n ± 2%  -27.38% (p=0.000 n=50)
---
 .../memory_utils/x86_64/inline_memmove.h      | 30 +++++++++++++++----
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/libc/src/string/memory_utils/x86_64/inline_memmove.h b/libc/src/string/memory_utils/x86_64/inline_memmove.h
index 229e0dbc4d17d49..ee397c63471f1ad 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memmove.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memmove.h
@@ -21,35 +21,53 @@ namespace LIBC_NAMESPACE {
 LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count,
                                     bool fast_only) {
 #if defined(__AVX512F__)
+  constexpr size_t vector_size = 64;
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
   using uint512_t = generic_v512;
 #elif defined(__AVX__)
+  constexpr size_t vector_size = 32;
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
   using uint512_t = cpp::array<generic_v256, 2>;
 #elif defined(__SSE2__)
+  constexpr size_t vector_size = 16;
   using uint128_t = generic_v128;
   using uint256_t = cpp::array<generic_v128, 2>;
   using uint512_t = cpp::array<generic_v128, 4>;
 #else
+  constexpr size_t vector_size = 8;
   using uint128_t = cpp::array<uint64_t, 2>;
   using uint256_t = cpp::array<uint64_t, 4>;
   using uint512_t = cpp::array<uint64_t, 8>;
 #endif
+  (void)vector_size;
   if (count == 0)
     return;
   if (count == 1)
     return generic::Memmove<uint8_t>::block(dst, src);
-  if (count <= 4)
-    return generic::Memmove<uint16_t>::head_tail(dst, src, count);
-  if (count <= 8)
+  if (count == 2)
+    return generic::Memmove<uint16_t>::block(dst, src);
+  if (count == 3)
+    return generic::Memmove<cpp::array<uint8_t, 3>>::block(dst, src);
+  if (count == 4)
+    return generic::Memmove<uint32_t>::block(dst, src);
+  if (count < 8)
     return generic::Memmove<uint32_t>::head_tail(dst, src, count);
-  if (count <= 16)
+  // If count is equal to a power of 2, we can handle it as head-tail
+  // of both smaller size and larger size (head-tail are either
+  // non-overlapping for smaller size, or completely collapsed
+  // for larger size). It seems to be more profitable to do the copy
+  // with the larger size, if it's natively supported (e.g. doing
+  // 2 collapsed 32-byte moves for count=32 if AVX is supported).
+  // But it's not profitable to use larger size if it's not natively
+  // supported: we will both use more instructions and handle fewer
+  // sizes in earlier branches.
+  if (count < 16 + (vector_size <= sizeof(uint64_t)))
     return generic::Memmove<uint64_t>::head_tail(dst, src, count);
-  if (count <= 32)
+  if (count < 32 + (vector_size <= sizeof(uint128_t)))
     return generic::Memmove<uint128_t>::head_tail(dst, src, count);
-  if (count <= 64)
+  if (count < 64 + (vector_size <= sizeof(uint256_t)))
     return generic::Memmove<uint256_t>::head_tail(dst, src, count);
   if (count <= 128)
     return generic::Memmove<uint512_t>::head_tail(dst, src, count);



More information about the libc-commits mailing list