[libc-commits] [libc] memmove optimizations (PR #70043)

via libc-commits libc-commits at lists.llvm.org
Tue Oct 24 06:57:01 PDT 2023


llvmbot wrote:



@llvm/pr-subscribers-libc

Author: Dmitry Vyukov (dvyukov)

Changes:

See the individual commits for descriptions.
```
                     │  baseline   │          small-size-check           │
                     │   sec/op    │   sec/op     vs base                │
    memmove/Google_A   3.208n ± 0%   2.909n ± 0%   -9.31% (n=100)
    memmove/0          2.982n ± 0%   2.168n ± 0%  -27.27% (n=50)
    memmove/1          3.253n ± 0%   2.169n ± 0%  -33.34% (n=50)
    memmove/2          3.255n ± 0%   2.168n ± 6%  -33.40% (n=50)
    memmove/3          3.259n ± 2%   2.175n ± 0%  -33.26% (n=50)
    memmove/4          3.259n ± 0%   2.168n ± 0%  -33.45% (p=0.000 n=50)
    memmove/5          2.488n ± 0%   1.926n ± 0%  -22.57% (n=50)
    memmove/6          2.490n ± 0%   1.928n ± 0%  -22.58% (p=0.000 n=50)
    memmove/7          2.492n ± 0%   1.928n ± 0%  -22.63% (n=50)
    memmove/8          2.737n ± 0%   2.711n ± 0%   -0.97% (p=0.000 n=50)
    memmove/9          2.736n ± 0%   2.711n ± 0%   -0.94% (p=0.000 n=50)
    memmove/10         2.739n ± 0%   2.711n ± 0%   -1.04% (p=0.000 n=50)
    memmove/11         2.740n ± 0%   2.711n ± 0%   -1.07% (p=0.000 n=50)
    memmove/12         2.740n ± 0%   2.711n ± 0%   -1.09% (p=0.000 n=50)
    memmove/13         2.744n ± 0%   2.711n ± 0%   -1.22% (p=0.000 n=50)
    memmove/14         2.742n ± 0%   2.711n ± 0%   -1.14% (p=0.000 n=50)
    memmove/15         2.742n ± 0%   2.711n ± 0%   -1.15% (p=0.000 n=50)
    memmove/16         2.997n ± 0%   2.982n ± 0%   -0.52% (p=0.000 n=50)
    memmove/17         2.998n ± 0%   2.982n ± 0%   -0.55% (p=0.000 n=50)
    memmove/18         2.998n ± 0%   2.982n ± 0%   -0.54% (p=0.000 n=50)
    memmove/19         2.999n ± 0%   2.981n ± 0%   -0.59% (p=0.000 n=50)
    memmove/20         2.998n ± 0%   2.982n ± 0%   -0.55% (p=0.000 n=50)
    memmove/21         3.000n ± 0%   2.982n ± 0%   -0.61% (p=0.000 n=50)
    memmove/22         3.002n ± 0%   2.982n ± 0%   -0.68% (p=0.000 n=50)
    memmove/23         3.002n ± 0%   2.981n ± 0%   -0.67% (p=0.000 n=50)
    memmove/24         3.002n ± 0%   2.981n ± 0%   -0.70% (p=0.000 n=50)
    memmove/25         3.002n ± 0%   2.982n ± 0%   -0.68% (p=0.000 n=50)
    memmove/26         3.004n ± 0%   2.982n ± 0%   -0.74% (n=50)
    memmove/27         3.005n ± 0%   2.982n ± 0%   -0.79% (p=0.000 n=50)
    memmove/28         3.005n ± 0%   2.982n ± 0%   -0.77% (p=0.000 n=50)
    memmove/29         3.009n ± 0%   2.982n ± 0%   -0.92% (n=50)
    memmove/30         3.008n ± 0%   2.982n ± 0%   -0.89% (n=50)
    memmove/31         3.007n ± 0%   2.981n ± 0%   -0.86% (n=50)
    memmove/32         3.540n ± 0%   2.999n ± 0%  -15.30% (p=0.000 n=50)
    memmove/33         3.544n ± 0%   2.998n ± 0%  -15.41% (p=0.000 n=50)
    memmove/34         3.546n ± 0%   2.999n ± 0%  -15.42% (n=50)
    memmove/35         3.545n ± 0%   2.999n ± 0%  -15.41% (p=0.000 n=50)
    memmove/36         3.548n ± 0%   2.998n ± 0%  -15.51% (p=0.000 n=50)
    memmove/37         3.546n ± 0%   2.999n ± 0%  -15.44% (n=50)
    memmove/38         3.549n ± 0%   2.999n ± 0%  -15.50% (p=0.000 n=50)
    memmove/39         3.549n ± 0%   2.999n ± 0%  -15.48% (p=0.000 n=50)
    memmove/40         3.549n ± 0%   3.000n ± 0%  -15.48% (p=0.000 n=50)
    memmove/41         3.550n ± 0%   3.000n ± 0%  -15.49% (p=0.000 n=50)
    memmove/42         3.549n ± 0%   3.001n ± 0%  -15.45% (n=50)
    memmove/43         3.552n ± 0%   3.000n ± 0%  -15.54% (p=0.000 n=50)
    memmove/44         3.552n ± 0%   3.002n ± 0%  -15.49% (n=50)
    memmove/45         3.552n ± 0%   3.001n ± 0%  -15.49% (p=0.000 n=50)
    memmove/46         3.554n ± 0%   3.002n ± 0%  -15.52% (p=0.000 n=50)
    memmove/47         3.556n ± 0%   3.003n ± 0%  -15.56% (p=0.000 n=50)
    memmove/48         3.555n ± 0%   3.002n ± 0%  -15.56% (p=0.000 n=50)
    memmove/49         3.557n ± 0%   3.002n ± 0%  -15.58% (p=0.000 n=50)
    memmove/50         3.557n ± 0%   3.003n ± 0%  -15.58% (p=0.000 n=50)
    memmove/51         3.556n ± 0%   3.004n ± 0%  -15.52% (p=0.000 n=50)
    memmove/52         3.561n ± 0%   3.004n ± 0%  -15.63% (p=0.000 n=50)
    memmove/53         3.558n ± 0%   3.004n ± 0%  -15.57% (p=0.000 n=50)
    memmove/54         3.561n ± 0%   3.005n ± 0%  -15.61% (p=0.000 n=50)
    memmove/55         3.560n ± 0%   3.006n ± 0%  -15.58% (n=50)
    memmove/56         3.562n ± 0%   3.006n ± 0%  -15.60% (p=0.000 n=50)
    memmove/57         3.563n ± 0%   3.010n ± 0%  -15.52% (p=0.000 n=50)
    memmove/58         3.565n ± 0%   3.006n ± 0%  -15.66% (p=0.000 n=50)
    memmove/59         3.564n ± 0%   3.006n ± 0%  -15.66% (p=0.000 n=50)
    memmove/60         3.570n ± 0%   3.008n ± 0%  -15.75% (p=0.000 n=50)
    memmove/61         3.566n ± 0%   3.008n ± 0%  -15.67% (p=0.000 n=50)
    memmove/62         3.567n ± 0%   3.008n ± 0%  -15.68% (p=0.000 n=50)
    memmove/63         3.568n ± 0%   3.008n ± 0%  -15.69% (p=0.000 n=50)
    memmove/64         4.104n ± 0%   3.008n ± 0%  -26.70% (p=0.000 n=50)
    memmove/65         4.126n ± 0%   3.662n ± 0%  -11.25% (p=0.000 n=50)
    memmove/66         4.128n ± 0%   3.662n ± 0%  -11.28% (n=50)
    memmove/67         4.129n ± 0%   3.662n ± 0%  -11.31% (p=0.000 n=50)
    memmove/68         4.129n ± 0%   3.661n ± 0%  -11.32% (p=0.000 n=50)
    memmove/69         4.130n ± 0%   3.662n ± 0%  -11.35% (n=50)
    memmove/70         4.130n ± 0%   3.662n ± 0%  -11.34% (p=0.000 n=50)
    memmove/71         4.132n ± 0%   3.662n ± 0%  -11.37% (p=0.000 n=50)
    memmove/72         4.131n ± 0%   3.662n ± 0%  -11.37% (p=0.000 n=50)
    memmove/73         4.135n ± 0%   3.662n ± 0%  -11.43% (p=0.000 n=50)
    memmove/74         4.137n ± 0%   3.662n ± 0%  -11.49% (n=50)
    memmove/75         4.138n ± 0%   3.662n ± 0%  -11.51% (n=50)
    memmove/76         4.139n ± 0%   3.661n ± 0%  -11.55% (p=0.000 n=50)
    memmove/77         4.136n ± 0%   3.661n ± 0%  -11.48% (n=50)
    memmove/78         4.143n ± 0%   3.661n ± 0%  -11.62% (n=50)
    memmove/79         4.142n ± 0%   3.661n ± 0%  -11.60% (n=50)
    memmove/80         4.142n ± 0%   3.661n ± 0%  -11.61% (p=0.000 n=50)
    memmove/81         4.140n ± 0%   3.661n ± 0%  -11.56% (n=50)
    memmove/82         4.146n ± 0%   3.661n ± 0%  -11.68% (p=0.000 n=50)
    memmove/83         4.143n ± 0%   3.661n ± 0%  -11.63% (p=0.000 n=50)
    memmove/84         4.143n ± 0%   3.661n ± 0%  -11.62% (n=50)
    memmove/85         4.147n ± 0%   3.661n ± 0%  -11.72% (n=50)
    memmove/86         4.142n ± 0%   3.661n ± 0%  -11.62% (p=0.000 n=50)
    memmove/87         4.147n ± 0%   3.661n ± 0%  -11.73% (p=0.000 n=50)
    memmove/88         4.148n ± 0%   3.661n ± 0%  -11.75% (p=0.000 n=50)
    memmove/89         4.152n ± 0%   3.661n ± 0%  -11.83% (n=50)
    memmove/90         4.151n ± 0%   3.661n ± 0%  -11.82% (n=50)
    memmove/91         4.150n ± 0%   3.661n ± 0%  -11.78% (p=0.000 n=50)
    memmove/92         4.153n ± 0%   3.660n ± 0%  -11.87% (p=0.000 n=50)
    memmove/93         4.158n ± 0%   3.661n ± 0%  -11.95% (n=50)
    memmove/94         4.157n ± 0%   3.661n ± 0%  -11.95% (p=0.000 n=50)
    memmove/95         4.155n ± 0%   3.661n ± 0%  -11.90% (p=0.000 n=50)
    memmove/96         4.149n ± 0%   3.660n ± 0%  -11.80% (p=0.000 n=50)
    memmove/97         4.157n ± 0%   3.661n ± 0%  -11.94% (n=50)
    memmove/98         4.157n ± 0%   3.661n ± 0%  -11.94% (p=0.000 n=50)
    memmove/99         4.168n ± 0%   3.661n ± 0%  -12.17% (n=50)
    memmove/100        4.159n ± 0%   3.660n ± 0%  -12.00% (n=50)
    memmove/101        4.161n ± 0%   3.661n ± 0%  -12.03% (n=50)
    memmove/102        4.165n ± 0%   3.660n ± 0%  -12.12% (n=50)
    memmove/103        4.164n ± 0%   3.661n ± 0%  -12.08% (p=0.000 n=50)
    memmove/104        4.164n ± 0%   3.660n ± 0%  -12.11% (p=0.000 n=50)
    memmove/105        4.165n ± 0%   3.660n ± 0%  -12.12% (n=50)
    memmove/106        4.166n ± 0%   3.660n ± 0%  -12.15% (n=50)
    memmove/107        4.171n ± 0%   3.660n ± 0%  -12.25% (p=0.000 n=50)
    memmove/108        4.173n ± 0%   3.660n ± 0%  -12.29% (p=0.000 n=50)
    memmove/109        4.170n ± 0%   3.660n ± 0%  -12.24% (p=0.000 n=50)
    memmove/110        4.174n ± 0%   3.660n ± 0%  -12.31% (p=0.000 n=50)
    memmove/111        4.176n ± 0%   3.660n ± 0%  -12.35% (n=50)
    memmove/112        4.174n ± 0%   3.660n ± 0%  -12.33% (p=0.000 n=50)
    memmove/113        4.176n ± 0%   3.660n ± 0%  -12.35% (p=0.000 n=50)
    memmove/114        4.182n ± 0%   3.660n ± 0%  -12.48% (p=0.000 n=50)
    memmove/115        4.185n ± 0%   3.660n ± 0%  -12.55% (n=50)
    memmove/116        4.184n ± 0%   3.660n ± 0%  -12.54% (n=50)
    memmove/117        4.182n ± 0%   3.660n ± 0%  -12.49% (n=50)
    memmove/118        4.188n ± 0%   3.660n ± 0%  -12.60% (n=50)
    memmove/119        4.186n ± 0%   3.660n ± 0%  -12.56% (p=0.000 n=50)
    memmove/120        4.189n ± 0%   3.660n ± 0%  -12.62% (n=50)
    memmove/121        4.187n ± 0%   3.668n ± 0%  -12.40% (n=50)
    memmove/122        4.186n ± 0%   3.667n ± 0%  -12.39% (p=0.000 n=50)
    memmove/123        4.187n ± 0%   3.668n ± 0%  -12.41% (p=0.000 n=50)
    memmove/124        4.189n ± 0%   3.667n ± 0%  -12.46% (n=50)
    memmove/125        4.195n ± 0%   3.662n ± 1%  -12.72% (p=0.000 n=50)
    memmove/126        4.197n ± 0%   3.669n ± 0%  -12.59% (n=50)
    memmove/127        4.194n ± 0%   3.668n ± 0%  -12.53% (p=0.000 n=50)
    memmove/128        5.035n ± 0%   3.656n ± 2%  -27.38% (p=0.000 n=50)
```

---
Full diff: https://github.com/llvm/llvm-project/pull/70043.diff


8 Files Affected:

- (modified) libc/benchmarks/LibcMemoryBenchmarkMain.cpp (+10-4) 
- (modified) libc/src/string/memmove.cpp (+15-4) 
- (modified) libc/src/string/memory_utils/aarch64/inline_memmove.h (+4-1) 
- (modified) libc/src/string/memory_utils/generic/builtin.h (+2-2) 
- (modified) libc/src/string/memory_utils/generic/byte_per_byte.h (+1-1) 
- (modified) libc/src/string/memory_utils/inline_memmove.h (+11-2) 
- (modified) libc/src/string/memory_utils/riscv/inline_memmove.h (+5-3) 
- (modified) libc/src/string/memory_utils/x86_64/inline_memmove.h (+28-7) 


``````````diff
diff --git a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
index acd7c30717597a1..bc6fd8b38cb6ddc 100644
--- a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
+++ b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
@@ -42,9 +42,15 @@ static cl::opt<std::string>
     SizeDistributionName("size-distribution-name",
                          cl::desc("The name of the distribution to use"));
 
-static cl::opt<bool>
-    SweepMode("sweep-mode",
-              cl::desc("If set, benchmark all sizes from 0 to sweep-max-size"));
+static cl::opt<bool> SweepMode(
+    "sweep-mode",
+    cl::desc(
+        "If set, benchmark all sizes from sweep-min-size to sweep-max-size"));
+
+static cl::opt<uint32_t>
+    SweepMinSize("sweep-min-size",
+                 cl::desc("The minimum size to use in sweep-mode"),
+                 cl::init(0));
 
 static cl::opt<uint32_t>
     SweepMaxSize("sweep-max-size",
@@ -185,7 +191,7 @@ struct MemfunctionBenchmarkSweep final : public MemfunctionBenchmarkBase {
     BO.InitialIterations = 100;
     auto &Measurements = Study.Measurements;
     Measurements.reserve(NumTrials * SweepMaxSize);
-    for (size_t Size = 0; Size <= SweepMaxSize; ++Size) {
+    for (size_t Size = SweepMinSize; Size <= SweepMaxSize; ++Size) {
       CurrentSweepSize = Size;
       runTrials(BO, Measurements);
     }
diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index 7d473afc0b42ee7..a6478629d514027 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -15,10 +15,21 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(void *, memmove,
                    (void *dst, const void *src, size_t count)) {
-  if (is_disjoint(dst, src, count))
-    inline_memcpy(dst, src, count);
-  else
-    inline_memmove(dst, src, count);
+  // inline_memmove may handle some small sizes as efficiently
+  // as inline_memcpy. For these sizes we may not do is_disjoint check.
+  // This both avoids additional code for the most frequent smaller sizes
+  // and removes code bloat (we don't need the memcpy logic for small sizes).
+  // Here we heavily rely on inlining and dead code elimination: from the first
+  // inline_memmove we should get only handling of small sizes, and from
+  // the second inline_memmove and inline_memcpy we should get only handling
+  // of larger sizes.
+  inline_memmove(dst, src, count, true);
+  if (count >= LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE) {
+    if (is_disjoint(dst, src, count))
+      inline_memcpy(dst, src, count);
+    else
+      inline_memmove(dst, src, count);
+  }
   return dst;
 }
 
diff --git a/libc/src/string/memory_utils/aarch64/inline_memmove.h b/libc/src/string/memory_utils/aarch64/inline_memmove.h
index ca28655c916820c..5e5c23e34be62d1 100644
--- a/libc/src/string/memory_utils/aarch64/inline_memmove.h
+++ b/libc/src/string/memory_utils/aarch64/inline_memmove.h
@@ -18,7 +18,8 @@
 
 namespace LIBC_NAMESPACE {
 
-LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count) {
+LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count,
+                                        bool fast_only) {
   static_assert(aarch64::kNeon, "aarch64 supports vector types");
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
@@ -39,6 +40,8 @@ LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count) {
     return generic::Memmove<uint256_t>::head_tail(dst, src, count);
   if (count <= 128)
     return generic::Memmove<uint512_t>::head_tail(dst, src, count);
+  if (fast_only)
+    return;
   if (dst < src) {
     generic::Memmove<uint256_t>::align_forward<Arg::Src>(dst, src, count);
     return generic::Memmove<uint512_t>::loop_and_tail_forward(dst, src, count);
diff --git a/libc/src/string/memory_utils/generic/builtin.h b/libc/src/string/memory_utils/generic/builtin.h
index 5239329f653b341..1dabc856053d191 100644
--- a/libc/src/string/memory_utils/generic/builtin.h
+++ b/libc/src/string/memory_utils/generic/builtin.h
@@ -26,8 +26,8 @@ inline_memcpy_builtin(Ptr dst, CPtr src, size_t count, size_t offset = 0) {
   __builtin_memcpy(dst + offset, src + offset, count);
 }
 
-[[maybe_unused]] LIBC_INLINE void inline_memmove_builtin(Ptr dst, CPtr src,
-                                                         size_t count) {
+[[maybe_unused]] LIBC_INLINE void
+inline_memmove_builtin(Ptr dst, CPtr src, size_t count, bool fast_only) {
   __builtin_memmove(dst, src, count);
 }
 
diff --git a/libc/src/string/memory_utils/generic/byte_per_byte.h b/libc/src/string/memory_utils/generic/byte_per_byte.h
index a666c5da3136041..89497382aede338 100644
--- a/libc/src/string/memory_utils/generic/byte_per_byte.h
+++ b/libc/src/string/memory_utils/generic/byte_per_byte.h
@@ -29,7 +29,7 @@ inline_memcpy_byte_per_byte(Ptr dst, CPtr src, size_t count,
 }
 
 [[maybe_unused]] LIBC_INLINE void
-inline_memmove_byte_per_byte(Ptr dst, CPtr src, size_t count) {
+inline_memmove_byte_per_byte(Ptr dst, CPtr src, size_t count, bool fast_only) {
   if (count == 0 || dst == src)
     return;
   if (dst < src) {
diff --git a/libc/src/string/memory_utils/inline_memmove.h b/libc/src/string/memory_utils/inline_memmove.h
index f72ea24ab538d69..0440bbe94d542e9 100644
--- a/libc/src/string/memory_utils/inline_memmove.h
+++ b/libc/src/string/memory_utils/inline_memmove.h
@@ -14,27 +14,36 @@
 #if defined(LIBC_TARGET_ARCH_IS_X86)
 #include "src/string/memory_utils/x86_64/inline_memmove.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_x86
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 129
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
 #include "src/string/memory_utils/aarch64/inline_memmove.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_aarch64
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 129
 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
 #include "src/string/memory_utils/riscv/inline_memmove.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_riscv
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
 #elif defined(LIBC_TARGET_ARCH_IS_ARM)
 #include "src/string/memory_utils/generic/byte_per_byte.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_byte_per_byte
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
 #elif defined(LIBC_TARGET_ARCH_IS_GPU)
 #include "src/string/memory_utils/generic/builtin.h"
 #define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_builtin
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
 #else
 #error "Unsupported architecture"
 #endif
 
 namespace LIBC_NAMESPACE {
 
-LIBC_INLINE void inline_memmove(void *dst, const void *src, size_t count) {
+LIBC_INLINE void inline_memmove(void *dst, const void *src, size_t count,
+                                bool fast_only = false) {
+  if (LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE == 0 && fast_only)
+    return;
   LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE(reinterpret_cast<Ptr>(dst),
-                                       reinterpret_cast<CPtr>(src), count);
+                                       reinterpret_cast<CPtr>(src), count,
+                                       fast_only);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/riscv/inline_memmove.h b/libc/src/string/memory_utils/riscv/inline_memmove.h
index 1c26917a96d9d18..5e34b2817729972 100644
--- a/libc/src/string/memory_utils/riscv/inline_memmove.h
+++ b/libc/src/string/memory_utils/riscv/inline_memmove.h
@@ -17,9 +17,11 @@
 
 namespace LIBC_NAMESPACE {
 
-[[maybe_unused]] LIBC_INLINE void
-inline_memmove_riscv(Ptr __restrict dst, CPtr __restrict src, size_t count) {
-  return inline_memmove_byte_per_byte(dst, src, count);
+[[maybe_unused]] LIBC_INLINE void inline_memmove_riscv(Ptr __restrict dst,
+                                                       CPtr __restrict src,
+                                                       size_t count,
+                                                       bool fast_only) {
+  return inline_memmove_byte_per_byte(dst, src, count, fast_only);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/x86_64/inline_memmove.h b/libc/src/string/memory_utils/x86_64/inline_memmove.h
index 95ad07f75219581..ee397c63471f1ad 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memmove.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memmove.h
@@ -18,40 +18,61 @@
 
 namespace LIBC_NAMESPACE {
 
-LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count) {
+LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count,
+                                    bool fast_only) {
 #if defined(__AVX512F__)
+  constexpr size_t vector_size = 64;
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
   using uint512_t = generic_v512;
 #elif defined(__AVX__)
+  constexpr size_t vector_size = 32;
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
   using uint512_t = cpp::array<generic_v256, 2>;
 #elif defined(__SSE2__)
+  constexpr size_t vector_size = 16;
   using uint128_t = generic_v128;
   using uint256_t = cpp::array<generic_v128, 2>;
   using uint512_t = cpp::array<generic_v128, 4>;
 #else
+  constexpr size_t vector_size = 8;
   using uint128_t = cpp::array<uint64_t, 2>;
   using uint256_t = cpp::array<uint64_t, 4>;
   using uint512_t = cpp::array<uint64_t, 8>;
 #endif
+  (void)vector_size;
   if (count == 0)
     return;
   if (count == 1)
     return generic::Memmove<uint8_t>::block(dst, src);
-  if (count <= 4)
-    return generic::Memmove<uint16_t>::head_tail(dst, src, count);
-  if (count <= 8)
+  if (count == 2)
+    return generic::Memmove<uint16_t>::block(dst, src);
+  if (count == 3)
+    return generic::Memmove<cpp::array<uint8_t, 3>>::block(dst, src);
+  if (count == 4)
+    return generic::Memmove<uint32_t>::block(dst, src);
+  if (count < 8)
     return generic::Memmove<uint32_t>::head_tail(dst, src, count);
-  if (count <= 16)
+  // If count is equal to a power of 2, we can handle it as head-tail
+  // of both smaller size and larger size (head-tail are either
+  // non-overlapping for smaller size, or completely collapsed
+  // for larger size). It seems to be more profitable to do the copy
+  // with the larger size, if it's natively supported (e.g. doing
+  // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
+  // But it's not profitable to use larger size if it's not natively
+  // supported: we will both use more instructions and handle fewer
+  // sizes in earlier branches.
+  if (count < 16 + (vector_size <= sizeof(uint64_t)))
     return generic::Memmove<uint64_t>::head_tail(dst, src, count);
-  if (count <= 32)
+  if (count < 32 + (vector_size <= sizeof(uint128_t)))
     return generic::Memmove<uint128_t>::head_tail(dst, src, count);
-  if (count <= 64)
+  if (count < 64 + (vector_size <= sizeof(uint256_t)))
     return generic::Memmove<uint256_t>::head_tail(dst, src, count);
   if (count <= 128)
     return generic::Memmove<uint512_t>::head_tail(dst, src, count);
+  if (fast_only)
+    return;
   if (dst < src) {
     generic::Memmove<uint256_t>::align_forward<Arg::Src>(dst, src, count);
     return generic::Memmove<uint512_t>::loop_and_tail_forward(dst, src, count);

``````````
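
For readers skimming the patch, here is a minimal, self-contained sketch of the new dispatch in memmove.cpp. All names below (`kSlowSize`, `memmove_fast_only`, `my_memmove`, the `std::memcpy`/`std::memmove` stand-ins) are illustrative substitutes for `LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE`, `inline_memmove(..., fast_only)` and `inline_memcpy`; this is not the actual llvm-libc code.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

namespace sketch {

// Stand-in for LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE
// (129 on x86-64/aarch64 in the patch).
constexpr std::size_t kSlowSize = 129;

// Stand-in for inline_memmove(dst, src, count, /*fast_only=*/true):
// it only handles counts below kSlowSize and is a no-op otherwise,
// so the compiler can drop the large-size logic at this call site.
inline void memmove_fast_only(void *dst, const void *src, std::size_t count) {
  if (count >= kSlowSize)
    return; // fast_only bail-out
  std::memmove(dst, src, count);
}

// Stand-in for the real is_disjoint() check.
inline bool is_disjoint(const void *dst, const void *src, std::size_t count) {
  auto d = reinterpret_cast<std::uintptr_t>(dst);
  auto s = reinterpret_cast<std::uintptr_t>(src);
  return d >= s + count || s >= d + count;
}

void *my_memmove(void *dst, const void *src, std::size_t count) {
  // Small sizes are handled up front, without paying for is_disjoint.
  memmove_fast_only(dst, src, count);
  if (count >= kSlowSize) {
    if (is_disjoint(dst, src, count))
      std::memcpy(dst, src, count);  // stand-in for inline_memcpy
    else
      std::memmove(dst, src, count); // stand-in for the full inline_memmove
  }
  return dst;
}

} // namespace sketch
```

The intent of the pattern, as the patch comment explains, is that the first call gets inlined and dead-code-eliminated down to just the small-size branches, so the most frequent small copies skip the is_disjoint check entirely while the large-size path keeps the memcpy fast path.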

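The power-of-two boundary comment in x86_64/inline_memmove.h is easier to follow with the head-tail scheme spelled out. Below is a simplified `head_tail` sketch; it illustrates the generic::Memmove&lt;T&gt;::head_tail idea but is not the actual llvm-libc implementation.

```cpp
#include <cstddef>
#include <cstring>

// Precondition: sizeof(T) <= count <= 2 * sizeof(T).
template <typename T>
void head_tail(char *dst, const char *src, std::size_t count) {
  T head, tail;
  // Read both blocks before writing anything so that overlapping
  // dst/src ranges are still handled correctly (memmove semantics).
  std::memcpy(&head, src, sizeof(T));
  std::memcpy(&tail, src + count - sizeof(T), sizeof(T));
  std::memcpy(dst, &head, sizeof(T));
  std::memcpy(dst + count - sizeof(T), &tail, sizeof(T));
}
```

For a power-of-two count such as 16, both `head_tail<uint64_t>` (two adjacent 8-byte blocks) and a 16-byte `head_tail` (two fully collapsed blocks) cover the whole range. The patch routes such counts to the larger type only when the target has native vectors of that width, which is what conditions like `count < 16 + (vector_size <= sizeof(uint64_t))` encode: without 16-byte vectors the bound becomes `count < 17`, so count == 16 stays in the uint64_t branch.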


https://github.com/llvm/llvm-project/pull/70043

