[libc-commits] [libc] memmove optimizations (PR #70043)
via libc-commits
libc-commits at lists.llvm.org
Tue Oct 24 06:57:01 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-libc
Author: Dmitry Vyukov (dvyukov)
<details>
<summary>Changes</summary>
See individual commits for description.
```
│ baseline │ small-size-check │
│ sec/op │ sec/op vs base │
memmove/Google_A 3.208n ± 0% 2.909n ± 0% -9.31% (n=100)
memmove/0 2.982n ± 0% 2.168n ± 0% -27.27% (n=50)
memmove/1 3.253n ± 0% 2.169n ± 0% -33.34% (n=50)
memmove/2 3.255n ± 0% 2.168n ± 6% -33.40% (n=50)
memmove/3 3.259n ± 2% 2.175n ± 0% -33.26% (n=50)
memmove/4 3.259n ± 0% 2.168n ± 0% -33.45% (p=0.000 n=50)
memmove/5 2.488n ± 0% 1.926n ± 0% -22.57% (n=50)
memmove/6 2.490n ± 0% 1.928n ± 0% -22.58% (p=0.000 n=50)
memmove/7 2.492n ± 0% 1.928n ± 0% -22.63% (n=50)
memmove/8 2.737n ± 0% 2.711n ± 0% -0.97% (p=0.000 n=50)
memmove/9 2.736n ± 0% 2.711n ± 0% -0.94% (p=0.000 n=50)
memmove/10 2.739n ± 0% 2.711n ± 0% -1.04% (p=0.000 n=50)
memmove/11 2.740n ± 0% 2.711n ± 0% -1.07% (p=0.000 n=50)
memmove/12 2.740n ± 0% 2.711n ± 0% -1.09% (p=0.000 n=50)
memmove/13 2.744n ± 0% 2.711n ± 0% -1.22% (p=0.000 n=50)
memmove/14 2.742n ± 0% 2.711n ± 0% -1.14% (p=0.000 n=50)
memmove/15 2.742n ± 0% 2.711n ± 0% -1.15% (p=0.000 n=50)
memmove/16 2.997n ± 0% 2.982n ± 0% -0.52% (p=0.000 n=50)
memmove/17 2.998n ± 0% 2.982n ± 0% -0.55% (p=0.000 n=50)
memmove/18 2.998n ± 0% 2.982n ± 0% -0.54% (p=0.000 n=50)
memmove/19 2.999n ± 0% 2.981n ± 0% -0.59% (p=0.000 n=50)
memmove/20 2.998n ± 0% 2.982n ± 0% -0.55% (p=0.000 n=50)
memmove/21 3.000n ± 0% 2.982n ± 0% -0.61% (p=0.000 n=50)
memmove/22 3.002n ± 0% 2.982n ± 0% -0.68% (p=0.000 n=50)
memmove/23 3.002n ± 0% 2.981n ± 0% -0.67% (p=0.000 n=50)
memmove/24 3.002n ± 0% 2.981n ± 0% -0.70% (p=0.000 n=50)
memmove/25 3.002n ± 0% 2.982n ± 0% -0.68% (p=0.000 n=50)
memmove/26 3.004n ± 0% 2.982n ± 0% -0.74% (n=50)
memmove/27 3.005n ± 0% 2.982n ± 0% -0.79% (p=0.000 n=50)
memmove/28 3.005n ± 0% 2.982n ± 0% -0.77% (p=0.000 n=50)
memmove/29 3.009n ± 0% 2.982n ± 0% -0.92% (n=50)
memmove/30 3.008n ± 0% 2.982n ± 0% -0.89% (n=50)
memmove/31 3.007n ± 0% 2.981n ± 0% -0.86% (n=50)
memmove/32 3.540n ± 0% 2.999n ± 0% -15.30% (p=0.000 n=50)
memmove/33 3.544n ± 0% 2.998n ± 0% -15.41% (p=0.000 n=50)
memmove/34 3.546n ± 0% 2.999n ± 0% -15.42% (n=50)
memmove/35 3.545n ± 0% 2.999n ± 0% -15.41% (p=0.000 n=50)
memmove/36 3.548n ± 0% 2.998n ± 0% -15.51% (p=0.000 n=50)
memmove/37 3.546n ± 0% 2.999n ± 0% -15.44% (n=50)
memmove/38 3.549n ± 0% 2.999n ± 0% -15.50% (p=0.000 n=50)
memmove/39 3.549n ± 0% 2.999n ± 0% -15.48% (p=0.000 n=50)
memmove/40 3.549n ± 0% 3.000n ± 0% -15.48% (p=0.000 n=50)
memmove/41 3.550n ± 0% 3.000n ± 0% -15.49% (p=0.000 n=50)
memmove/42 3.549n ± 0% 3.001n ± 0% -15.45% (n=50)
memmove/43 3.552n ± 0% 3.000n ± 0% -15.54% (p=0.000 n=50)
memmove/44 3.552n ± 0% 3.002n ± 0% -15.49% (n=50)
memmove/45 3.552n ± 0% 3.001n ± 0% -15.49% (p=0.000 n=50)
memmove/46 3.554n ± 0% 3.002n ± 0% -15.52% (p=0.000 n=50)
memmove/47 3.556n ± 0% 3.003n ± 0% -15.56% (p=0.000 n=50)
memmove/48 3.555n ± 0% 3.002n ± 0% -15.56% (p=0.000 n=50)
memmove/49 3.557n ± 0% 3.002n ± 0% -15.58% (p=0.000 n=50)
memmove/50 3.557n ± 0% 3.003n ± 0% -15.58% (p=0.000 n=50)
memmove/51 3.556n ± 0% 3.004n ± 0% -15.52% (p=0.000 n=50)
memmove/52 3.561n ± 0% 3.004n ± 0% -15.63% (p=0.000 n=50)
memmove/53 3.558n ± 0% 3.004n ± 0% -15.57% (p=0.000 n=50)
memmove/54 3.561n ± 0% 3.005n ± 0% -15.61% (p=0.000 n=50)
memmove/55 3.560n ± 0% 3.006n ± 0% -15.58% (n=50)
memmove/56 3.562n ± 0% 3.006n ± 0% -15.60% (p=0.000 n=50)
memmove/57 3.563n ± 0% 3.010n ± 0% -15.52% (p=0.000 n=50)
memmove/58 3.565n ± 0% 3.006n ± 0% -15.66% (p=0.000 n=50)
memmove/59 3.564n ± 0% 3.006n ± 0% -15.66% (p=0.000 n=50)
memmove/60 3.570n ± 0% 3.008n ± 0% -15.75% (p=0.000 n=50)
memmove/61 3.566n ± 0% 3.008n ± 0% -15.67% (p=0.000 n=50)
memmove/62 3.567n ± 0% 3.008n ± 0% -15.68% (p=0.000 n=50)
memmove/63 3.568n ± 0% 3.008n ± 0% -15.69% (p=0.000 n=50)
memmove/64 4.104n ± 0% 3.008n ± 0% -26.70% (p=0.000 n=50)
memmove/65 4.126n ± 0% 3.662n ± 0% -11.25% (p=0.000 n=50)
memmove/66 4.128n ± 0% 3.662n ± 0% -11.28% (n=50)
memmove/67 4.129n ± 0% 3.662n ± 0% -11.31% (p=0.000 n=50)
memmove/68 4.129n ± 0% 3.661n ± 0% -11.32% (p=0.000 n=50)
memmove/69 4.130n ± 0% 3.662n ± 0% -11.35% (n=50)
memmove/70 4.130n ± 0% 3.662n ± 0% -11.34% (p=0.000 n=50)
memmove/71 4.132n ± 0% 3.662n ± 0% -11.37% (p=0.000 n=50)
memmove/72 4.131n ± 0% 3.662n ± 0% -11.37% (p=0.000 n=50)
memmove/73 4.135n ± 0% 3.662n ± 0% -11.43% (p=0.000 n=50)
memmove/74 4.137n ± 0% 3.662n ± 0% -11.49% (n=50)
memmove/75 4.138n ± 0% 3.662n ± 0% -11.51% (n=50)
memmove/76 4.139n ± 0% 3.661n ± 0% -11.55% (p=0.000 n=50)
memmove/77 4.136n ± 0% 3.661n ± 0% -11.48% (n=50)
memmove/78 4.143n ± 0% 3.661n ± 0% -11.62% (n=50)
memmove/79 4.142n ± 0% 3.661n ± 0% -11.60% (n=50)
memmove/80 4.142n ± 0% 3.661n ± 0% -11.61% (p=0.000 n=50)
memmove/81 4.140n ± 0% 3.661n ± 0% -11.56% (n=50)
memmove/82 4.146n ± 0% 3.661n ± 0% -11.68% (p=0.000 n=50)
memmove/83 4.143n ± 0% 3.661n ± 0% -11.63% (p=0.000 n=50)
memmove/84 4.143n ± 0% 3.661n ± 0% -11.62% (n=50)
memmove/85 4.147n ± 0% 3.661n ± 0% -11.72% (n=50)
memmove/86 4.142n ± 0% 3.661n ± 0% -11.62% (p=0.000 n=50)
memmove/87 4.147n ± 0% 3.661n ± 0% -11.73% (p=0.000 n=50)
memmove/88 4.148n ± 0% 3.661n ± 0% -11.75% (p=0.000 n=50)
memmove/89 4.152n ± 0% 3.661n ± 0% -11.83% (n=50)
memmove/90 4.151n ± 0% 3.661n ± 0% -11.82% (n=50)
memmove/91 4.150n ± 0% 3.661n ± 0% -11.78% (p=0.000 n=50)
memmove/92 4.153n ± 0% 3.660n ± 0% -11.87% (p=0.000 n=50)
memmove/93 4.158n ± 0% 3.661n ± 0% -11.95% (n=50)
memmove/94 4.157n ± 0% 3.661n ± 0% -11.95% (p=0.000 n=50)
memmove/95 4.155n ± 0% 3.661n ± 0% -11.90% (p=0.000 n=50)
memmove/96 4.149n ± 0% 3.660n ± 0% -11.80% (p=0.000 n=50)
memmove/97 4.157n ± 0% 3.661n ± 0% -11.94% (n=50)
memmove/98 4.157n ± 0% 3.661n ± 0% -11.94% (p=0.000 n=50)
memmove/99 4.168n ± 0% 3.661n ± 0% -12.17% (n=50)
memmove/100 4.159n ± 0% 3.660n ± 0% -12.00% (n=50)
memmove/101 4.161n ± 0% 3.661n ± 0% -12.03% (n=50)
memmove/102 4.165n ± 0% 3.660n ± 0% -12.12% (n=50)
memmove/103 4.164n ± 0% 3.661n ± 0% -12.08% (p=0.000 n=50)
memmove/104 4.164n ± 0% 3.660n ± 0% -12.11% (p=0.000 n=50)
memmove/105 4.165n ± 0% 3.660n ± 0% -12.12% (n=50)
memmove/106 4.166n ± 0% 3.660n ± 0% -12.15% (n=50)
memmove/107 4.171n ± 0% 3.660n ± 0% -12.25% (p=0.000 n=50)
memmove/108 4.173n ± 0% 3.660n ± 0% -12.29% (p=0.000 n=50)
memmove/109 4.170n ± 0% 3.660n ± 0% -12.24% (p=0.000 n=50)
memmove/110 4.174n ± 0% 3.660n ± 0% -12.31% (p=0.000 n=50)
memmove/111 4.176n ± 0% 3.660n ± 0% -12.35% (n=50)
memmove/112 4.174n ± 0% 3.660n ± 0% -12.33% (p=0.000 n=50)
memmove/113 4.176n ± 0% 3.660n ± 0% -12.35% (p=0.000 n=50)
memmove/114 4.182n ± 0% 3.660n ± 0% -12.48% (p=0.000 n=50)
memmove/115 4.185n ± 0% 3.660n ± 0% -12.55% (n=50)
memmove/116 4.184n ± 0% 3.660n ± 0% -12.54% (n=50)
memmove/117 4.182n ± 0% 3.660n ± 0% -12.49% (n=50)
memmove/118 4.188n ± 0% 3.660n ± 0% -12.60% (n=50)
memmove/119 4.186n ± 0% 3.660n ± 0% -12.56% (p=0.000 n=50)
memmove/120 4.189n ± 0% 3.660n ± 0% -12.62% (n=50)
memmove/121 4.187n ± 0% 3.668n ± 0% -12.40% (n=50)
memmove/122 4.186n ± 0% 3.667n ± 0% -12.39% (p=0.000 n=50)
memmove/123 4.187n ± 0% 3.668n ± 0% -12.41% (p=0.000 n=50)
memmove/124 4.189n ± 0% 3.667n ± 0% -12.46% (n=50)
memmove/125 4.195n ± 0% 3.662n ± 1% -12.72% (p=0.000 n=50)
memmove/126 4.197n ± 0% 3.669n ± 0% -12.59% (n=50)
memmove/127 4.194n ± 0% 3.668n ± 0% -12.53% (p=0.000 n=50)
memmove/128 5.035n ± 0% 3.656n ± 2% -27.38% (p=0.000 n=50)
```
---
Full diff: https://github.com/llvm/llvm-project/pull/70043.diff
8 Files Affected:
- (modified) libc/benchmarks/LibcMemoryBenchmarkMain.cpp (+10-4)
- (modified) libc/src/string/memmove.cpp (+15-4)
- (modified) libc/src/string/memory_utils/aarch64/inline_memmove.h (+4-1)
- (modified) libc/src/string/memory_utils/generic/builtin.h (+2-2)
- (modified) libc/src/string/memory_utils/generic/byte_per_byte.h (+1-1)
- (modified) libc/src/string/memory_utils/inline_memmove.h (+11-2)
- (modified) libc/src/string/memory_utils/riscv/inline_memmove.h (+5-3)
- (modified) libc/src/string/memory_utils/x86_64/inline_memmove.h (+28-7)
``````````diff
diff --git a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
index acd7c30717597a1..bc6fd8b38cb6ddc 100644
--- a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
+++ b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
@@ -42,9 +42,15 @@ static cl::opt<std::string>
SizeDistributionName("size-distribution-name",
cl::desc("The name of the distribution to use"));
-static cl::opt<bool>
- SweepMode("sweep-mode",
- cl::desc("If set, benchmark all sizes from 0 to sweep-max-size"));
+static cl::opt<bool> SweepMode(
+ "sweep-mode",
+ cl::desc(
+ "If set, benchmark all sizes from sweep-min-size to sweep-max-size"));
+
+static cl::opt<uint32_t>
+ SweepMinSize("sweep-min-size",
+ cl::desc("The minimum size to use in sweep-mode"),
+ cl::init(0));
static cl::opt<uint32_t>
SweepMaxSize("sweep-max-size",
@@ -185,7 +191,7 @@ struct MemfunctionBenchmarkSweep final : public MemfunctionBenchmarkBase {
BO.InitialIterations = 100;
auto &Measurements = Study.Measurements;
Measurements.reserve(NumTrials * SweepMaxSize);
- for (size_t Size = 0; Size <= SweepMaxSize; ++Size) {
+ for (size_t Size = SweepMinSize; Size <= SweepMaxSize; ++Size) {
CurrentSweepSize = Size;
runTrials(BO, Measurements);
}
diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index 7d473afc0b42ee7..a6478629d514027 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -15,10 +15,21 @@ namespace LIBC_NAMESPACE {
LLVM_LIBC_FUNCTION(void *, memmove,
(void *dst, const void *src, size_t count)) {
- if (is_disjoint(dst, src, count))
- inline_memcpy(dst, src, count);
- else
- inline_memmove(dst, src, count);
+ // inline_memmove may handle some small sizes as efficiently
+ // as inline_memcpy. For these sizes we may not do is_disjoint check.
+ // This both avoids additional code for the most frequent smaller sizes
+ // and removes code bloat (we don't need the memcpy logic for small sizes).
+ // Here we heavily rely on inlining and dead code elimination: from the first
+ // inline_memmove we should get only handling of small sizes, and from
+ // the second inline_memmove and inline_memcpy we should get only handling
+ // of larger sizes.
+ inline_memmove(dst, src, count, true);
+ if (count >= LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE) {
+ if (is_disjoint(dst, src, count))
+ inline_memcpy(dst, src, count);
+ else
+ inline_memmove(dst, src, count);
+ }
return dst;
}
diff --git a/libc/src/string/memory_utils/aarch64/inline_memmove.h b/libc/src/string/memory_utils/aarch64/inline_memmove.h
index ca28655c916820c..5e5c23e34be62d1 100644
--- a/libc/src/string/memory_utils/aarch64/inline_memmove.h
+++ b/libc/src/string/memory_utils/aarch64/inline_memmove.h
@@ -18,7 +18,8 @@
namespace LIBC_NAMESPACE {
-LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count) {
+LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count,
+ bool fast_only) {
static_assert(aarch64::kNeon, "aarch64 supports vector types");
using uint128_t = generic_v128;
using uint256_t = generic_v256;
@@ -39,6 +40,8 @@ LIBC_INLINE void inline_memmove_aarch64(Ptr dst, CPtr src, size_t count) {
return generic::Memmove<uint256_t>::head_tail(dst, src, count);
if (count <= 128)
return generic::Memmove<uint512_t>::head_tail(dst, src, count);
+ if (fast_only)
+ return;
if (dst < src) {
generic::Memmove<uint256_t>::align_forward<Arg::Src>(dst, src, count);
return generic::Memmove<uint512_t>::loop_and_tail_forward(dst, src, count);
diff --git a/libc/src/string/memory_utils/generic/builtin.h b/libc/src/string/memory_utils/generic/builtin.h
index 5239329f653b341..1dabc856053d191 100644
--- a/libc/src/string/memory_utils/generic/builtin.h
+++ b/libc/src/string/memory_utils/generic/builtin.h
@@ -26,8 +26,8 @@ inline_memcpy_builtin(Ptr dst, CPtr src, size_t count, size_t offset = 0) {
__builtin_memcpy(dst + offset, src + offset, count);
}
-[[maybe_unused]] LIBC_INLINE void inline_memmove_builtin(Ptr dst, CPtr src,
- size_t count) {
+[[maybe_unused]] LIBC_INLINE void
+inline_memmove_builtin(Ptr dst, CPtr src, size_t count, bool fast_only) {
__builtin_memmove(dst, src, count);
}
diff --git a/libc/src/string/memory_utils/generic/byte_per_byte.h b/libc/src/string/memory_utils/generic/byte_per_byte.h
index a666c5da3136041..89497382aede338 100644
--- a/libc/src/string/memory_utils/generic/byte_per_byte.h
+++ b/libc/src/string/memory_utils/generic/byte_per_byte.h
@@ -29,7 +29,7 @@ inline_memcpy_byte_per_byte(Ptr dst, CPtr src, size_t count,
}
[[maybe_unused]] LIBC_INLINE void
-inline_memmove_byte_per_byte(Ptr dst, CPtr src, size_t count) {
+inline_memmove_byte_per_byte(Ptr dst, CPtr src, size_t count, bool fast_only) {
if (count == 0 || dst == src)
return;
if (dst < src) {
diff --git a/libc/src/string/memory_utils/inline_memmove.h b/libc/src/string/memory_utils/inline_memmove.h
index f72ea24ab538d69..0440bbe94d542e9 100644
--- a/libc/src/string/memory_utils/inline_memmove.h
+++ b/libc/src/string/memory_utils/inline_memmove.h
@@ -14,27 +14,36 @@
#if defined(LIBC_TARGET_ARCH_IS_X86)
#include "src/string/memory_utils/x86_64/inline_memmove.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_x86
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 129
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#include "src/string/memory_utils/aarch64/inline_memmove.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_aarch64
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 129
#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV)
#include "src/string/memory_utils/riscv/inline_memmove.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_riscv
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
#elif defined(LIBC_TARGET_ARCH_IS_ARM)
#include "src/string/memory_utils/generic/byte_per_byte.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_byte_per_byte
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
#elif defined(LIBC_TARGET_ARCH_IS_GPU)
#include "src/string/memory_utils/generic/builtin.h"
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_builtin
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE 0
#else
#error "Unsupported architecture"
#endif
namespace LIBC_NAMESPACE {
-LIBC_INLINE void inline_memmove(void *dst, const void *src, size_t count) {
+LIBC_INLINE void inline_memmove(void *dst, const void *src, size_t count,
+ bool fast_only = false) {
+ if (LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SLOW_SIZE == 0 && fast_only)
+ return;
LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE(reinterpret_cast<Ptr>(dst),
- reinterpret_cast<CPtr>(src), count);
+ reinterpret_cast<CPtr>(src), count,
+ fast_only);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/riscv/inline_memmove.h b/libc/src/string/memory_utils/riscv/inline_memmove.h
index 1c26917a96d9d18..5e34b2817729972 100644
--- a/libc/src/string/memory_utils/riscv/inline_memmove.h
+++ b/libc/src/string/memory_utils/riscv/inline_memmove.h
@@ -17,9 +17,11 @@
namespace LIBC_NAMESPACE {
-[[maybe_unused]] LIBC_INLINE void
-inline_memmove_riscv(Ptr __restrict dst, CPtr __restrict src, size_t count) {
- return inline_memmove_byte_per_byte(dst, src, count);
+[[maybe_unused]] LIBC_INLINE void inline_memmove_riscv(Ptr __restrict dst,
+ CPtr __restrict src,
+ size_t count,
+ bool fast_only) {
+ return inline_memmove_byte_per_byte(dst, src, count, fast_only);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/x86_64/inline_memmove.h b/libc/src/string/memory_utils/x86_64/inline_memmove.h
index 95ad07f75219581..ee397c63471f1ad 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memmove.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memmove.h
@@ -18,40 +18,61 @@
namespace LIBC_NAMESPACE {
-LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count) {
+LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count,
+ bool fast_only) {
#if defined(__AVX512F__)
+ constexpr size_t vector_size = 64;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = generic_v512;
#elif defined(__AVX__)
+ constexpr size_t vector_size = 32;
using uint128_t = generic_v128;
using uint256_t = generic_v256;
using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
+ constexpr size_t vector_size = 16;
using uint128_t = generic_v128;
using uint256_t = cpp::array<generic_v128, 2>;
using uint512_t = cpp::array<generic_v128, 4>;
#else
+ constexpr size_t vector_size = 8;
using uint128_t = cpp::array<uint64_t, 2>;
using uint256_t = cpp::array<uint64_t, 4>;
using uint512_t = cpp::array<uint64_t, 8>;
#endif
+ (void)vector_size;
if (count == 0)
return;
if (count == 1)
return generic::Memmove<uint8_t>::block(dst, src);
- if (count <= 4)
- return generic::Memmove<uint16_t>::head_tail(dst, src, count);
- if (count <= 8)
+ if (count == 2)
+ return generic::Memmove<uint16_t>::block(dst, src);
+ if (count == 3)
+ return generic::Memmove<cpp::array<uint8_t, 3>>::block(dst, src);
+ if (count == 4)
+ return generic::Memmove<uint32_t>::block(dst, src);
+ if (count < 8)
return generic::Memmove<uint32_t>::head_tail(dst, src, count);
- if (count <= 16)
+ // If count is equal to a power of 2, we can handle it as head-tail
+ // of both smaller size and larger size (head-tail are either
+ // non-overlapping for smaller size, or completely collapsed
+ // for larger size). It seems to be more profitable to do the copy
+ // with the larger size, if it's natively supported (e.g. doing
+ // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
+ // But it's not profitable to use larger size if it's not natively
+ // supported: we will both use more instructions and handle fewer
+ // sizes in earlier branches.
+ if (count < 16 + (vector_size <= sizeof(uint64_t)))
return generic::Memmove<uint64_t>::head_tail(dst, src, count);
- if (count <= 32)
+ if (count < 32 + (vector_size <= sizeof(uint128_t)))
return generic::Memmove<uint128_t>::head_tail(dst, src, count);
- if (count <= 64)
+ if (count < 64 + (vector_size <= sizeof(uint256_t)))
return generic::Memmove<uint256_t>::head_tail(dst, src, count);
if (count <= 128)
return generic::Memmove<uint512_t>::head_tail(dst, src, count);
+ if (fast_only)
+ return;
if (dst < src) {
generic::Memmove<uint256_t>::align_forward<Arg::Src>(dst, src, count);
return generic::Memmove<uint512_t>::loop_and_tail_forward(dst, src, count);
``````````
</details>
https://github.com/llvm/llvm-project/pull/70043
More information about the libc-commits mailing list