[libc-commits] [libc] 2cfae7c - [libc] Dispatch memmove to memcpy when buffers are disjoint

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Wed Jun 14 01:29:26 PDT 2023


Author: Guillaume Chatelet
Date: 2023-06-14T08:29:15Z
New Revision: 2cfae7cdf4514652156bfbc0c240df2112ab925c

URL: https://github.com/llvm/llvm-project/commit/2cfae7cdf4514652156bfbc0c240df2112ab925c
DIFF: https://github.com/llvm/llvm-project/commit/2cfae7cdf4514652156bfbc0c240df2112ab925c.diff

LOG: [libc] Dispatch memmove to memcpy when buffers are disjoint

Most of the time `memmove` is called on buffers that are disjoint, in that case we can use `memcpy` which is faster.
The additional test is branchless on x86, aarch64 and RISCV with the zbb extension (bitmanip).
On x86 this patch adds a latency of 2 to 3 cycles.

Before
```
--------------------------------------------------------------------------------
Benchmark                      Time             CPU   Iterations UserCounters...
--------------------------------------------------------------------------------
BM_Memmove/0/0_median       5.00 ns         5.00 ns           10 bytes_per_cycle=1.25477/s bytes_per_second=2.62933G/s items_per_second=199.87M/s __llvm_libc::memmove,memmove Google A
BM_Memmove/1/0_median       6.21 ns         6.21 ns           10 bytes_per_cycle=3.22173/s bytes_per_second=6.75106G/s items_per_second=160.955M/s __llvm_libc::memmove,memmove Google B
BM_Memmove/2/0_median       8.09 ns         8.09 ns           10 bytes_per_cycle=5.31462/s bytes_per_second=11.1366G/s items_per_second=123.603M/s __llvm_libc::memmove,memmove Google D
BM_Memmove/3/0_median       5.95 ns         5.95 ns           10 bytes_per_cycle=2.71865/s bytes_per_second=5.69687G/s items_per_second=167.967M/s __llvm_libc::memmove,memmove Google L
BM_Memmove/4/0_median       5.63 ns         5.63 ns           10 bytes_per_cycle=2.28294/s bytes_per_second=4.78383G/s items_per_second=177.615M/s __llvm_libc::memmove,memmove Google M
BM_Memmove/5/0_median       5.68 ns         5.68 ns           10 bytes_per_cycle=2.16798/s bytes_per_second=4.54295G/s items_per_second=176.015M/s __llvm_libc::memmove,memmove Google Q
BM_Memmove/6/0_median       7.46 ns         7.46 ns           10 bytes_per_cycle=3.97619/s bytes_per_second=8.332G/s items_per_second=134.044M/s __llvm_libc::memmove,memmove Google S
BM_Memmove/7/0_median       5.40 ns         5.40 ns           10 bytes_per_cycle=1.79695/s bytes_per_second=3.76546G/s items_per_second=185.211M/s __llvm_libc::memmove,memmove Google U
BM_Memmove/8/0_median       5.62 ns         5.62 ns           10 bytes_per_cycle=3.18747/s bytes_per_second=6.67927G/s items_per_second=177.983M/s __llvm_libc::memmove,memmove Google W
BM_Memmove/9/0_median        101 ns          101 ns           10 bytes_per_cycle=9.77359/s bytes_per_second=20.4803G/s items_per_second=9.9333M/s __llvm_libc::memmove,uniform 384 to 4096
```
After
```
BM_Memmove/0/0_median       3.57 ns         3.57 ns           10 bytes_per_cycle=1.71375/s bytes_per_second=3.59112G/s items_per_second=280.411M/s __llvm_libc::memmove,memmove Google A
BM_Memmove/1/0_median       4.52 ns         4.52 ns           10 bytes_per_cycle=4.47557/s bytes_per_second=9.37843G/s items_per_second=221.427M/s __llvm_libc::memmove,memmove Google B
BM_Memmove/2/0_median       5.70 ns         5.70 ns           10 bytes_per_cycle=7.37396/s bytes_per_second=15.4519G/s items_per_second=175.399M/s __llvm_libc::memmove,memmove Google D
BM_Memmove/3/0_median       4.47 ns         4.47 ns           10 bytes_per_cycle=3.4148/s bytes_per_second=7.15563G/s items_per_second=223.743M/s __llvm_libc::memmove,memmove Google L
BM_Memmove/4/0_median       4.53 ns         4.53 ns           10 bytes_per_cycle=2.86071/s bytes_per_second=5.99454G/s items_per_second=220.69M/s __llvm_libc::memmove,memmove Google M
BM_Memmove/5/0_median       4.19 ns         4.19 ns           10 bytes_per_cycle=2.5484/s bytes_per_second=5.3401G/s items_per_second=238.924M/s __llvm_libc::memmove,memmove Google Q
BM_Memmove/6/0_median       5.02 ns         5.02 ns           10 bytes_per_cycle=5.94164/s bytes_per_second=12.4505G/s items_per_second=199.14M/s __llvm_libc::memmove,memmove Google S
BM_Memmove/7/0_median       4.03 ns         4.03 ns           10 bytes_per_cycle=2.47028/s bytes_per_second=5.17641G/s items_per_second=247.906M/s __llvm_libc::memmove,memmove Google U
BM_Memmove/8/0_median       4.70 ns         4.70 ns           10 bytes_per_cycle=3.84975/s bytes_per_second=8.06706G/s items_per_second=212.72M/s __llvm_libc::memmove,memmove Google W
BM_Memmove/9/0_median       90.7 ns         90.7 ns           10 bytes_per_cycle=10.8681/s bytes_per_second=22.7739G/s items_per_second=11.02M/s __llvm_libc::memmove,uniform 384 to 4096
```

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D152811

Added: 
    

Modified: 
    libc/src/string/memmove.cpp
    libc/src/string/memory_utils/utils.h
    libc/test/src/string/memory_utils/utils_test.cpp

Removed: 
    


################################################################################
diff  --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index ba1b29bf9876e..3b6866fd9f017 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/string/memmove.h"
+#include "src/string/memory_utils/memcpy_implementations.h"
 #include "src/string/memory_utils/memmove_implementations.h"
 #include <stddef.h> // size_t
 
@@ -14,7 +15,10 @@ namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(void *, memmove,
                    (void *dst, const void *src, size_t count)) {
-  inline_memmove(dst, src, count);
+  if (is_disjoint(dst, src, count))
+    inline_memcpy(dst, src, count);
+  else
+    inline_memmove(dst, src, count);
   return dst;
 }
 

diff  --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 32be4c497507e..5a3dcc6907cee 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -84,6 +84,21 @@ template <size_t alignment, typename T> static T *assume_aligned(T *ptr) {
   return reinterpret_cast<T *>(__builtin_assume_aligned(ptr, alignment));
 }
 
+// Returns true iff memory regions [p1, p1 + size] and [p2, p2 + size] are
+// disjoint.
+LIBC_INLINE bool is_disjoint(const void *p1, const void *p2, size_t size) {
+  const char *a = static_cast<const char *>(p1);
+  const char *b = static_cast<const char *>(p2);
+  if (a > b) {
+    // Swap a and b, this compiles down to conditionnal move for aarch64, x86
+    // and RISCV with zbb extension.
+    const char *tmp = a;
+    a = b;
+    b = tmp;
+  }
+  return a + size <= b;
+}
+
 #if LIBC_HAS_BUILTIN(__builtin_memcpy_inline)
 #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
 #endif

diff  --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
index 4fd6def780aa9..8968e0d85ae99 100644
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -144,6 +144,19 @@ TEST(LlvmLibcUtilsTest, Align2) {
   }
 }
 
+TEST(LlvmLibcUtilsTest, DisjointBuffers) {
+  char buf[3];
+  const char *const a = buf + 0;
+  const char *const b = buf + 1;
+  EXPECT_TRUE(is_disjoint(a, b, 0));
+  EXPECT_TRUE(is_disjoint(a, b, 1));
+  EXPECT_FALSE(is_disjoint(a, b, 2));
+
+  EXPECT_TRUE(is_disjoint(b, a, 0));
+  EXPECT_TRUE(is_disjoint(b, a, 1));
+  EXPECT_FALSE(is_disjoint(b, a, 2));
+}
+
 TEST(LlvmLibcUtilsTest, LoadStoreAligned) {
   const uint64_t init = 0xDEAD'C0DE'BEEF'F00D;
   CPtr const src = reinterpret_cast<CPtr>(&init);


        


More information about the libc-commits mailing list