[libc-commits] [libc] f4a3549 - [libc] Add optimized memcpy for RISCV

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Wed May 10 01:42:19 PDT 2023


Author: Guillaume Chatelet
Date: 2023-05-10T08:42:07Z
New Revision: f4a35492504d7a47afc8ea5b5dd9c437b7b66380

URL: https://github.com/llvm/llvm-project/commit/f4a35492504d7a47afc8ea5b5dd9c437b7b66380
DIFF: https://github.com/llvm/llvm-project/commit/f4a35492504d7a47afc8ea5b5dd9c437b7b66380.diff

LOG: [libc] Add optimized memcpy for RISCV

This patch adds two versions of memcpy optimized for architectures where unaligned accesses are either illegal or extremely slow: a 32-bit and a 64-bit variant. They are currently enabled for 64-bit and 32-bit RISC-V, but they could be used for 32-bit ARM architectures as well.
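
For illustration, here is a minimal sketch of the 32-bit strategy (little-endian only; `sketch_memcpy_aligned32` is a hypothetical name, and the structure is simplified). The actual implementations are `inline_memcpy_aligned_access_32bit` / `inline_memcpy_aligned_access_64bit` in the diff below; they hoist the source-alignment dispatch out of the loop and use the `load32_aligned` / `store32_aligned` helpers added to utils.h:
```
#include <cstddef>
#include <cstdint>
#include <cstring>

// Hedged sketch, little-endian only. std::memcpy with a constant size is the
// portable way to express a single (here: aligned) load or store.
void sketch_memcpy_aligned32(char *dst, const char *src, size_t count) {
  constexpr size_t kAlign = sizeof(uint32_t);
  if (count <= 2 * kAlign) { // too small for the alignment dance to pay off
    for (size_t i = 0; i < count; ++i)
      dst[i] = src[i];
    return;
  }
  // Head: copy bytes until 'dst' is 4-byte aligned.
  size_t offset =
      (kAlign - reinterpret_cast<uintptr_t>(dst) % kAlign) % kAlign;
  for (size_t i = 0; i < offset; ++i)
    dst[i] = src[i];
  // Body: one aligned 32-bit store per iteration; the source word is
  // assembled from aligned narrower loads depending on 'src' alignment.
  // The real patch computes this dispatch once, outside the loop.
  for (; offset + kAlign <= count; offset += kAlign) {
    uint32_t value;
    const uintptr_t sa = reinterpret_cast<uintptr_t>(src + offset) % kAlign;
    if (sa == 0) {
      std::memcpy(&value, src + offset, 4); // one aligned u32 load
    } else if (sa == 2) {
      uint16_t lo, hi; // two aligned u16 loads
      std::memcpy(&lo, src + offset, 2);
      std::memcpy(&hi, src + offset + 2, 2);
      value = uint32_t(lo) | (uint32_t(hi) << 16); // little-endian merge
    } else {
      value = 0;                     // byte loads stand in here for the
      for (size_t b = 0; b < 4; ++b) // real u8/u16/u8 recombination
        value |= uint32_t(uint8_t(src[offset + b])) << (8 * b);
    }
    std::memcpy(dst + offset, &value, 4); // aligned u32 store
  }
  // Tail: remaining bytes.
  for (; offset < count; ++offset)
    dst[offset] = src[offset];
}
```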

Here is the before/after output of `libc.benchmarks.memory_functions.opt_host --benchmark_filter=BM_Memcpy` on a quad-core StarFive RISC-V 64-bit board running Linux at 1.5 GHz.

Before:
```
Run on (4 X 1500 MHz CPU s)
CPU Caches:
  L1 Instruction 32 KiB (x4)
  L1 Data 32 KiB (x4)
  L2 Unified 2048 KiB (x1)
------------------------------------------------------------------------
Benchmark              Time             CPU   Iterations UserCounters...
------------------------------------------------------------------------
BM_Memcpy/0/0        474 ns          474 ns      1483776 bytes_per_cycle=0.243492/s bytes_per_second=348.318M/s items_per_second=2.11097M/s __llvm_libc::memcpy,memcpy Google A
BM_Memcpy/1/0        210 ns          209 ns      3649536 bytes_per_cycle=0.233819/s bytes_per_second=334.481M/s items_per_second=4.77519M/s __llvm_libc::memcpy,memcpy Google B
BM_Memcpy/2/0       1814 ns         1814 ns       396288 bytes_per_cycle=0.247899/s bytes_per_second=354.622M/s items_per_second=551.402k/s __llvm_libc::memcpy,memcpy Google D
BM_Memcpy/3/0       89.3 ns         89.2 ns      7459840 bytes_per_cycle=0.217415/s bytes_per_second=311.014M/s items_per_second=11.2071M/s __llvm_libc::memcpy,memcpy Google L
BM_Memcpy/4/0        134 ns          134 ns      3815424 bytes_per_cycle=0.226584/s bytes_per_second=324.131M/s items_per_second=7.44567M/s __llvm_libc::memcpy,memcpy Google M
BM_Memcpy/5/0       52.8 ns         52.6 ns     11001856 bytes_per_cycle=0.194893/s bytes_per_second=278.797M/s items_per_second=19.0284M/s __llvm_libc::memcpy,memcpy Google Q
BM_Memcpy/6/0        180 ns          180 ns      4101120 bytes_per_cycle=0.231884/s bytes_per_second=331.713M/s items_per_second=5.55957M/s __llvm_libc::memcpy,memcpy Google S
BM_Memcpy/7/0        195 ns          195 ns      3906560 bytes_per_cycle=0.232951/s bytes_per_second=333.239M/s items_per_second=5.1217M/s __llvm_libc::memcpy,memcpy Google U
BM_Memcpy/8/0        152 ns          152 ns      4789248 bytes_per_cycle=0.227507/s bytes_per_second=325.452M/s items_per_second=6.58187M/s __llvm_libc::memcpy,memcpy Google W
BM_Memcpy/9/0       6036 ns         6033 ns       118784 bytes_per_cycle=0.249158/s bytes_per_second=356.423M/s items_per_second=165.75k/s __llvm_libc::memcpy,uniform 384 to 4096
```

After:
```
BM_Memcpy/0/0        126 ns          126 ns      5770240 bytes_per_cycle=1.04707/s bytes_per_second=1.46273G/s items_per_second=7.9385M/s __llvm_libc::memcpy,memcpy Google A
BM_Memcpy/1/0       75.1 ns         75.0 ns     10204160 bytes_per_cycle=0.691143/s bytes_per_second=988.687M/s items_per_second=13.3289M/s __llvm_libc::memcpy,memcpy Google B
BM_Memcpy/2/0        333 ns          333 ns      2174976 bytes_per_cycle=1.39297/s bytes_per_second=1.94596G/s items_per_second=3.00002M/s __llvm_libc::memcpy,memcpy Google D
BM_Memcpy/3/0       49.6 ns         49.5 ns     16092160 bytes_per_cycle=0.710161/s bytes_per_second=1015.89M/s items_per_second=20.1844M/s __llvm_libc::memcpy,memcpy Google L
BM_Memcpy/4/0       57.7 ns         57.7 ns     11213824 bytes_per_cycle=0.561557/s bytes_per_second=803.314M/s items_per_second=17.3228M/s __llvm_libc::memcpy,memcpy Google M
BM_Memcpy/5/0       48.0 ns         47.9 ns     16437248 bytes_per_cycle=0.346708/s bytes_per_second=495.97M/s items_per_second=20.8571M/s __llvm_libc::memcpy,memcpy Google Q
BM_Memcpy/6/0       67.5 ns         67.5 ns     10616832 bytes_per_cycle=0.614173/s bytes_per_second=878.582M/s items_per_second=14.8142M/s __llvm_libc::memcpy,memcpy Google S
BM_Memcpy/7/0       84.7 ns         84.6 ns     10480640 bytes_per_cycle=0.819077/s bytes_per_second=1.14424G/s items_per_second=11.8174M/s __llvm_libc::memcpy,memcpy Google U
BM_Memcpy/8/0       61.7 ns         61.6 ns     11191296 bytes_per_cycle=0.550078/s bytes_per_second=786.893M/s items_per_second=16.2279M/s __llvm_libc::memcpy,memcpy Google W
BM_Memcpy/9/0        981 ns          981 ns       703488 bytes_per_cycle=1.52333/s bytes_per_second=2.12807G/s items_per_second=1019.81k/s __llvm_libc::memcpy,uniform 384 to 4096
```
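
Computed from the two tables above: the small "Google A" distribution drops from 474 ns to 126 ns (about 3.8x faster), and the large "uniform 384 to 4096" distribution drops from 6036 ns to 981 ns (about 6.2x faster).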

It is not yet as good as glibc, so there is room for improvement. Given the roughly doubled throughput for large copies, I suspect glibc uses a path that pumps 16 bytes at once; a hypothetical sketch of such a loop follows the glibc numbers below.
```
BM_Memcpy/0/1        146 ns         82.5 ns      8576000 bytes_per_cycle=1.35236/s bytes_per_second=1.88922G/s items_per_second=12.1169M/s glibc memcpy,memcpy Google A
BM_Memcpy/1/1        112 ns         63.7 ns     10634240 bytes_per_cycle=0.628018/s bytes_per_second=898.387M/s items_per_second=15.702M/s glibc memcpy,memcpy Google B
BM_Memcpy/2/1        315 ns          180 ns      4079616 bytes_per_cycle=2.65229/s bytes_per_second=3.7052G/s items_per_second=5.54764M/s glibc memcpy,memcpy Google D
BM_Memcpy/3/1       85.3 ns         43.1 ns     15854592 bytes_per_cycle=0.774164/s bytes_per_second=1107.45M/s items_per_second=23.2249M/s glibc memcpy,memcpy Google L
BM_Memcpy/4/1        105 ns         54.3 ns     13427712 bytes_per_cycle=0.7793/s bytes_per_second=1114.8M/s items_per_second=18.4109M/s glibc memcpy,memcpy Google M
BM_Memcpy/5/1       77.1 ns         43.2 ns     16476160 bytes_per_cycle=0.279808/s bytes_per_second=400.269M/s items_per_second=23.1428M/s glibc memcpy,memcpy Google Q
BM_Memcpy/6/1        112 ns         62.7 ns     11236352 bytes_per_cycle=0.676078/s bytes_per_second=967.137M/s items_per_second=15.9387M/s glibc memcpy,memcpy Google S
BM_Memcpy/7/1        131 ns         65.5 ns     11751424 bytes_per_cycle=0.965616/s bytes_per_second=1.34895G/s items_per_second=15.2762M/s glibc memcpy,memcpy Google U
BM_Memcpy/8/1        104 ns         55.0 ns     12314624 bytes_per_cycle=0.583336/s bytes_per_second=834.468M/s items_per_second=18.1937M/s glibc memcpy,memcpy Google W
BM_Memcpy/9/1        932 ns          466 ns      1480704 bytes_per_cycle=3.17342/s bytes_per_second=4.43321G/s items_per_second=2.14679M/s glibc memcpy,uniform 384 to 4096
```
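
The 16-byte path is speculation on my part. As a purely hypothetical illustration (assuming the head/tail alignment handling shown earlier already ran, so both accesses per pair are 8-byte aligned), such an inner loop could look like this:
```
#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical sketch of the suspected wide path: two independent 64-bit
// loads and stores per iteration, giving the core a chance to overlap them.
// Not part of this patch.
void copy_16_bytes_per_iteration(char *dst, const char *src, size_t count) {
  size_t offset = 0;
  for (; offset + 16 <= count; offset += 16) {
    uint64_t a, b;
    std::memcpy(&a, src + offset, 8);     // first aligned u64 load
    std::memcpy(&b, src + offset + 8, 8); // second aligned u64 load
    std::memcpy(dst + offset, &a, 8);     // first aligned u64 store
    std::memcpy(dst + offset + 8, &b, 8); // second aligned u64 store
  }
  for (; offset < count; ++offset) // tail, as before
    dst[offset] = src[offset];
}
```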

Reviewed By: sivachandra

Differential Revision: https://reviews.llvm.org/D150202

Added: 
    

Modified: 
    libc/src/__support/macros/properties/architectures.h
    libc/src/string/memory_utils/CMakeLists.txt
    libc/src/string/memory_utils/memcpy_implementations.h
    libc/src/string/memory_utils/utils.h
    libc/test/src/string/memory_utils/utils_test.cpp

Removed: 
    


################################################################################
diff --git a/libc/src/__support/macros/properties/architectures.h b/libc/src/__support/macros/properties/architectures.h
index 1247fd6ef5cbc..66bb6fb68a642 100644
--- a/libc/src/__support/macros/properties/architectures.h
+++ b/libc/src/__support/macros/properties/architectures.h
@@ -49,6 +49,10 @@
 #define LIBC_TARGET_ARCH_IS_RISCV64
 #endif
 
+#if defined(__riscv) && (__riscv_xlen == 32)
+#define LIBC_TARGET_ARCH_IS_RISCV32
+#endif
+
 #if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM))
 #define LIBC_TARGET_ARCH_IS_ANY_ARM
 #endif

diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 31335227f4ab3..7bb0e960ee13d 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -18,6 +18,7 @@ add_header_library(
     x86_64/memcmp_implementations.h
     x86_64/memcpy_implementations.h
   DEPS
+    libc.src.__support.common
     libc.src.__support.CPP.bit
     libc.src.__support.CPP.cstddef
     libc.src.__support.CPP.type_traits

diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
index b1b60ff590d50..b22606781703f 100644
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -26,24 +26,79 @@
 namespace __llvm_libc {
 
 [[maybe_unused]] LIBC_INLINE void
-inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
-                            size_t count) {
+inline_memcpy_byte_per_byte(Ptr dst, CPtr src, size_t offset, size_t count) {
   LIBC_LOOP_NOUNROLL
-  for (size_t offset = 0; offset < count; ++offset)
-    builtin::Memcpy<1>::block(dst + offset, src + offset);
+  for (; offset < count; ++offset)
+    dst[offset] = src[offset];
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_aligned_access_32bit(Ptr __restrict dst, CPtr __restrict src,
+                                   size_t count) {
+  constexpr size_t kAlign = sizeof(uint32_t);
+  if (count <= 2 * kAlign)
+    return inline_memcpy_byte_per_byte(dst, src, 0, count);
+  size_t bytes_to_dst_align = distance_to_align_up<kAlign>(dst);
+  inline_memcpy_byte_per_byte(dst, src, 0, bytes_to_dst_align);
+  size_t offset = bytes_to_dst_align;
+  size_t src_alignment = distance_to_align_down<kAlign>(src + offset);
+  for (; offset < count - kAlign; offset += kAlign) {
+    uint32_t value;
+    if (src_alignment == 0)
+      value = load32_aligned<uint32_t>(src, offset);
+    else if (src_alignment == 2)
+      value = load32_aligned<uint16_t, uint16_t>(src, offset);
+    else
+      value = load32_aligned<uint8_t, uint16_t, uint8_t>(src, offset);
+    store32_aligned<uint32_t>(value, dst, offset);
+  }
+  // remainder
+  inline_memcpy_byte_per_byte(dst, src, offset, count);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_aligned_access_64bit(Ptr __restrict dst, CPtr __restrict src,
+                                   size_t count) {
+  constexpr size_t kAlign = sizeof(uint64_t);
+  if (count <= 2 * kAlign)
+    return inline_memcpy_byte_per_byte(dst, src, 0, count);
+  size_t bytes_to_dst_align = distance_to_align_up<kAlign>(dst);
+  inline_memcpy_byte_per_byte(dst, src, 0, bytes_to_dst_align);
+  size_t offset = bytes_to_dst_align;
+  size_t src_alignment = distance_to_align_down<kAlign>(src + offset);
+  for (; offset < count - kAlign; offset += kAlign) {
+    uint64_t value;
+    if (src_alignment == 0)
+      value = load64_aligned<uint64_t>(src, offset);
+    else if (src_alignment == 4)
+      value = load64_aligned<uint32_t, uint32_t>(src, offset);
+    else if (src_alignment == 2)
+      value =
+          load64_aligned<uint16_t, uint16_t, uint16_t, uint16_t>(src, offset);
+    else
+      value = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
+          src, offset);
+    store64_aligned<uint64_t>(value, dst, offset);
+  }
+  // remainder
+  inline_memcpy_byte_per_byte(dst, src, offset, count);
 }
 
 LIBC_INLINE void inline_memcpy(Ptr __restrict dst, CPtr __restrict src,
                                size_t count) {
   using namespace __llvm_libc::builtin;
 #if defined(LIBC_COPT_MEMCPY_USE_EMBEDDED_TINY)
-  return inline_memcpy_embedded_tiny(dst, src, count);
+  return inline_memcpy_byte_per_byte(dst, src, 0, count);
 #elif defined(LIBC_TARGET_ARCH_IS_X86)
   return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
   return inline_memcpy_aarch64(dst, src, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV64)
+  return inline_memcpy_aligned_access_64bit(dst, src, count);
+#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
+  return inline_memcpy_aligned_access_32bit(dst, src, count);
 #else
-  return inline_memcpy_embedded_tiny(dst, src, count);
+  return inline_memcpy_byte_per_byte(dst, src, 0, count);
 #endif
 }
 

diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 5c7b360ad108f..ab33331847afe 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -12,8 +12,9 @@
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/cstddef.h"
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/macros/attributes.h"          //LIBC_INLINE
-#include "src/__support/macros/config.h"              // LIBC_HAS_BUILTIN
+#include "src/__support/endian.h"
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
 
 #include <stddef.h> // size_t
 #include <stdint.h> // intptr_t / uintptr_t
@@ -97,8 +98,15 @@ LIBC_INLINE void memcpy_inline(void *__restrict dst,
 #ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
   __builtin_memcpy_inline(dst, src, Size);
 #else
+// In memory functions `memcpy_inline` is instantiated several times with
+// different value of the Size parameter. This doesn't play well with GCC's
+// Value Range Analysis that wrongly detects out of bounds accesses. We disable
+// the 'array-bounds' warning for the purpose of this function.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
   for (size_t i = 0; i < Size; ++i)
     static_cast<char *>(dst)[i] = static_cast<const char *>(src)[i];
+#pragma GCC diagnostic pop
 #endif
 }
 
@@ -153,6 +161,81 @@ template <typename T> LIBC_INLINE void store(Ptr ptr, T value) {
   memcpy_inline<sizeof(T)>(ptr, &value);
 }
 
+// On architectures that do not allow for unaligned access we perform several
+// aligned accesses and recombine them through shifts and logical operations.
+// For instance, if we know that the pointer is 2-byte aligned we can decompose
+// a 64-bit operation into four 16-bit operations.
+
+// Loads a 'ValueType' by decomposing it into several loads that are assumed to
+// be aligned.
+// e.g. load_aligned<uint32_t, uint16_t, uint16_t>(ptr);
+template <typename ValueType, typename T, typename... TS>
+ValueType load_aligned(CPtr src) {
+  static_assert(sizeof(ValueType) >= (sizeof(T) + ... + sizeof(TS)));
+  const ValueType value = load<T>(assume_aligned<sizeof(T)>(src));
+  if constexpr (sizeof...(TS) > 0) {
+    constexpr size_t shift = sizeof(T) * 8;
+    const ValueType next = load_aligned<ValueType, TS...>(src + sizeof(T));
+    if constexpr (Endian::IS_LITTLE)
+      return value | (next << shift);
+    else if constexpr (Endian::IS_BIG)
+      return (value << shift) | next;
+    else
+      deferred_static_assert("Invalid endianness");
+  } else {
+    return value;
+  }
+}
+
+// Alias for loading a 'uint32_t'.
+template <typename T, typename... TS>
+auto load32_aligned(CPtr src, size_t offset) {
+  static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint32_t));
+  return load_aligned<uint32_t, T, TS...>(src + offset);
+}
+
+// Alias for loading a 'uint64_t'.
+template <typename T, typename... TS>
+auto load64_aligned(CPtr src, size_t offset) {
+  static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint64_t));
+  return load_aligned<uint64_t, T, TS...>(src + offset);
+}
+
+// Stores a 'ValueType' by decomposing it into several stores that are assumed
+// to be aligned.
+// e.g. store_aligned<uint32_t, uint16_t, uint16_t>(value, ptr);
+template <typename ValueType, typename T, typename... TS>
+void store_aligned(ValueType value, Ptr dst) {
+  static_assert(sizeof(ValueType) >= (sizeof(T) + ... + sizeof(TS)));
+  constexpr size_t shift = sizeof(T) * 8;
+  if constexpr (Endian::IS_LITTLE) {
+    store<T>(assume_aligned<sizeof(T)>(dst), value & ~T(0));
+    if constexpr (sizeof...(TS) > 0)
+      store_aligned<ValueType, TS...>(value >> shift, dst + sizeof(T));
+  } else if constexpr (Endian::IS_BIG) {
+    constexpr size_t OFFSET = (0 + ... + sizeof(TS));
+    store<T>(assume_aligned<sizeof(T)>(dst + OFFSET), value & ~T(0));
+    if constexpr (sizeof...(TS) > 0)
+      store_aligned<ValueType, TS...>(value >> shift, dst);
+  } else {
+    deferred_static_assert("Invalid endianness");
+  }
+}
+
+// Alias for storing a 'uint32_t'.
+template <typename T, typename... TS>
+void store32_aligned(uint32_t value, Ptr dst, size_t offset) {
+  static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint32_t));
+  store_aligned<uint32_t, T, TS...>(value, dst + offset);
+}
+
+// Alias for storing a 'uint64_t'.
+template <typename T, typename... TS>
+void store64_aligned(uint64_t value, Ptr dst, size_t offset) {
+  static_assert((sizeof(T) + ... + sizeof(TS)) == sizeof(uint64_t));
+  store_aligned<uint64_t, T, TS...>(value, dst + offset);
+}
+
 // Advances the pointers p1 and p2 by offset bytes and decrease count by the
 // same amount.
 template <typename T1, typename T2>

diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
index 3f8ce5da72aaa..37d61d84c53b6 100644
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -144,4 +144,44 @@ TEST(LlvmLibcUtilsTest, Align2) {
   }
 }
 
+TEST(LlvmLibcUtilsTest, LoadStoreAligned) {
+  const uint64_t init = 0xDEAD'C0DE'BEEF'F00D;
+  CPtr const src = reinterpret_cast<CPtr>(&init);
+  uint64_t store;
+  Ptr const dst = reinterpret_cast<Ptr>(&store);
+
+  using LoadFun = uint64_t (*)(CPtr);
+  using StoreFun = void (*)(uint64_t, Ptr);
+
+  {
+    LoadFun ld = load_aligned<uint64_t, uint64_t>;
+    StoreFun st = store_aligned<uint64_t, uint64_t>;
+    const uint64_t loaded = ld(src);
+    EXPECT_EQ(init, loaded);
+    store = 0;
+    st(init, dst);
+    EXPECT_EQ(init, store);
+  }
+
+  {
+    LoadFun ld = load_aligned<uint64_t, uint32_t, uint32_t>;
+    StoreFun st = store_aligned<uint64_t, uint32_t, uint32_t>;
+    const uint64_t loaded = ld(src);
+    EXPECT_EQ(init, loaded);
+    store = 0;
+    st(init, dst);
+    EXPECT_EQ(init, store);
+  }
+
+  {
+    LoadFun ld = load_aligned<uint64_t, uint32_t, uint16_t, uint8_t, uint8_t>;
+    StoreFun st = store_aligned<uint64_t, uint32_t, uint16_t, uint8_t, uint8_t>;
+    const uint64_t loaded = ld(src);
+    EXPECT_EQ(init, loaded);
+    store = 0;
+    st(init, dst);
+    EXPECT_EQ(init, store);
+  }
+}
+
 } // namespace __llvm_libc

