[libc-commits] [libc] c2ce8f0 - [libc][NFC] Allow memcpy to be inlined

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Thu Nov 4 06:47:30 PDT 2021


Author: Guillaume Chatelet
Date: 2021-11-04T13:47:19Z
New Revision: c2ce8f02ed948195d0275c3e34e7a6f4a40901e6

URL: https://github.com/llvm/llvm-project/commit/c2ce8f02ed948195d0275c3e34e7a6f4a40901e6
DIFF: https://github.com/llvm/llvm-project/commit/c2ce8f02ed948195d0275c3e34e7a6f4a40901e6.diff

LOG: [libc][NFC] Allow memcpy to be inlined

This allows shipping individual functions without also having to provide
`memcpy` at the expense of bigger functions.
Next is to use this `inlined_memcpy` in:
 - loader/linux/x86_64/start.cpp
 - src/string/memmove.cpp
 - src/string/mempcpy.cpp
 - src/string/strcpy.cpp
 - src/string/strdup.cpp
 - src/string/strndup.cpp

Differential Revision: https://reviews.llvm.org/D113097

Added: 
    libc/src/string/memory_utils/memcpy_implementations.h

Modified: 
    libc/src/string/CMakeLists.txt
    libc/src/string/memcpy.cpp

Removed: 
    libc/src/string/aarch64/memcpy.cpp
    libc/src/string/x86_64/memcpy.cpp


################################################################################
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 0fb50b6c8c353..c7cdcaa913059 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -357,7 +357,7 @@ endif()
 
 function(add_memcpy memcpy_name)
   add_implementation(memcpy ${memcpy_name}
-    SRCS ${MEMCPY_SRC}
+    SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
     HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
     DEPENDS
       .memory_utils.memory_utils
@@ -369,7 +369,6 @@ function(add_memcpy memcpy_name)
 endfunction()
 
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/x86_64/memcpy.cpp)
   add_memcpy(memcpy_x86_64_opt_sse2   COMPILE_OPTIONS -march=k8             REQUIRE SSE2)
   add_memcpy(memcpy_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
   add_memcpy(memcpy_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
@@ -377,14 +376,12 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memcpy.cpp)
   # Disable tail merging as it leads to lower performance.
   # Note that '-mllvm' needs to be prefixed with 'SHELL:' to prevent CMake flag deduplication.
   add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}
                                       COMPILE_OPTIONS "SHELL:-mllvm --tail-merge-threshold=0")
   add_memcpy(memcpy                   COMPILE_OPTIONS "SHELL:-mllvm --tail-merge-threshold=0")
 else()
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp)
   add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
 endif()

diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp
deleted file mode 100644
index 1a1fbbc026a73..0000000000000
--- a/libc/src/string/aarch64/memcpy.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===-- Implementation of memcpy ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memcpy.h"
-#include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
-
-namespace __llvm_libc {
-
-using _1 = scalar::UINT8;
-using _2 = scalar::UINT16;
-using _3 = Chained<scalar::UINT16, scalar::UINT8>;
-using _4 = scalar::UINT32;
-using _8 = scalar::UINT64;
-using _16 = Repeated<scalar::UINT64, 2>;
-using _32 = Repeated<scalar::UINT64, 4>;
-using _64 = Repeated<scalar::UINT64, 8>;
-
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// We have used __builtin_expect to tell the compiler to favour lower sizes as
-// that will reduce the branching overhead where that would hurt most
-// proportional to total cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-// This implementation has been tuned for Neoverse-N1.
-static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
-                           size_t count) {
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<_1>(dst, src);
-  if (count == 2)
-    return Copy<_2>(dst, src);
-  if (count == 3)
-    return Copy<_3>(dst, src);
-  if (count == 4)
-    return Copy<_4>(dst, src);
-  if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
-  if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
-  if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
-  if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
-  if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
-  return Copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
-}
-
-LLVM_LIBC_FUNCTION(void *, memcpy,
-                   (void *__restrict dst, const void *__restrict src,
-                    size_t size)) {
-  memcpy_aarch64(reinterpret_cast<char *>(dst),
-                 reinterpret_cast<const char *>(src), size);
-  return dst;
-}
-
-} // namespace __llvm_libc

diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
index 5e70e00db1b91..ff990f48a20bc 100644
--- a/libc/src/string/memcpy.cpp
+++ b/libc/src/string/memcpy.cpp
@@ -8,61 +8,15 @@
 
 #include "src/string/memcpy.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/memcpy_implementations.h"
 
 namespace __llvm_libc {
 
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-static void memcpy_impl(char *__restrict dst, const char *__restrict src,
-                        size_t count) {
-  // Use scalar strategies (_1, _2, _3 ...)
-  using namespace __llvm_libc::scalar;
-
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<_1>(dst, src);
-  if (count == 2)
-    return Copy<_2>(dst, src);
-  if (count == 3)
-    return Copy<_3>(dst, src);
-  if (count == 4)
-    return Copy<_4>(dst, src);
-  if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
-  if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
-  if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
-  if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
-  if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
-  return Copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
-}
-
 LLVM_LIBC_FUNCTION(void *, memcpy,
                    (void *__restrict dst, const void *__restrict src,
                     size_t size)) {
-  memcpy_impl(reinterpret_cast<char *>(dst),
-              reinterpret_cast<const char *>(src), size);
+  inline_memcpy(reinterpret_cast<char *>(dst),
+                reinterpret_cast<const char *>(src), size);
   return dst;
 }
 

diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
new file mode 100644
index 0000000000000..2a738f7ecf1d3
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -0,0 +1,157 @@
+//===-- Memcpy implementation -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
+
+#include "src/__support/architectures.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/utils.h"
+
+#include <stddef.h> // size_t
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+
+namespace __llvm_libc {
+
+static inline void inline_memcpy(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_X86
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::x86;
+
+  // Whether to use only rep;movsb.
+  constexpr bool kUseOnlyRepMovsb =
+      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
+
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize ==  0 : Only RepMovsb is used.
+  // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize =
+#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
+      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+#else
+      -1;
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+
+  // Whether target supports AVX instructions.
+  constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
+
+#if defined(__AVX__)
+  using LoopBlockSize = _64;
+#else
+  using LoopBlockSize = _32;
+#endif
+
+  if (kUseOnlyRepMovsb)
+    return Copy<Accelerator>(dst, src, count);
+
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<_1>(dst, src);
+  if (count == 2)
+    return Copy<_2>(dst, src);
+  if (count == 3)
+    return Copy<_3>(dst, src);
+  if (count == 4)
+    return Copy<_4>(dst, src);
+  if (count < 8)
+    return Copy<HeadTail<_4>>(dst, src, count);
+  if (count < 16)
+    return Copy<HeadTail<_8>>(dst, src, count);
+  if (count < 32)
+    return Copy<HeadTail<_16>>(dst, src, count);
+  if (count < 64)
+    return Copy<HeadTail<_32>>(dst, src, count);
+  if (count < 128)
+    return Copy<HeadTail<_64>>(dst, src, count);
+  if (kHasAvx && count < 256)
+    return Copy<HeadTail<_128>>(dst, src, count);
+  if (count <= kRepMovsBSize)
+    return Copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
+                                                                 count);
+  return Copy<Accelerator>(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_AARCH64
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::scalar;
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<_1>(dst, src);
+  if (count == 2)
+    return Copy<_2>(dst, src);
+  if (count == 3)
+    return Copy<_3>(dst, src);
+  if (count == 4)
+    return Copy<_4>(dst, src);
+  if (count < 8)
+    return Copy<HeadTail<_4>>(dst, src, count);
+  if (count < 16)
+    return Copy<HeadTail<_8>>(dst, src, count);
+  if (count < 32)
+    return Copy<HeadTail<_16>>(dst, src, count);
+  if (count < 64)
+    return Copy<HeadTail<_32>>(dst, src, count);
+  if (count < 128)
+    return Copy<HeadTail<_64>>(dst, src, count);
+  return Copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
+#else
+  /////////////////////////////////////////////////////////////////////////////
+  // Default
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::scalar;
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<_1>(dst, src);
+  if (count == 2)
+    return Copy<_2>(dst, src);
+  if (count == 3)
+    return Copy<_3>(dst, src);
+  if (count == 4)
+    return Copy<_4>(dst, src);
+  if (count < 8)
+    return Copy<HeadTail<_4>>(dst, src, count);
+  if (count < 16)
+    return Copy<HeadTail<_8>>(dst, src, count);
+  if (count < 32)
+    return Copy<HeadTail<_16>>(dst, src, count);
+  if (count < 64)
+    return Copy<HeadTail<_32>>(dst, src, count);
+  if (count < 128)
+    return Copy<HeadTail<_64>>(dst, src, count);
+  return Copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
+#endif
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H

diff --git a/libc/src/string/x86_64/memcpy.cpp b/libc/src/string/x86_64/memcpy.cpp
deleted file mode 100644
index 7f6e5b64b3a74..0000000000000
--- a/libc/src/string/x86_64/memcpy.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-//===-- Implementation of memcpy ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memcpy.h"
-#include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
-
-namespace __llvm_libc {
-
-// Whether to use only rep;movsb.
-constexpr bool kUseOnlyRepMovsb =
-    LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
-
-// kRepMovsBSize == -1 : Only CopyAligned is used.
-// kRepMovsBSize ==  0 : Only RepMovsb is used.
-// else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
-constexpr size_t kRepMovsBSize =
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-    LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
-#else
-    -1;
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
-// Whether target supports AVX instructions.
-constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
-
-#ifdef __AVX__
-using LoopBlockSize = __llvm_libc::x86::_64;
-#else
-using LoopBlockSize = __llvm_libc::x86::_32;
-#endif
-
-static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
-                         size_t count) {
-  // FIXME: Add MSVC support with
-  // #include <intrin.h>
-  // __movsb(reinterpret_cast<unsigned char *>(dst),
-  //         reinterpret_cast<const unsigned char *>(src), count);
-  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
-}
-
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-static void memcpy_x86(char *__restrict dst, const char *__restrict src,
-                       size_t count) {
-  // Use x86 strategies (_1, _2, _3 ...)
-  using namespace __llvm_libc::x86;
-
-  if (kUseOnlyRepMovsb)
-    return CopyRepMovsb(dst, src, count);
-
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<_1>(dst, src);
-  if (count == 2)
-    return Copy<_2>(dst, src);
-  if (count == 3)
-    return Copy<_3>(dst, src);
-  if (count == 4)
-    return Copy<_4>(dst, src);
-  if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
-  if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
-  if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
-  if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
-  if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
-  if (kHasAvx && count < 256)
-    return Copy<HeadTail<_128>>(dst, src, count);
-  if (count <= kRepMovsBSize)
-    return Copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
-                                                                 count);
-  return CopyRepMovsb(dst, src, count);
-}
-
-LLVM_LIBC_FUNCTION(void *, memcpy,
-                   (void *__restrict dst, const void *__restrict src,
-                    size_t size)) {
-  memcpy_x86(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src),
-             size);
-  return dst;
-}
-
-} // namespace __llvm_libc


        


More information about the libc-commits mailing list