[libc-commits] [libc] 0d52a7d - [libc][NFC] Simplify memcpy implementation

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Tue May 26 04:39:05 PDT 2020


Author: Guillaume Chatelet
Date: 2020-05-26T11:38:48Z
New Revision: 0d52a7d038e189770984594a6ca71bea50fee4d9

URL: https://github.com/llvm/llvm-project/commit/0d52a7d038e189770984594a6ca71bea50fee4d9
DIFF: https://github.com/llvm/llvm-project/commit/0d52a7d038e189770984594a6ca71bea50fee4d9.diff

LOG: [libc][NFC] Simplify memcpy implementation

Summary: This is an NFC; it aims at simplifying both the code and the build files.

Reviewers: abrachet, sivachandra

Subscribers: mgorny, tschuett, ecnelises, libc-commits, courbet

Tags: #libc-project

Differential Revision: https://reviews.llvm.org/D80291

Added: 
    libc/src/string/x86/memcpy.cpp

Modified: 
    libc/src/string/CMakeLists.txt
    libc/src/string/memory_utils/memcpy_utils.h
    libc/test/src/string/memory_utils/memcpy_utils_test.cpp

Removed: 
    libc/src/string/memcpy.cpp
    libc/src/string/memcpy_arch_specific.h.def
    libc/src/string/x86/memcpy_arch_specific.h.inc


################################################################################
diff  --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 3fe0d861aea3..cd3a9b5f77b3 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -68,30 +68,17 @@ endfunction()
 
 # include the relevant architecture specific implementations
 if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
-  set(LIBC_STRING_TARGET_FOLDER "x86")
+  set(LIBC_STRING_TARGET_ARCH "x86")
 else()
-  set(LIBC_STRING_TARGET_FOLDER ${LIBC_TARGET_MACHINE})
+  set(LIBC_STRING_TARGET_ARCH ${LIBC_TARGET_MACHINE})
 endif()
 
-add_gen_header(
-  memcpy_arch_specific
-  DEF_FILE
-    memcpy_arch_specific.h.def
-  GEN_HDR
-    memcpy_arch_specific.h
-  PARAMS
-    memcpy_arch_specific=${LIBC_STRING_TARGET_FOLDER}/memcpy_arch_specific.h.inc
-  DATA_FILES
-    ${LIBC_STRING_TARGET_FOLDER}/memcpy_arch_specific.h.inc
-)
-
 function(add_memcpy memcpy_name)
   add_implementation(memcpy ${memcpy_name}
-    SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
+    SRCS ${LIBC_SOURCE_DIR}/src/string/${LIBC_STRING_TARGET_ARCH}/memcpy.cpp
     HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
     DEPENDS
       .memory_utils.memory_utils
-      .memcpy_arch_specific
       libc.include.string
     COMPILE_OPTIONS
       -fno-builtin-memcpy
@@ -144,4 +131,4 @@ add_bzero(bzero MARCH native)
 # Add all other relevant implementations for the native target.
 # ------------------------------------------------------------------------------
 
-include(${LIBC_STRING_TARGET_FOLDER}/CMakeLists.txt)
+include(${LIBC_STRING_TARGET_ARCH}/CMakeLists.txt)

diff  --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
deleted file mode 100644
index 2dee707bdc4e..000000000000
--- a/libc/src/string/memcpy.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===-- Implementation of memcpy ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memcpy.h"
-#include "src/__support/common.h"
-#include "src/string/memcpy_arch_specific.h"
-
-namespace __llvm_libc {
-
-void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
-                                   const void *__restrict src, size_t size) {
-  memcpy_no_return(reinterpret_cast<char *>(dst),
-                   reinterpret_cast<const char *>(src), size);
-  return dst;
-}
-
-} // namespace __llvm_libc

diff  --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
deleted file mode 100644
index 8b991e804000..000000000000
--- a/libc/src/string/memcpy_arch_specific.h.def
+++ /dev/null
@@ -1,65 +0,0 @@
-//===-- Implementation of arch specific memcpy ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
-
-%%include_file(${memcpy_arch_specific})
-
-namespace __llvm_libc {
-
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
-                             size_t count) {
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<1>(dst, src);
-  if (count == 2)
-    return Copy<2>(dst, src);
-  if (count == 3)
-    return Copy<3>(dst, src);
-  if (count == 4)
-    return Copy<4>(dst, src);
-  if (count < 8)
-    return CopyOverlap<4>(dst, src, count);
-  if (count == 8)
-    return Copy<8>(dst, src);
-  if (count < 16)
-    return CopyOverlap<8>(dst, src, count);
-  if (count == 16)
-    return Copy<16>(dst, src);
-  if (count < 32)
-    return CopyOverlap<16>(dst, src, count);
-  if (count < 64)
-    return CopyOverlap<32>(dst, src, count);
-  if (count < 128)
-    return CopyOverlap<64>(dst, src, count);
-  CopyGE128(dst, src, count);
-}
-
-} // namespace __llvm_libc
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H

diff  --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
index 09e379393cf2..a0e5ccc81c9e 100644
--- a/libc/src/string/memory_utils/memcpy_utils.h
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -32,7 +32,7 @@ extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
 
 // Copies `kBlockSize` bytes from `src` to `dst`.
 template <size_t kBlockSize>
-static void Copy(char *__restrict dst, const char *__restrict src) {
+static void CopyBlock(char *__restrict dst, const char *__restrict src) {
 #if defined(LLVM_LIBC_MEMCPY_MONITOR)
   LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
 #elif defined(USE_BUILTIN_MEMCPY_INLINE)
@@ -52,7 +52,7 @@ template <size_t kBlockSize>
 static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
                           size_t count) {
   const size_t offset = count - kBlockSize;
-  Copy<kBlockSize>(dst + offset, src + offset);
+  CopyBlock<kBlockSize>(dst + offset, src + offset);
 }
 
 // Copies `kBlockSize` bytes twice with an overlap between the two.
@@ -64,9 +64,9 @@ static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
 //
 // Precondition: `count >= kBlockSize && count <= 2 * kBlockSize`.
 template <size_t kBlockSize>
-static void CopyOverlap(char *__restrict dst, const char *__restrict src,
-                        size_t count) {
-  Copy<kBlockSize>(dst, src);
+static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  CopyBlock<kBlockSize>(dst, src);
   CopyLastBlock<kBlockSize>(dst, src, count);
 }
 
@@ -85,14 +85,14 @@ static void CopyOverlap(char *__restrict dst, const char *__restrict src,
 // Precondition: `count > 2 * kBlockSize` for efficiency.
 //               `count >= kBlockSize` for correctness.
 template <size_t kBlockSize>
-static void CopyAligned(char *__restrict dst, const char *__restrict src,
-                        size_t count) {
-  Copy<kBlockSize>(dst, src); // Copy first block
+static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src,
+                              size_t count) {
+  CopyBlock<kBlockSize>(dst, src); // Copy first block
 
   // Copy aligned blocks
   size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
   for (; offset + kBlockSize < count; offset += kBlockSize)
-    Copy<kBlockSize>(dst + offset, src + offset);
+    CopyBlock<kBlockSize>(dst + offset, src + offset);
 
   CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
 }

diff  --git a/libc/src/string/x86/memcpy.cpp b/libc/src/string/x86/memcpy.cpp
new file mode 100644
index 000000000000..811ce5183fe4
--- /dev/null
+++ b/libc/src/string/x86/memcpy.cpp
@@ -0,0 +1,94 @@
+//===-- Implementation of memcpy ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC support with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+#if defined(__AVX__)
+#define BEST_SIZE 64
+#else
+#define BEST_SIZE 32
+#endif
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+static void memcpy_x86(char *__restrict dst, const char *__restrict src,
+                       size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return CopyBlock<1>(dst, src);
+  if (count == 2)
+    return CopyBlock<2>(dst, src);
+  if (count == 3)
+    return CopyBlock<3>(dst, src);
+  if (count == 4)
+    return CopyBlock<4>(dst, src);
+  if (count < 8)
+    return CopyBlockOverlap<4>(dst, src, count);
+  if (count == 8)
+    return CopyBlock<8>(dst, src);
+  if (count < 16)
+    return CopyBlockOverlap<8>(dst, src, count);
+  if (count == 16)
+    return CopyBlock<16>(dst, src);
+  if (count < 32)
+    return CopyBlockOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyBlockOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyBlockOverlap<64>(dst, src, count);
+#if defined(__AVX__)
+  if (count < 256)
+    return CopyBlockOverlap<128>(dst, src, count);
+#endif
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize ==  0 : Only RepMovsb is used.
+  // else CopyAlignedBlocks is used up to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAlignedBlocks<BEST_SIZE>(dst, src, count);
+  return CopyRepMovsb(dst, src, count);
+}
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_x86(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src),
+             size);
+  return dst;
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/src/string/x86/memcpy_arch_specific.h.inc b/libc/src/string/x86/memcpy_arch_specific.h.inc
deleted file mode 100644
index 60610d4c73d2..000000000000
--- a/libc/src/string/x86/memcpy_arch_specific.h.inc
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "src/string/memory_utils/memcpy_utils.h"
-
-namespace __llvm_libc {
-
-static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
-                         size_t count) {
-  // FIXME: Add MSVC support with
-  // #include <intrin.h>
-  // __movsb(reinterpret_cast<unsigned char *>(dst),
-  //         reinterpret_cast<const unsigned char *>(src), count);
-  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
-}
-
-#if defined(__AVX__)
-#define BEST_SIZE 64
-#else
-#define BEST_SIZE 32
-#endif
-
-static void CopyGE128(char *__restrict dst, const char *__restrict src,
-                      size_t count) {
-#if defined(__AVX__)
-  if (count < 256)
-    return CopyOverlap<128>(dst, src, count);
-#endif
-  // kRepMovsBSize == -1 : Only CopyAligned is used.
-  // kRepMovsBSize ==  0 : Only RepMovsb is used.
-  // else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
-  constexpr size_t kRepMovsBSize = -1;
-  if (count <= kRepMovsBSize)
-    return CopyAligned<BEST_SIZE>(dst, src, count);
-  CopyRepMovsb(dst, src, count);
-}
-
-} // namespace __llvm_libc

diff  --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
index 491c632216b7..7e32fb4f3080 100644
--- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -83,37 +83,37 @@ TEST(MemcpyUtilsTest, CopyTrivial) {
   auto &trace = GetTrace();
 
   trace.Clear();
-  Copy<1>(I(0), I(0));
+  CopyBlock<1>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1");
   EXPECT_STREQ(trace.Read(), "1");
 
   trace.Clear();
-  Copy<2>(I(0), I(0));
+  CopyBlock<2>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11");
   EXPECT_STREQ(trace.Read(), "11");
 
   trace.Clear();
-  Copy<4>(I(0), I(0));
+  CopyBlock<4>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1111");
   EXPECT_STREQ(trace.Read(), "1111");
 
   trace.Clear();
-  Copy<8>(I(0), I(0));
+  CopyBlock<8>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11111111");
   EXPECT_STREQ(trace.Read(), "11111111");
 
   trace.Clear();
-  Copy<16>(I(0), I(0));
+  CopyBlock<16>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1111111111111111");
   EXPECT_STREQ(trace.Read(), "1111111111111111");
 
   trace.Clear();
-  Copy<32>(I(0), I(0));
+  CopyBlock<32>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
   EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
 
   trace.Clear();
-  Copy<64>(I(0), I(0));
+  CopyBlock<64>(I(0), I(0));
   EXPECT_STREQ(
       trace.Write(),
       "1111111111111111111111111111111111111111111111111111111111111111");
@@ -126,41 +126,41 @@ TEST(MemcpyUtilsTest, CopyOffset) {
   auto &trace = GetTrace();
 
   trace.Clear();
-  Copy<1>(I(3), I(1));
+  CopyBlock<1>(I(3), I(1));
   EXPECT_STREQ(trace.Write(), "0001");
   EXPECT_STREQ(trace.Read(), "01");
 
   trace.Clear();
-  Copy<1>(I(2), I(1));
+  CopyBlock<1>(I(2), I(1));
   EXPECT_STREQ(trace.Write(), "001");
   EXPECT_STREQ(trace.Read(), "01");
 }
 
-TEST(MemcpyUtilsTest, CopyOverlap) {
+TEST(MemcpyUtilsTest, CopyBlockOverlap) {
   auto &trace = GetTrace();
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 2);
+  CopyBlockOverlap<2>(I(0), I(0), 2);
   EXPECT_STREQ(trace.Write(), "22");
   EXPECT_STREQ(trace.Read(), "22");
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 3);
+  CopyBlockOverlap<2>(I(0), I(0), 3);
   EXPECT_STREQ(trace.Write(), "121");
   EXPECT_STREQ(trace.Read(), "121");
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 4);
+  CopyBlockOverlap<2>(I(0), I(0), 4);
   EXPECT_STREQ(trace.Write(), "1111");
   EXPECT_STREQ(trace.Read(), "1111");
 
   trace.Clear();
-  CopyOverlap<4>(I(2), I(1), 7);
+  CopyBlockOverlap<4>(I(2), I(1), 7);
   EXPECT_STREQ(trace.Write(), "001112111");
   EXPECT_STREQ(trace.Read(), "01112111");
 }
 
-TEST(MemcpyUtilsTest, CopyAligned) {
+TEST(MemcpyUtilsTest, CopyAlignedBlocks) {
   auto &trace = GetTrace();
   // Destination is aligned already.
   //   "1111000000000"
@@ -169,7 +169,7 @@ TEST(MemcpyUtilsTest, CopyAligned) {
   // + "0000000001111"
   // = "1111111112221"
   trace.Clear();
-  CopyAligned<4>(I(0), I(0), 13);
+  CopyAlignedBlocks<4>(I(0), I(0), 13);
   EXPECT_STREQ(trace.Write(), "1111111112221");
   EXPECT_STREQ(trace.Read(), "1111111112221");
 
@@ -180,7 +180,7 @@ TEST(MemcpyUtilsTest, CopyAligned) {
   // + "00000000001111"
   // = "01112111112211"
   trace.Clear();
-  CopyAligned<4>(I(1), I(0), 13);
+  CopyAlignedBlocks<4>(I(1), I(0), 13);
   EXPECT_STREQ(trace.Write(), "01112111112211");
   EXPECT_STREQ(trace.Read(), "1112111112211");
 }
@@ -191,7 +191,7 @@ TEST(MemcpyUtilsTest, MaxReloads) {
     for (size_t count = 64; count < 768; ++count) {
       trace.Clear();
       // We should never reload more than twice when copying from count = 2x32.
-      CopyAligned<32>(I(alignment), I(0), count);
+      CopyAlignedBlocks<32>(I(alignment), I(0), count);
       const char *const written = trace.Write();
       // First bytes are untouched.
       for (size_t i = 0; i < alignment; ++i)


        


More information about the libc-commits mailing list