[libc-commits] [libc] 04a309d - [libc] Adding memcpy implementation for x86_64

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Wed Mar 18 09:48:36 PDT 2020


Author: Guillaume Chatelet
Date: 2020-03-18T17:43:21+01:00
New Revision: 04a309dd0be3aea17ab6e84f8bfc046c1f044be2

URL: https://github.com/llvm/llvm-project/commit/04a309dd0be3aea17ab6e84f8bfc046c1f044be2
DIFF: https://github.com/llvm/llvm-project/commit/04a309dd0be3aea17ab6e84f8bfc046c1f044be2.diff

LOG: [libc] Adding memcpy implementation for x86_64

Summary:
The patch is not ready yet and is here to discuss a few options:
 - How do we customize the implementation? (i.e. how to define `kRepMovsBSize`),
 - How do we specify custom compilation flags? (We'd need `-fno-builtin-memcpy` to be passed in),
 - How do we build? We may want to test in debug but build the libc with `-march=native` for instance,
 - Clang has a brand new builtin `__builtin_memcpy_inline` which makes the implementation easy and efficient, but:
   - If we compile with `gcc` or `msvc` we can't use it and must resort to less efficient code generation,
   - With gcc we can use `__builtin_memcpy` but then we'd need a post-processing step to check that the final assembly does not contain a call to `memcpy` (unlikely but allowed),
   - For msvc we'd need to rely on the compiler's optimization passes.

Reviewers: sivachandra, abrachet

Subscribers: mgorny, MaskRay, tschuett, libc-commits, courbet

Tags: #libc-project

Differential Revision: https://reviews.llvm.org/D74397

Added: 
    libc/cmake/modules/cpu_features/check_cpu_features.cpp.in
    libc/src/string/memcpy.cpp
    libc/src/string/memcpy.h
    libc/src/string/memcpy_arch_specific.h.def
    libc/src/string/memory_utils/memcpy_utils.h
    libc/src/string/x86/CMakeLists.txt
    libc/src/string/x86/memcpy_arch_specific.h.inc
    libc/test/src/string/memcpy_test.cpp
    libc/test/src/string/memory_utils/memcpy_utils_test.cpp

Modified: 
    libc/CMakeLists.txt
    libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
    libc/cmake/modules/LLVMLibCRules.cmake
    libc/lib/CMakeLists.txt
    libc/src/string/CMakeLists.txt
    libc/src/string/memory_utils/CMakeLists.txt
    libc/src/string/memory_utils/utils.h
    libc/test/src/string/CMakeLists.txt
    libc/test/src/string/memory_utils/CMakeLists.txt
    libc/test/src/string/memory_utils/utils_test.cpp

Removed: 
    libc/cmake/modules/cpu_features/check_avx.cpp
    libc/cmake/modules/cpu_features/check_avx512f.cpp
    libc/cmake/modules/cpu_features/check_sse.cpp
    libc/cmake/modules/cpu_features/check_sse2.cpp


################################################################################
diff  --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index fd750a06e0e1..4ee4d7dca573 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -21,6 +21,7 @@ set(LIBC_TARGET_MACHINE ${CMAKE_SYSTEM_PROCESSOR})
 
 include(CMakeParseArguments)
 include(LLVMLibCRules)
+include(LLVMLibCCheckCpuFeatures)
 
 add_subdirectory(src)
 add_subdirectory(config)

diff  --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
index adf81f3e38ab..0bb4af869487 100644
--- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
@@ -1,99 +1,129 @@
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # Cpu features definition and flags
-#
-# Declare a list of all supported cpu features in ALL_CPU_FEATURES.
-#
-# Declares associated flags to enable/disable individual feature of the form:
-# - CPU_FEATURE_<FEATURE>_ENABLE_FLAG
-# - CPU_FEATURE_<FEATURE>_DISABLE_FLAG
-#
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 
 if(${LIBC_TARGET_MACHINE} MATCHES "x86|x86_64")
-  set(ALL_CPU_FEATURES SSE SSE2 AVX AVX512F)
+  set(ALL_CPU_FEATURES SSE SSE2 AVX AVX2 AVX512F)
 endif()
 
-function(_define_cpu_feature_flags feature)
-  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-    string(TOLOWER ${feature} lowercase_feature)
-    set(CPU_FEATURE_${feature}_ENABLE_FLAG "-m${lowercase_feature}" PARENT_SCOPE)
-    set(CPU_FEATURE_${feature}_DISABLE_FLAG "-mno-${lowercase_feature}" PARENT_SCOPE)
+list(SORT ALL_CPU_FEATURES)
+
+# Function to check whether the host supports the provided set of features.
+# Usage:
+# host_supports(
+#   <output variable>
+#   <list of cpu features>
+# )
+function(host_supports output_var features)
+  _intersection(a "${HOST_CPU_FEATURES}" "${features}")
+  if("${a}" STREQUAL "${features}")
+    set(${output_var} TRUE PARENT_SCOPE)
+  else()
+    unset(${output_var} PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Function to compute the flags to pass down to the compiler.
+# Usage:
+# compute_flags(
+#   <output variable>
+#   MARCH <arch name or "native">
+#   REQUIRE <list of mandatory features to enable>
+#   REJECT <list of features to disable>
+# )
+function(compute_flags output_var)
+  cmake_parse_arguments(
+    "COMPUTE_FLAGS"
+    "" # Optional arguments
+    "MARCH" # Single value arguments
+    "REQUIRE;REJECT" # Multi value arguments
+    ${ARGN})
+  # No feature may be both required and rejected; quoting keeps each list one argument.
+  if(COMPUTE_FLAGS_REQUIRE AND COMPUTE_FLAGS_REJECT)
+    _intersection(var "${COMPUTE_FLAGS_REQUIRE}" "${COMPUTE_FLAGS_REJECT}")
+    if(var)
+      message(FATAL_ERROR "Cpu Features REQUIRE and REJECT ${var}")
+    endif()
+  endif()
+  # Generate the compiler flags in `current`.
+  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang|GNU")
+    if(COMPUTE_FLAGS_MARCH)
+      list(APPEND current "-march=${COMPUTE_FLAGS_MARCH}")
+    endif()
+    foreach(feature IN LISTS COMPUTE_FLAGS_REQUIRE)
+      string(TOLOWER ${feature} lowercase_feature)
+      list(APPEND current "-m${lowercase_feature}")
+    endforeach()
+    foreach(feature IN LISTS COMPUTE_FLAGS_REJECT)
+      string(TOLOWER ${feature} lowercase_feature)
+      list(APPEND current "-mno-${lowercase_feature}")
+    endforeach()
   else()
     # In future, we can extend for other compilers.
     message(FATAL_ERROR "Unkown compiler ${CMAKE_CXX_COMPILER_ID}.")
   endif()
+  # Export the list of flags.
+  set(${output_var} "${current}" PARENT_SCOPE)
 endfunction()
 
-# Defines cpu features flags
-foreach(feature IN LISTS ALL_CPU_FEATURES)
-  _define_cpu_feature_flags(${feature})
-endforeach()
-
-#------------------------------------------------------------------------------
-# Optimization level flags
-#
-# Generates the set of flags needed to compile for a up to a particular
-# optimization level.
-#
-# Creates variables of the form `CPU_FEATURE_OPT_<FEATURE>_FLAGS`.
-# CPU_FEATURE_OPT_NONE_FLAGS is a special flag for which no feature is needed.
-#
-# e.g.
-# CPU_FEATURE_OPT_NONE_FLAGS : -mno-sse;-mno-sse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_SSE_FLAGS : -msse;-mno-sse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_SSE2_FLAGS : -msse;-msse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_AVX_FLAGS : -msse;-msse2;-mavx;-mno-avx512f
-# CPU_FEATURE_OPT_AVX512F_FLAGS : -msse;-msse2;-mavx;-mavx512f
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
+# Internal helpers and utilities.
+# ------------------------------------------------------------------------------
 
-# Helper function to concatenate flags needed to support optimization up to
-# a particular feature.
-function(_generate_flags_for_up_to feature flag_variable)
-  list(FIND ALL_CPU_FEATURES ${feature} feature_index)
-  foreach(current_feature IN LISTS ALL_CPU_FEATURES)
-    list(FIND ALL_CPU_FEATURES ${current_feature} current_feature_index)  
-    if(${current_feature_index} GREATER ${feature_index})
-      list(APPEND flags ${CPU_FEATURE_${current_feature}_DISABLE_FLAG})
-    else()
-      list(APPEND flags ${CPU_FEATURE_${current_feature}_ENABLE_FLAG})
+# Computes the intersection between two lists.
+function(_intersection output_var list1 list2)
+  foreach(element IN LISTS list1)
+    if("${list2}" MATCHES "(^|;)${element}(;|$)")
+      list(APPEND tmp "${element}")
     endif()
   endforeach()
-  set(${flag_variable} ${flags} PARENT_SCOPE)
+  set(${output_var} ${tmp} PARENT_SCOPE)
 endfunction()
 
-function(_generate_opt_levels)
-  set(opt_levels NONE)
-  list(APPEND opt_levels ${ALL_CPU_FEATURES})
-  foreach(feature IN LISTS opt_levels)
-    set(flag_name "CPU_FEATURE_OPT_${feature}_FLAGS")
-    _generate_flags_for_up_to(${feature} ${flag_name})
-    set(${flag_name} ${${flag_name}} PARENT_SCOPE)
+# Generates a cpp file to introspect the compiler defined flags.
+function(_generate_check_code)
+  foreach(feature IN LISTS ALL_CPU_FEATURES)
+    set(DEFINITIONS
+        "${DEFINITIONS}
+#ifdef __${feature}__
+    \"${feature}\",
+#endif")
   endforeach()
+  configure_file(
+    "${LIBC_SOURCE_DIR}/cmake/modules/cpu_features/check_cpu_features.cpp.in"
+    "cpu_features/check_cpu_features.cpp" @ONLY)
 endfunction()
+_generate_check_code()
 
-_generate_opt_levels()
-
-#------------------------------------------------------------------------------
-# Host cpu feature introspection
-#
-# Populates a HOST_CPU_FEATURES list containing the available CPU_FEATURE.
-#------------------------------------------------------------------------------
-function(_check_host_cpu_feature feature)
-  string(TOLOWER ${feature} lowercase_feature)
+# Compiles and runs the code generated above with the specified requirements.
+# This is helpful to infer which features a particular target supports or if
+# a specific feature implies other features (e.g. BMI2 implies SSE2 and SSE).
+function(_check_defined_cpu_feature output_var)
+  cmake_parse_arguments(
+    "CHECK_DEFINED"
+    "" # Optional arguments
+    "MARCH" # Single value arguments
+    "REQUIRE;REJECT" # Multi value arguments
+    ${ARGN})
+  compute_flags(
+    flags
+    MARCH  ${CHECK_DEFINED_MARCH}
+    REQUIRE ${CHECK_DEFINED_REQUIRE}
+    REJECT  ${CHECK_DEFINED_REJECT})
   try_run(
-    run_result
-    compile_result
-    "${CMAKE_CURRENT_BINARY_DIR}/check_${lowercase_feature}"
-    "${CMAKE_MODULE_PATH}/cpu_features/check_${lowercase_feature}.cpp"
-    COMPILE_DEFINITIONS ${CPU_FEATURE_${feature}_ENABLE_FLAG}
-    OUTPUT_VARIABLE compile_output
-  )
+    run_result compile_result "${CMAKE_CURRENT_BINARY_DIR}/check_cpu_features"
+    "${CMAKE_CURRENT_BINARY_DIR}/cpu_features/check_cpu_features.cpp"
+    COMPILE_DEFINITIONS ${flags}
+    COMPILE_OUTPUT_VARIABLE compile_output
+    RUN_OUTPUT_VARIABLE run_output)
   if(${compile_result} AND ("${run_result}" EQUAL 0))
-    list(APPEND HOST_CPU_FEATURES ${feature})
-    set(HOST_CPU_FEATURES ${HOST_CPU_FEATURES} PARENT_SCOPE)
+    set(${output_var}
+        "${run_output}"
+        PARENT_SCOPE)
+  else()
+    message(FATAL_ERROR "${compile_output}")
   endif()
 endfunction()
 
-foreach(feature IN LISTS ALL_CPU_FEATURES)
-  _check_host_cpu_feature(${feature})
-endforeach()
+# Populates the HOST_CPU_FEATURES list.
+_check_defined_cpu_feature(HOST_CPU_FEATURES MARCH native)

diff  --git a/libc/cmake/modules/LLVMLibCRules.cmake b/libc/cmake/modules/LLVMLibCRules.cmake
index 18e1d0a081c3..2391ea50b0db 100644
--- a/libc/cmake/modules/LLVMLibCRules.cmake
+++ b/libc/cmake/modules/LLVMLibCRules.cmake
@@ -372,6 +372,7 @@ endfunction(add_redirector_library)
 #      SRCS  <list of .cpp files for the test>
 #      HDRS  <list of .h files for the test>
 #      DEPENDS <list of dependencies>
+#      COMPILE_OPTIONS <list of special compile options for this target>
 #    )
 function(add_libc_unittest target_name)
   if(NOT LLVM_INCLUDE_TESTS)
@@ -382,7 +383,7 @@ function(add_libc_unittest target_name)
     "LIBC_UNITTEST"
     "" # No optional arguments
     "SUITE" # Single value arguments
-    "SRCS;HDRS;DEPENDS" # Multi-value arguments
+    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi-value arguments
     ${ARGN}
   )
   if(NOT LIBC_UNITTEST_SRCS)
@@ -420,6 +421,12 @@ function(add_libc_unittest target_name)
       ${LIBC_BUILD_DIR}
       ${LIBC_BUILD_DIR}/include
   )
+  if(LIBC_UNITTEST_COMPILE_OPTIONS)
+    target_compile_options(
+      ${target_name}
+      PRIVATE ${LIBC_UNITTEST_COMPILE_OPTIONS}
+    )
+  endif()
 
   if(library_deps)
     target_link_libraries(${target_name} PRIVATE ${library_deps})

diff  --git a/libc/cmake/modules/cpu_features/check_avx.cpp b/libc/cmake/modules/cpu_features/check_avx.cpp
deleted file mode 100644
index f0db3abab4e5..000000000000
--- a/libc/cmake/modules/cpu_features/check_avx.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __AVX__
-#error "missing __AVX__"
-#endif
-#include <immintrin.h>
-int main() {
-  (void)_mm256_set1_epi8('0');
-  return 0;
-}

diff  --git a/libc/cmake/modules/cpu_features/check_avx512f.cpp b/libc/cmake/modules/cpu_features/check_avx512f.cpp
deleted file mode 100644
index 93444e737ef4..000000000000
--- a/libc/cmake/modules/cpu_features/check_avx512f.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __AVX512F__
-#error "missing __AVX512F__"
-#endif
-#include <immintrin.h>
-int main() {
-  (void)_mm512_undefined();
-  return 0;
-}

diff  --git a/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in b/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in
new file mode 100644
index 000000000000..25f67a63e192
--- /dev/null
+++ b/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in
@@ -0,0 +1,29 @@
+#include <cstdio>
+#include <cstdlib>
+
+// This file is instantiated by CMake.
+// DEFINITIONS below is replaced with a set of lines like so:
+//   #ifdef __SSE2__
+//     "SSE2",
+//   #endif
+//
+// This allows for introspection of compiler definitions.
+// The output of the program is a single line of semi colon separated feature
+// names.
+
+// MSVC is using a different set of preprocessor definitions for
+// SSE and SSE2, see _M_IX86_FP in
+// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+
+int main(int, char **) {
+  const char *strings[] = {
+      @DEFINITIONS@
+  };
+  const size_t size = sizeof(strings) / sizeof(strings[0]);
+  for (size_t i = 0; i < size; ++i) {
+    if (i)
+      putchar(';');
+    fputs(strings[i], stdout);
+  }
+  return EXIT_SUCCESS;
+}

diff  --git a/libc/cmake/modules/cpu_features/check_sse.cpp b/libc/cmake/modules/cpu_features/check_sse.cpp
deleted file mode 100644
index 1c1f67179fde..000000000000
--- a/libc/cmake/modules/cpu_features/check_sse.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __SSE__
-#error "missing __SSE__"
-#endif
-#include <immintrin.h>
-int main() {
-  (void)_mm_set_ss(1.0f);
-  return 0;
-}

diff  --git a/libc/cmake/modules/cpu_features/check_sse2.cpp b/libc/cmake/modules/cpu_features/check_sse2.cpp
deleted file mode 100644
index f1e598de5877..000000000000
--- a/libc/cmake/modules/cpu_features/check_sse2.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __SSE2__
-#error "missing __SSE2__"
-#endif
-#include <immintrin.h>
-int main() {
-  (void)_mm_set1_epi8('0');
-  return 0;
-}

diff  --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 832d79c1e859..b234c91704a9 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -11,6 +11,7 @@ add_entrypoint_library(
     # string.h entrypoints
     strcpy
     strcat
+    memcpy
 
     # sys/mman.h entrypoints
     mmap

diff  --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 49b0afd2cedf..729ccaaa2b20 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(memory_utils)
+
 add_entrypoint_object(
   strcat
   SRCS
@@ -19,4 +21,63 @@ add_entrypoint_object(
     string_h
 )
 
-add_subdirectory(memory_utils)
+# ------------------------------------------------------------------------------
+# memcpy
+# ------------------------------------------------------------------------------
+
+# include the relevant architecture specific implementations
+if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
+  set(LIBC_MEMCPY_IMPL_FOLDER "x86")
+else()
+  set(LIBC_MEMCPY_IMPL_FOLDER ${LIBC_TARGET_MACHINE})
+endif()
+
+add_gen_header(
+  memcpy_arch_specific
+  DEF_FILE
+    memcpy_arch_specific.h.def
+  GEN_HDR
+    memcpy_arch_specific.h
+  PARAMS
+    memcpy_arch_specific=${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc
+  DATA_FILES
+    ${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc
+)
+
+# Helper to define an implementation of memcpy.
+# - Computes flags to satisfy required/rejected features and arch,
+# - Declares an entry point,
+# - Attach the REQUIRE_CPU_FEATURES property to the target,
+# - Add the target to `memcpy_implementations` global property for tests.
+function(add_memcpy memcpy_name)
+  cmake_parse_arguments(
+    "ADD_MEMCPY"
+    "" # Optional arguments
+    "MARCH" # Single value arguments
+    "REQUIRE;REJECT" # Multi value arguments
+    ${ARGN})
+  compute_flags(flags
+    MARCH ${ADD_MEMCPY_MARCH}
+    REQUIRE ${ADD_MEMCPY_REQUIRE}
+    REJECT ${ADD_MEMCPY_REJECT}
+  )
+  add_entrypoint_object(
+    ${memcpy_name}
+    SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
+    HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
+    DEPENDS
+      string_h
+      memory_utils
+      memcpy_arch_specific
+    COMPILE_OPTIONS
+      -fno-builtin-memcpy
+      ${flags}
+  )
+  set_target_properties(${memcpy_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_MEMCPY_REQUIRE}")
+  get_property(all GLOBAL PROPERTY memcpy_implementations)
+  list(APPEND all ${memcpy_name})
+  set_property(GLOBAL PROPERTY memcpy_implementations "${all}")
+endfunction()
+
+add_subdirectory(${LIBC_MEMCPY_IMPL_FOLDER})
+add_memcpy(memcpy MARCH native)

diff  --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
new file mode 100644
index 000000000000..216e22938e7f
--- /dev/null
+++ b/libc/src/string/memcpy.cpp
@@ -0,0 +1,22 @@
+//===--------------------- Implementation of memcpy -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memcpy_arch_specific.h"
+
+namespace __llvm_libc {
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_no_return(reinterpret_cast<char *>(dst),
+                   reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/src/string/memcpy.h b/libc/src/string/memcpy.h
new file mode 100644
index 000000000000..a3ae4d40c874
--- /dev/null
+++ b/libc/src/string/memcpy.h
@@ -0,0 +1,21 @@
+//===----------------- Implementation header for memcpy -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H
+#define LLVM_LIBC_SRC_STRING_MEMCPY_H
+
+#include "include/string.h"
+#include <stddef.h> // size_t
+
+namespace __llvm_libc {
+
+void *memcpy(void *__restrict, const void *__restrict, size_t);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMCPY_H

diff  --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
new file mode 100644
index 000000000000..a9bb35223ef8
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific.h.def
@@ -0,0 +1,65 @@
+//===-------------- Implementation of arch specific memcpy ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+
+%%include_file(${memcpy_arch_specific})
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<1>(dst, src);
+  if (count == 2)
+    return Copy<2>(dst, src);
+  if (count == 3)
+    return Copy<3>(dst, src);
+  if (count == 4)
+    return Copy<4>(dst, src);
+  if (count < 8)
+    return CopyOverlap<4>(dst, src, count);
+  if (count == 8)
+    return Copy<8>(dst, src);
+  if (count < 16)
+    return CopyOverlap<8>(dst, src, count);
+  if (count == 16)
+    return Copy<16>(dst, src);
+  if (count < 32)
+    return CopyOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyOverlap<64>(dst, src, count);
+  CopyGE128(dst, src, count);
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H

diff  --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 259ed0a75828..b826f1f68707 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -12,6 +12,9 @@ add_gen_header(
 
 add_header_library(
   memory_utils
-  HDRS utils.h
-  DEPENDS cacheline_size
+  HDRS
+    utils.h
+    memcpy_utils.h
+  DEPENDS
+    cacheline_size
 )

diff  --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
new file mode 100644
index 000000000000..c69e557574c0
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -0,0 +1,100 @@
+//===---------------------------- Memcpy utils ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+#define LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+
+#include "src/string/memory_utils/utils.h"
+#include <stddef.h> // size_t
+
+// __builtin_memcpy_inline guarantees to never call external functions.
+// Unfortunately it is not widely available.
+#if defined(__clang__) && __has_builtin(__builtin_memcpy_inline)
+#define USE_BUILTIN_MEMCPY_INLINE
+#elif defined(__GNUC__)
+#define USE_BUILTIN_MEMCPY
+#endif
+
+// This is useful for testing.
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
+                                         const char *__restrict, size_t);
+#endif
+
+namespace __llvm_libc {
+
+// Copies `kBlockSize` bytes from `src` to `dst`.
+template <size_t kBlockSize>
+static void Copy(char *__restrict dst, const char *__restrict src) {
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+  LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY_INLINE)
+  __builtin_memcpy_inline(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY)
+  __builtin_memcpy(dst, src, kBlockSize);
+#else
+  for (size_t i = 0; i < kBlockSize; ++i)
+    dst[i] = src[i];
+#endif
+}
+
+// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
+// `dst + count - kBlockSize`.
+// Precondition: `count >= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
+                          size_t count) {
+  const size_t offset = count - kBlockSize;
+  Copy<kBlockSize>(dst + offset, src + offset);
+}
+
+// Copies `kBlockSize` bytes twice with an overlap between the two.
+//
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `count >= kBlockSize && count <= 2 * kBlockSize`.
+template <size_t kBlockSize>
+static void CopyOverlap(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src);
+  CopyLastBlock<kBlockSize>(dst, src, count);
+}
+
+// Copies `count` bytes by blocks of `kBlockSize` bytes.
+// Copies at the start and end of the buffer are unaligned.
+// Copies in the middle of the buffer are aligned to `kBlockSize`.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX______________________]
+// [________XXXXXXXX________________]
+// [________________XXXXXXXX________]
+// [_____________________XXXXXXXX___]
+//
+// Precondition: `count > 2 * kBlockSize` for efficiency.
+//               `count >= kBlockSize` for correctness.
+template <size_t kBlockSize>
+static void CopyAligned(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
+  for (; offset + kBlockSize < count; offset += kBlockSize)
+    Copy<kBlockSize>(dst + offset, src + offset);
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
+} // namespace __llvm_libc
+
+#endif //  LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H

diff  --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 33df113213b5..af9b6aeeee51 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -43,6 +43,11 @@ static constexpr size_t ge_power2(size_t value) {
   return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
 }
 
+template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
+  static_assert(is_power2(alignment), "alignment must be a power of 2");
+  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
+}
+
 template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
   static_assert(is_power2(alignment), "alignment must be a power of 2");
   // The logic is not straightforward and involves unsigned modulo arithmetic
@@ -51,7 +56,7 @@ template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
 }
 
 // Returns the offset from `ptr` to the next cache line.
-static intptr_t offset_to_next_cache_line(const void *ptr) {
+static inline intptr_t offset_to_next_cache_line(const void *ptr) {
   return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
 }
 

diff  --git a/libc/src/string/x86/CMakeLists.txt b/libc/src/string/x86/CMakeLists.txt
new file mode 100644
index 000000000000..b5365733fb80
--- /dev/null
+++ b/libc/src/string/x86/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_none" REJECT "${ALL_CPU_FEATURES}")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_sse" REQUIRE "SSE" REJECT "SSE2")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx" REQUIRE "AVX" REJECT "AVX2")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx512f" REQUIRE "AVX512F")

diff  --git a/libc/src/string/x86/memcpy_arch_specific.h.inc b/libc/src/string/x86/memcpy_arch_specific.h.inc
new file mode 100644
index 000000000000..ace98ba2e811
--- /dev/null
+++ b/libc/src/string/x86/memcpy_arch_specific.h.inc
@@ -0,0 +1,35 @@
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC support with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+#if defined(__AVX__)
+#define BEST_SIZE 64
+#else
+#define BEST_SIZE 32
+#endif
+
+static void CopyGE128(char *__restrict dst, const char *__restrict src,
+                      size_t count) {
+#if defined(__AVX__)
+  if (count < 256)
+    return CopyOverlap<128>(dst, src, count);
+#endif
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize ==  0 : Only RepMovsb is used.
+  // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAligned<BEST_SIZE>(dst, src, count);
+  CopyRepMovsb(dst, src, count);
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
index 258937c7f4f6..43536e96d552 100644
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -22,3 +22,24 @@ add_libc_unittest(
   DEPENDS
     strcpy
 )
+
+# Tests all implementations of memcpy that can run on the host.
+get_property(memcpy_implementations GLOBAL PROPERTY memcpy_implementations)
+foreach(memcpy_config_name IN LISTS memcpy_implementations)
+  get_target_property(require_cpu_features ${memcpy_config_name} REQUIRE_CPU_FEATURES)
+  host_supports(can_run "${require_cpu_features}")
+  if(can_run)
+    add_libc_unittest(
+      ${memcpy_config_name}_test
+      SUITE
+        libc_string_unittests
+      SRCS
+        memcpy_test.cpp
+      DEPENDS
+        ${memcpy_config_name}
+    )
+  else()
+    message(STATUS "Skipping test for '${memcpy_config_name}' insufficient host cpu features")
+  endif()
+endforeach()
+

diff  --git a/libc/test/src/string/memcpy_test.cpp b/libc/test/src/string/memcpy_test.cpp
new file mode 100644
index 000000000000..c83cdb60fc96
--- /dev/null
+++ b/libc/test/src/string/memcpy_test.cpp
@@ -0,0 +1,53 @@
+//===----------------------- Unittests for memcpy -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "utils/CPP/ArrayRef.h"
+#include "utils/UnitTest/Test.h"
+#include "src/string/memcpy.h"
+
+using __llvm_libc::cpp::Array;
+using __llvm_libc::cpp::ArrayRef;
+using __llvm_libc::cpp::MutableArrayRef;
+using Data = Array<char, 2048>;
+
+static const ArrayRef<char> kNumbers("0123456789", 10);
+static const ArrayRef<char> kDeadcode("DEADC0DE", 8);
+
+// Returns a Data object filled with a repetition of `filler`.
+Data getData(ArrayRef<char> filler) {
+  Data out;
+  for (size_t i = 0; i < out.size(); ++i)
+    out[i] = filler[i % filler.size()];
+  return out;
+}
+
+TEST(MemcpyTest, Thorough) {
+  const Data groundtruth = getData(kNumbers);
+  const Data dirty = getData(kDeadcode);
+  for (size_t count = 0; count < 1024; ++count) {
+    for (size_t align = 0; align < 64; ++align) {
+      auto buffer = dirty;
+      const char *const src = groundtruth.data();
+      char *const dst = &buffer[align];
+      __llvm_libc::memcpy(dst, src, count);
+      // Everything before copy is untouched.
+      for (size_t i = 0; i < align; ++i)
+        ASSERT_EQ(buffer[i], dirty[i]);
+      // Everything in between is copied.
+      for (size_t i = 0; i < count; ++i)
+        ASSERT_EQ(buffer[align + i], groundtruth[i]);
+      // Everything after copy is untouched.
+      for (size_t i = align + count; i < dirty.size(); ++i)
+        ASSERT_EQ(buffer[i], dirty[i]);
+    }
+  }
+}
+
+// FIXME: Add tests with reads and writes on the boundary of a read/write
+// protected page to check we're not reading nor writing prior/past the allowed
+// regions.

diff  --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index e3ec8eb40cf3..c02fa865fcaa 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -4,7 +4,14 @@ add_libc_unittest(
     libc_string_unittests
   SRCS
     utils_test.cpp
+    memcpy_utils_test.cpp
   DEPENDS
     memory_utils
     standalone_cpp
 )
+
+target_compile_definitions(
+  utils_test
+  PRIVATE
+  LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor
+)

diff  --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
new file mode 100644
index 000000000000..813e86fe65db
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -0,0 +1,208 @@
+//===-------------------- Unittests for memcpy_utils ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memory_utils/memcpy_utils.h"
+#include "utils/CPP/Array.h"
+#include "utils/UnitTest/Test.h"
+
+#include <assert.h>
+#include <stdint.h> // uintptr_t
+
+#ifndef LLVM_LIBC_MEMCPY_MONITOR
+#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test.
+#endif
+
+namespace __llvm_libc {
+
+struct Buffer {
+  static constexpr size_t kMaxBuffer = 1024;
+  char buffer[kMaxBuffer + 1];
+  size_t last = 0;
+
+  void Clear() {
+    for (char *it = buffer; it != buffer + kMaxBuffer; ++it)
+      *it = '0';
+    buffer[kMaxBuffer] = '\0';
+    last = 0;
+  }
+
+  // Bumps the counter stored at the index that `ptr` encodes.
+  void Increment(const void *ptr) {
+    const uintptr_t index = reinterpret_cast<uintptr_t>(ptr);
+    assert(index < kMaxBuffer);
+    buffer[index] += 1;
+    last = index > last ? index : last;
+  }
+
+  char *Finish() {
+    assert(last < kMaxBuffer);
+    buffer[last + 1] = '\0';
+    return buffer;
+  }
+};
+
+struct Trace {
+  Buffer read;
+  Buffer write;
+
+  void Add(char *__restrict dst, const char *__restrict src, size_t count) {
+    for (size_t n = 0; n != count; ++n) {
+      read.Increment(src + n);
+      write.Increment(dst + n);
+    }
+  }
+
+  void Clear() {
+    write.Clear();
+    read.Clear();
+  }
+
+  char *Read() { return read.Finish(); }
+  char *Write() { return write.Finish(); }
+};
+
+static Trace &GetTrace() {
+  static thread_local Trace instance;
+  return instance;
+}
+
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst,
+                                         const char *__restrict src,
+                                         size_t count) {
+  GetTrace().Add(dst, src, count);
+}
+
+char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); }
+
+TEST(MemcpyUtilsTest, CopyTrivial) {
+  Trace &events = GetTrace();
+
+  events.Clear();
+  Copy<1>(I(0), I(0));
+  EXPECT_STREQ(events.Read(), "1");
+  EXPECT_STREQ(events.Write(), "1");
+
+  events.Clear();
+  Copy<2>(I(0), I(0));
+  EXPECT_STREQ(events.Read(), "11");
+  EXPECT_STREQ(events.Write(), "11");
+
+  events.Clear();
+  Copy<4>(I(0), I(0));
+  EXPECT_STREQ(events.Read(), "1111");
+  EXPECT_STREQ(events.Write(), "1111");
+
+  events.Clear();
+  Copy<8>(I(0), I(0));
+  EXPECT_STREQ(events.Read(), "11111111");
+  EXPECT_STREQ(events.Write(), "11111111");
+
+  events.Clear();
+  Copy<16>(I(0), I(0));
+  EXPECT_STREQ(events.Read(), "1111111111111111");
+  EXPECT_STREQ(events.Write(), "1111111111111111");
+
+  events.Clear();
+  Copy<32>(I(0), I(0));
+  EXPECT_STREQ(events.Read(), "11111111111111111111111111111111");
+  EXPECT_STREQ(events.Write(), "11111111111111111111111111111111");
+
+  events.Clear();
+  Copy<64>(I(0), I(0));
+  EXPECT_STREQ(
+      events.Read(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+  EXPECT_STREQ(
+      events.Write(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+}
+
+TEST(MemcpyUtilsTest, CopyOffset) {
+  Trace &events = GetTrace();
+
+  events.Clear();
+  Copy<1>(I(3), I(1));
+  EXPECT_STREQ(events.Read(), "01");
+  EXPECT_STREQ(events.Write(), "0001");
+
+  events.Clear();
+  Copy<1>(I(2), I(1));
+  EXPECT_STREQ(events.Read(), "01");
+  EXPECT_STREQ(events.Write(), "001");
+}
+
+TEST(MemcpyUtilsTest, CopyOverlap) {
+  Trace &events = GetTrace();
+
+  events.Clear();
+  CopyOverlap<2>(I(0), I(0), 2);
+  EXPECT_STREQ(events.Read(), "22");
+  EXPECT_STREQ(events.Write(), "22");
+
+  events.Clear();
+  CopyOverlap<2>(I(0), I(0), 3);
+  EXPECT_STREQ(events.Read(), "121");
+  EXPECT_STREQ(events.Write(), "121");
+
+  events.Clear();
+  CopyOverlap<2>(I(0), I(0), 4);
+  EXPECT_STREQ(events.Read(), "1111");
+  EXPECT_STREQ(events.Write(), "1111");
+
+  events.Clear();
+  CopyOverlap<4>(I(2), I(1), 7);
+  EXPECT_STREQ(events.Read(), "01112111");
+  EXPECT_STREQ(events.Write(), "001112111");
+}
+
+TEST(MemcpyUtilsTest, CopyAligned) {
+  Trace &events = GetTrace();
+  // Destination starts on a 4-byte boundary; the block writes add up as:
+  //   "1111000000000"
+  // + "0000111100000"
+  // + "0000000011110"
+  // + "0000000001111"
+  // = "1111111112221"
+  events.Clear();
+  CopyAligned<4>(I(0), I(0), 13);
+  EXPECT_STREQ(events.Read(), "1111111112221");
+  EXPECT_STREQ(events.Write(), "1111111112221");
+
+  // Destination starts one byte past a boundary; the block writes add up as:
+  //   "01111000000000"
+  // + "00001111000000"
+  // + "00000000111100"
+  // + "00000000001111"
+  // = "01112111112211"
+  events.Clear();
+  CopyAligned<4>(I(1), I(0), 13);
+  EXPECT_STREQ(events.Read(), "1112111112211");
+  EXPECT_STREQ(events.Write(), "01112111112211");
+}
+
+TEST(MemcpyUtilsTest, MaxReloads) {
+  auto &trace = GetTrace();
+  for (size_t alignment = 0; alignment < 32; ++alignment) {
+    for (size_t count = 64; count < 768; ++count) {
+      trace.Clear();
+      // We should never reload more than twice when copying from count = 2x32.
+      CopyAligned<32>(I(alignment), I(0), count);
+      const char *const written = trace.Write();
+      // Bytes below the destination offset are never written.
+      for (size_t i = 0; i < alignment; ++i)
+        EXPECT_EQ(written[i], '0');
+      // Every byte of [alignment, alignment + count) is written once or twice.
+      for (size_t i = alignment; i < alignment + count; ++i) {
+        EXPECT_GE(written[i], '1');
+        EXPECT_LE(written[i], '2');
+      }
+    }
+  }
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
index 5a14cb1df8e4..c1564334c640 100644
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -87,6 +87,14 @@ TEST(UtilsTest, OffsetToNextAligned) {
   EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
 }
 
+TEST(UtilsTest, OffsetFromLastAligned) {
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
+  EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
+}
+
 TEST(UtilsTest, OffsetToNextCacheLine) {
   EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
   EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));


        


More information about the libc-commits mailing list