[libc-commits] [libc] 04a309d - [libc] Adding memcpy implementation for x86_64
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Wed Mar 18 09:48:36 PDT 2020
Author: Guillaume Chatelet
Date: 2020-03-18T17:43:21+01:00
New Revision: 04a309dd0be3aea17ab6e84f8bfc046c1f044be2
URL: https://github.com/llvm/llvm-project/commit/04a309dd0be3aea17ab6e84f8bfc046c1f044be2
DIFF: https://github.com/llvm/llvm-project/commit/04a309dd0be3aea17ab6e84f8bfc046c1f044be2.diff
LOG: [libc] Adding memcpy implementation for x86_64
Summary:
The patch is not ready yet and is here to discuss a few options:
- How do we customize the implementation? (i.e. how to define `kRepMovsBSize`),
- How do we specify custom compilation flags? (We'd need `-fno-builtin-memcpy` to be passed in),
- How do we build? We may want to test in debug but build the libc with `-march=native` for instance,
- Clang has a brand new builtin `__builtin_memcpy_inline` which makes the implementation easy and efficient, but:
  - If we compile with `gcc` or `msvc` we can't use it and must resort to less efficient code generation,
  - With gcc we can use `__builtin_memcpy`, but then we'd need a postprocessing step to check that the final assembly does not contain a call to `memcpy` (unlikely but allowed),
  - For msvc we'd need to rely on the compiler optimization passes.
Reviewers: sivachandra, abrachet
Subscribers: mgorny, MaskRay, tschuett, libc-commits, courbet
Tags: #libc-project
Differential Revision: https://reviews.llvm.org/D74397
Added:
libc/cmake/modules/cpu_features/check_cpu_features.cpp.in
libc/src/string/memcpy.cpp
libc/src/string/memcpy.h
libc/src/string/memcpy_arch_specific.h.def
libc/src/string/memory_utils/memcpy_utils.h
libc/src/string/x86/CMakeLists.txt
libc/src/string/x86/memcpy_arch_specific.h.inc
libc/test/src/string/memcpy_test.cpp
libc/test/src/string/memory_utils/memcpy_utils_test.cpp
Modified:
libc/CMakeLists.txt
libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
libc/cmake/modules/LLVMLibCRules.cmake
libc/lib/CMakeLists.txt
libc/src/string/CMakeLists.txt
libc/src/string/memory_utils/CMakeLists.txt
libc/src/string/memory_utils/utils.h
libc/test/src/string/CMakeLists.txt
libc/test/src/string/memory_utils/CMakeLists.txt
libc/test/src/string/memory_utils/utils_test.cpp
Removed:
libc/cmake/modules/cpu_features/check_avx.cpp
libc/cmake/modules/cpu_features/check_avx512f.cpp
libc/cmake/modules/cpu_features/check_sse.cpp
libc/cmake/modules/cpu_features/check_sse2.cpp
################################################################################
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index fd750a06e0e1..4ee4d7dca573 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -21,6 +21,7 @@ set(LIBC_TARGET_MACHINE ${CMAKE_SYSTEM_PROCESSOR})
include(CMakeParseArguments)
include(LLVMLibCRules)
+include(LLVMLibCCheckCpuFeatures)
add_subdirectory(src)
add_subdirectory(config)
diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
index adf81f3e38ab..0bb4af869487 100644
--- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
@@ -1,99 +1,129 @@
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
# Cpu features definition and flags
-#
-# Declare a list of all supported cpu features in ALL_CPU_FEATURES.
-#
-# Declares associated flags to enable/disable individual feature of the form:
-# - CPU_FEATURE_<FEATURE>_ENABLE_FLAG
-# - CPU_FEATURE_<FEATURE>_DISABLE_FLAG
-#
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
if(${LIBC_TARGET_MACHINE} MATCHES "x86|x86_64")
- set(ALL_CPU_FEATURES SSE SSE2 AVX AVX512F)
+ set(ALL_CPU_FEATURES SSE SSE2 AVX AVX2 AVX512F)
endif()
-function(_define_cpu_feature_flags feature)
- if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
- string(TOLOWER ${feature} lowercase_feature)
- set(CPU_FEATURE_${feature}_ENABLE_FLAG "-m${lowercase_feature}" PARENT_SCOPE)
- set(CPU_FEATURE_${feature}_DISABLE_FLAG "-mno-${lowercase_feature}" PARENT_SCOPE)
+list(SORT ALL_CPU_FEATURES)
+
+# Function to check whether the host supports the provided set of features.
+# Usage:
+# host_supports(
+# <output variable>
+# <list of cpu features, in any order>
+# )
+function(host_supports output_var features)
+ _intersection(a "${features}" "${HOST_CPU_FEATURES}")
+ if("${a}" STREQUAL "${features}")
+ set(${output_var} TRUE PARENT_SCOPE)
+ else()
+ unset(${output_var} PARENT_SCOPE)
+ endif()
+endfunction()
+
+# Function to compute the flags to pass down to the compiler.
+# Usage:
+# compute_flags(
+# <output variable>
+# MARCH <arch name or "native">
+# REQUIRE <list of mandatory features to enable>
+# REJECT <list of features to disable>
+# )
+function(compute_flags output_var)
+ cmake_parse_arguments(
+ "COMPUTE_FLAGS"
+ "" # Optional arguments
+ "MARCH" # Single value arguments
+ "REQUIRE;REJECT" # Multi value arguments
+ ${ARGN})
+ # Reject overlap between REQUIRE and REJECT (lists quoted to stay one arg each).
+ if(COMPUTE_FLAGS_REQUIRE AND COMPUTE_FLAGS_REJECT)
+ _intersection(var "${COMPUTE_FLAGS_REQUIRE}" "${COMPUTE_FLAGS_REJECT}")
+ if(var)
+ message(FATAL_ERROR "Cpu Features REQUIRE and REJECT ${var}")
+ endif()
+ endif()
+ # Generate the compiler flags in `current`.
+ if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang|GNU")
+ if(COMPUTE_FLAGS_MARCH)
+ list(APPEND current "-march=${COMPUTE_FLAGS_MARCH}")
+ endif()
+ foreach(feature IN LISTS COMPUTE_FLAGS_REQUIRE)
+ string(TOLOWER ${feature} lowercase_feature)
+ list(APPEND current "-m${lowercase_feature}")
+ endforeach()
+ foreach(feature IN LISTS COMPUTE_FLAGS_REJECT)
+ string(TOLOWER ${feature} lowercase_feature)
+ list(APPEND current "-mno-${lowercase_feature}")
+ endforeach()
else()
# In future, we can extend for other compilers.
message(FATAL_ERROR "Unkown compiler ${CMAKE_CXX_COMPILER_ID}.")
endif()
+ # Export the list of flags.
+ set(${output_var} "${current}" PARENT_SCOPE)
endfunction()
-# Defines cpu features flags
-foreach(feature IN LISTS ALL_CPU_FEATURES)
- _define_cpu_feature_flags(${feature})
-endforeach()
-
-#------------------------------------------------------------------------------
-# Optimization level flags
-#
-# Generates the set of flags needed to compile for a up to a particular
-# optimization level.
-#
-# Creates variables of the form `CPU_FEATURE_OPT_<FEATURE>_FLAGS`.
-# CPU_FEATURE_OPT_NONE_FLAGS is a special flag for which no feature is needed.
-#
-# e.g.
-# CPU_FEATURE_OPT_NONE_FLAGS : -mno-sse;-mno-sse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_SSE_FLAGS : -msse;-mno-sse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_SSE2_FLAGS : -msse;-msse2;-mno-avx;-mno-avx512f
-# CPU_FEATURE_OPT_AVX_FLAGS : -msse;-msse2;-mavx;-mno-avx512f
-# CPU_FEATURE_OPT_AVX512F_FLAGS : -msse;-msse2;-mavx;-mavx512f
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
+# Internal helpers and utilities.
+# ------------------------------------------------------------------------------
-# Helper function to concatenate flags needed to support optimization up to
-# a particular feature.
-function(_generate_flags_for_up_to feature flag_variable)
- list(FIND ALL_CPU_FEATURES ${feature} feature_index)
- foreach(current_feature IN LISTS ALL_CPU_FEATURES)
- list(FIND ALL_CPU_FEATURES ${current_feature} current_feature_index)
- if(${current_feature_index} GREATER ${feature_index})
- list(APPEND flags ${CPU_FEATURE_${current_feature}_DISABLE_FLAG})
- else()
- list(APPEND flags ${CPU_FEATURE_${current_feature}_ENABLE_FLAG})
+# Computes the intersection between two lists.
+function(_intersection output_var list1 list2)
+ foreach(element IN LISTS list1)
+ if("${list2}" MATCHES "(^|;)${element}(;|$)")
+ list(APPEND tmp "${element}")
endif()
endforeach()
- set(${flag_variable} ${flags} PARENT_SCOPE)
+ set(${output_var} ${tmp} PARENT_SCOPE)
endfunction()
-function(_generate_opt_levels)
- set(opt_levels NONE)
- list(APPEND opt_levels ${ALL_CPU_FEATURES})
- foreach(feature IN LISTS opt_levels)
- set(flag_name "CPU_FEATURE_OPT_${feature}_FLAGS")
- _generate_flags_for_up_to(${feature} ${flag_name})
- set(${flag_name} ${${flag_name}} PARENT_SCOPE)
+# Generates a cpp file to introspect the compiler defined flags.
+function(_generate_check_code)
+ foreach(feature IN LISTS ALL_CPU_FEATURES)
+ set(DEFINITIONS
+ "${DEFINITIONS}
+#ifdef __${feature}__
+ \"${feature}\",
+#endif")
endforeach()
+ configure_file(
+ "${LIBC_SOURCE_DIR}/cmake/modules/cpu_features/check_cpu_features.cpp.in"
+ "cpu_features/check_cpu_features.cpp" @ONLY)
endfunction()
+_generate_check_code()
-_generate_opt_levels()
-
-#------------------------------------------------------------------------------
-# Host cpu feature introspection
-#
-# Populates a HOST_CPU_FEATURES list containing the available CPU_FEATURE.
-#------------------------------------------------------------------------------
-function(_check_host_cpu_feature feature)
- string(TOLOWER ${feature} lowercase_feature)
+# Compiles and runs the code generated above with the specified requirements.
+# This is helpful to infer which features a particular target supports or if
+# a specific feature implies other features (e.g. BMI2 implies SSE2 and SSE).
+function(_check_defined_cpu_feature output_var)
+ cmake_parse_arguments(
+ "CHECK_DEFINED"
+ "" # Optional arguments
+ "MARCH" # Single value arguments
+ "REQUIRE;REJECT" # Multi value arguments
+ ${ARGN})
+ compute_flags(
+ flags
+ MARCH ${CHECK_DEFINED_MARCH}
+ REQUIRE ${CHECK_DEFINED_REQUIRE}
+ REJECT ${CHECK_DEFINED_REJECT})
try_run(
- run_result
- compile_result
- "${CMAKE_CURRENT_BINARY_DIR}/check_${lowercase_feature}"
- "${CMAKE_MODULE_PATH}/cpu_features/check_${lowercase_feature}.cpp"
- COMPILE_DEFINITIONS ${CPU_FEATURE_${feature}_ENABLE_FLAG}
- OUTPUT_VARIABLE compile_output
- )
+ run_result compile_result "${CMAKE_CURRENT_BINARY_DIR}/check_cpu_features"
+ "${CMAKE_CURRENT_BINARY_DIR}/cpu_features/check_cpu_features.cpp"
+ COMPILE_DEFINITIONS ${flags}
+ COMPILE_OUTPUT_VARIABLE compile_output
+ RUN_OUTPUT_VARIABLE run_output)
if(${compile_result} AND ("${run_result}" EQUAL 0))
- list(APPEND HOST_CPU_FEATURES ${feature})
- set(HOST_CPU_FEATURES ${HOST_CPU_FEATURES} PARENT_SCOPE)
+ set(${output_var}
+ "${run_output}"
+ PARENT_SCOPE)
+ else()
+ message(FATAL_ERROR "${compile_output}")
endif()
endfunction()
-foreach(feature IN LISTS ALL_CPU_FEATURES)
- _check_host_cpu_feature(${feature})
-endforeach()
+# Populates the HOST_CPU_FEATURES list.
+_check_defined_cpu_feature(HOST_CPU_FEATURES MARCH native)
diff --git a/libc/cmake/modules/LLVMLibCRules.cmake b/libc/cmake/modules/LLVMLibCRules.cmake
index 18e1d0a081c3..2391ea50b0db 100644
--- a/libc/cmake/modules/LLVMLibCRules.cmake
+++ b/libc/cmake/modules/LLVMLibCRules.cmake
@@ -372,6 +372,7 @@ endfunction(add_redirector_library)
# SRCS <list of .cpp files for the test>
# HDRS <list of .h files for the test>
# DEPENDS <list of dependencies>
+# COMPILE_OPTIONS <list of special compile options for this target>
# )
function(add_libc_unittest target_name)
if(NOT LLVM_INCLUDE_TESTS)
@@ -382,7 +383,7 @@ function(add_libc_unittest target_name)
"LIBC_UNITTEST"
"" # No optional arguments
"SUITE" # Single value arguments
- "SRCS;HDRS;DEPENDS" # Multi-value arguments
+ "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi-value arguments
${ARGN}
)
if(NOT LIBC_UNITTEST_SRCS)
@@ -420,6 +421,12 @@ function(add_libc_unittest target_name)
${LIBC_BUILD_DIR}
${LIBC_BUILD_DIR}/include
)
+ if(LIBC_UNITTEST_COMPILE_OPTIONS)
+ target_compile_options(
+ ${target_name}
+ PRIVATE ${LIBC_UNITTEST_COMPILE_OPTIONS}
+ )
+ endif()
if(library_deps)
target_link_libraries(${target_name} PRIVATE ${library_deps})
diff --git a/libc/cmake/modules/cpu_features/check_avx.cpp b/libc/cmake/modules/cpu_features/check_avx.cpp
deleted file mode 100644
index f0db3abab4e5..000000000000
--- a/libc/cmake/modules/cpu_features/check_avx.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __AVX__
-#error "missing __AVX__"
-#endif
-#include <immintrin.h>
-int main() {
- (void)_mm256_set1_epi8('0');
- return 0;
-}
diff --git a/libc/cmake/modules/cpu_features/check_avx512f.cpp b/libc/cmake/modules/cpu_features/check_avx512f.cpp
deleted file mode 100644
index 93444e737ef4..000000000000
--- a/libc/cmake/modules/cpu_features/check_avx512f.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __AVX512F__
-#error "missing __AVX512F__"
-#endif
-#include <immintrin.h>
-int main() {
- (void)_mm512_undefined();
- return 0;
-}
diff --git a/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in b/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in
new file mode 100644
index 000000000000..25f67a63e192
--- /dev/null
+++ b/libc/cmake/modules/cpu_features/check_cpu_features.cpp.in
@@ -0,0 +1,29 @@
+#include <cstdio>
+#include <cstdlib>
+
+// This file is instantiated by CMake.
+// DEFINITIONS below is replaced with a set of lines like so:
+// #ifdef __SSE2__
+// "SSE2",
+// #endif
+//
+// This allows for introspection of compiler definitions.
+// The output of the program is a single line of semi colon separated feature
+// names.
+
+// MSVC is using a different set of preprocessor definitions for
+// SSE and SSE2, see _M_IX86_FP in
+// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+
+int main(int, char **) {
+ const char *strings[] = {
+ @DEFINITIONS@
+ };
+ const size_t size = sizeof(strings) / sizeof(strings[0]);
+ for (size_t i = 0; i < size; ++i) {
+ if (i)
+ putchar(';');
+ fputs(strings[i], stdout);
+ }
+ return EXIT_SUCCESS;
+}
diff --git a/libc/cmake/modules/cpu_features/check_sse.cpp b/libc/cmake/modules/cpu_features/check_sse.cpp
deleted file mode 100644
index 1c1f67179fde..000000000000
--- a/libc/cmake/modules/cpu_features/check_sse.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __SSE__
-#error "missing __SSE__"
-#endif
-#include <immintrin.h>
-int main() {
- (void)_mm_set_ss(1.0f);
- return 0;
-}
diff --git a/libc/cmake/modules/cpu_features/check_sse2.cpp b/libc/cmake/modules/cpu_features/check_sse2.cpp
deleted file mode 100644
index f1e598de5877..000000000000
--- a/libc/cmake/modules/cpu_features/check_sse2.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#if !defined __SSE2__
-#error "missing __SSE2__"
-#endif
-#include <immintrin.h>
-int main() {
- (void)_mm_set1_epi8('0');
- return 0;
-}
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 832d79c1e859..b234c91704a9 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -11,6 +11,7 @@ add_entrypoint_library(
# string.h entrypoints
strcpy
strcat
+ memcpy
# sys/mman.h entrypoints
mmap
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 49b0afd2cedf..729ccaaa2b20 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(memory_utils)
+
add_entrypoint_object(
strcat
SRCS
@@ -19,4 +21,63 @@ add_entrypoint_object(
string_h
)
-add_subdirectory(memory_utils)
+# ------------------------------------------------------------------------------
+# memcpy
+# ------------------------------------------------------------------------------
+
+# include the relevant architecture specific implementations
+if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
+ set(LIBC_MEMCPY_IMPL_FOLDER "x86")
+else()
+ set(LIBC_MEMCPY_IMPL_FOLDER ${LIBC_TARGET_MACHINE})
+endif()
+
+add_gen_header(
+ memcpy_arch_specific
+ DEF_FILE
+ memcpy_arch_specific.h.def
+ GEN_HDR
+ memcpy_arch_specific.h
+ PARAMS
+ memcpy_arch_specific=${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc
+ DATA_FILES
+ ${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc
+)
+
+# Helper to define an implementation of memcpy.
+# - Computes flags to satisfy required/rejected features and arch,
+# - Declares an entry point,
+# - Attach the REQUIRE_CPU_FEATURES property to the target,
+# - Add the target to `memcpy_implementations` global property for tests.
+function(add_memcpy memcpy_name)
+ cmake_parse_arguments(
+ "ADD_MEMCPY"
+ "" # Optional arguments
+ "MARCH" # Single value arguments
+ "REQUIRE;REJECT" # Multi value arguments
+ ${ARGN})
+ compute_flags(flags
+ MARCH ${ADD_MEMCPY_MARCH}
+ REQUIRE ${ADD_MEMCPY_REQUIRE}
+ REJECT ${ADD_MEMCPY_REJECT}
+ )
+ add_entrypoint_object(
+ ${memcpy_name}
+ SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
+ HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
+ DEPENDS
+ string_h
+ memory_utils
+ memcpy_arch_specific
+ COMPILE_OPTIONS
+ -fno-builtin-memcpy
+ ${flags}
+ )
+ set_target_properties(${memcpy_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_MEMCPY_REQUIRE}")
+ get_property(all GLOBAL PROPERTY memcpy_implementations)
+ list(APPEND all ${memcpy_name})
+ set_property(GLOBAL PROPERTY memcpy_implementations "${all}")
+endfunction()
+
+add_subdirectory(${LIBC_MEMCPY_IMPL_FOLDER})
+add_memcpy(memcpy MARCH native)
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
new file mode 100644
index 000000000000..216e22938e7f
--- /dev/null
+++ b/libc/src/string/memcpy.cpp
@@ -0,0 +1,22 @@
+//===--------------------- Implementation of memcpy -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memcpy_arch_specific.h"
+
+namespace __llvm_libc {
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+ const void *__restrict src, size_t size) {
+ memcpy_no_return(reinterpret_cast<char *>(dst),
+ reinterpret_cast<const char *>(src), size);
+ return dst;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/string/memcpy.h b/libc/src/string/memcpy.h
new file mode 100644
index 000000000000..a3ae4d40c874
--- /dev/null
+++ b/libc/src/string/memcpy.h
@@ -0,0 +1,21 @@
+//===----------------- Implementation header for memcpy -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H
+#define LLVM_LIBC_SRC_STRING_MEMCPY_H
+
+#include "include/string.h"
+#include <stddef.h> // size_t
+
+namespace __llvm_libc {
+
+void *memcpy(void *__restrict, const void *__restrict, size_t);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMCPY_H
diff --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
new file mode 100644
index 000000000000..a9bb35223ef8
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific.h.def
@@ -0,0 +1,65 @@
+//===-------------- Implementation of arch specific memcpy ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+
+%%include_file(${memcpy_arch_specific})
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+// Guided Optimization as the optimized code can take advantage of branching
+// probabilities.
+// - It also allows for easier customization and favors testing multiple
+// implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+// with little change on the code side.
+static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
+ size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return Copy<1>(dst, src);
+ if (count == 2)
+ return Copy<2>(dst, src);
+ if (count == 3)
+ return Copy<3>(dst, src);
+ if (count == 4)
+ return Copy<4>(dst, src);
+ if (count < 8)
+ return CopyOverlap<4>(dst, src, count);
+ if (count == 8)
+ return Copy<8>(dst, src);
+ if (count < 16)
+ return CopyOverlap<8>(dst, src, count);
+ if (count == 16)
+ return Copy<16>(dst, src);
+ if (count < 32)
+ return CopyOverlap<16>(dst, src, count);
+ if (count < 64)
+ return CopyOverlap<32>(dst, src, count);
+ if (count < 128)
+ return CopyOverlap<64>(dst, src, count);
+ CopyGE128(dst, src, count);
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 259ed0a75828..b826f1f68707 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -12,6 +12,9 @@ add_gen_header(
add_header_library(
memory_utils
- HDRS utils.h
- DEPENDS cacheline_size
+ HDRS
+ utils.h
+ memcpy_utils.h
+ DEPENDS
+ cacheline_size
)
diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
new file mode 100644
index 000000000000..c69e557574c0
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -0,0 +1,100 @@
+//===---------------------------- Memcpy utils ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+#define LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+
+#include "src/string/memory_utils/utils.h"
+#include <stddef.h> // size_t
+
+// __builtin_memcpy_inline guarantees to never call external functions.
+// Unfortunately it is not widely available.
+#if defined(__clang__) && __has_builtin(__builtin_memcpy_inline)
+#define USE_BUILTIN_MEMCPY_INLINE
+#elif defined(__GNUC__)
+#define USE_BUILTIN_MEMCPY
+#endif
+
+// This is useful for testing.
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
+ const char *__restrict, size_t);
+#endif
+
+namespace __llvm_libc {
+
+// Copies `kBlockSize` bytes from `src` to `dst`.
+template <size_t kBlockSize>
+static void Copy(char *__restrict dst, const char *__restrict src) {
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+ LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY_INLINE)
+ __builtin_memcpy_inline(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY)
+ __builtin_memcpy(dst, src, kBlockSize);
+#else
+ for (size_t i = 0; i < kBlockSize; ++i)
+ dst[i] = src[i];
+#endif
+}
+
+// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
+// `dst + count - kBlockSize`.
+// Precondition: `count >= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
+ size_t count) {
+ const size_t offset = count - kBlockSize;
+ Copy<kBlockSize>(dst + offset, src + offset);
+}
+
+// Copies `kBlockSize` bytes twice with an overlap between the two.
+//
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `count >= kBlockSize && count <= 2 * kBlockSize`.
+template <size_t kBlockSize>
+static void CopyOverlap(char *__restrict dst, const char *__restrict src,
+ size_t count) {
+ Copy<kBlockSize>(dst, src);
+ CopyLastBlock<kBlockSize>(dst, src, count);
+}
+
+// Copies `count` bytes by blocks of `kBlockSize` bytes.
+// Copies at the start and end of the buffer are unaligned.
+// Copies in the middle of the buffer are aligned to `kBlockSize`.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX______________________]
+// [________XXXXXXXX________________]
+// [________________XXXXXXXX________]
+// [_____________________XXXXXXXX___]
+//
+// Precondition: `count > 2 * kBlockSize` for efficiency.
+// `count >= kBlockSize` for correctness.
+template <size_t kBlockSize>
+static void CopyAligned(char *__restrict dst, const char *__restrict src,
+ size_t count) {
+ Copy<kBlockSize>(dst, src); // Copy first block
+
+ // Copy aligned blocks
+ size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
+ for (; offset + kBlockSize < count; offset += kBlockSize)
+ Copy<kBlockSize>(dst + offset, src + offset);
+
+ CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 33df113213b5..af9b6aeeee51 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -43,6 +43,11 @@ static constexpr size_t ge_power2(size_t value) {
return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
}
+template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
+ static_assert(is_power2(alignment), "alignment must be a power of 2");
+ return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
+}
+
template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
// The logic is not straightforward and involves unsigned modulo arithmetic
@@ -51,7 +56,7 @@ template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
}
// Returns the offset from `ptr` to the next cache line.
-static intptr_t offset_to_next_cache_line(const void *ptr) {
+static inline intptr_t offset_to_next_cache_line(const void *ptr) {
return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
}
diff --git a/libc/src/string/x86/CMakeLists.txt b/libc/src/string/x86/CMakeLists.txt
new file mode 100644
index 000000000000..b5365733fb80
--- /dev/null
+++ b/libc/src/string/x86/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_none" REJECT "${ALL_CPU_FEATURES}")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_sse" REQUIRE "SSE" REJECT "SSE2")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx" REQUIRE "AVX" REJECT "AVX2")
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}_opt_avx512f" REQUIRE "AVX512F")
diff --git a/libc/src/string/x86/memcpy_arch_specific.h.inc b/libc/src/string/x86/memcpy_arch_specific.h.inc
new file mode 100644
index 000000000000..ace98ba2e811
--- /dev/null
+++ b/libc/src/string/x86/memcpy_arch_specific.h.inc
@@ -0,0 +1,35 @@
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+ size_t count) {
+ // FIXME: Add MSVC support with
+ // #include <intrin.h>
+ // __movsb(reinterpret_cast<unsigned char *>(dst),
+ // reinterpret_cast<const unsigned char *>(src), count);
+ asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+#if defined(__AVX__)
+#define BEST_SIZE 64
+#else
+#define BEST_SIZE 32
+#endif
+
+static void CopyGE128(char *__restrict dst, const char *__restrict src,
+ size_t count) {
+#if defined(__AVX__)
+ if (count < 256)
+ return CopyOverlap<128>(dst, src, count);
+#endif
+ // kRepMovsBSize == -1 : Only CopyAligned is used (-1 wraps to the max size_t).
+ // kRepMovsBSize == 0 : Only RepMovsb is used.
+ // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+ constexpr size_t kRepMovsBSize = -1;
+ if (count <= kRepMovsBSize)
+ return CopyAligned<BEST_SIZE>(dst, src, count);
+ CopyRepMovsb(dst, src, count);
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
index 258937c7f4f6..43536e96d552 100644
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -22,3 +22,24 @@ add_libc_unittest(
DEPENDS
strcpy
)
+
+# Tests all implementations of memcpy that can run on the host.
+get_property(memcpy_implementations GLOBAL PROPERTY memcpy_implementations)
+foreach(memcpy_config_name IN LISTS memcpy_implementations)
+ get_target_property(require_cpu_features ${memcpy_config_name} REQUIRE_CPU_FEATURES)
+ host_supports(can_run "${require_cpu_features}")
+ if(can_run)
+ add_libc_unittest(
+ ${memcpy_config_name}_test
+ SUITE
+ libc_string_unittests
+ SRCS
+ memcpy_test.cpp
+ DEPENDS
+ ${memcpy_config_name}
+ )
+ else()
+ message(STATUS "Skipping test for '${memcpy_config_name}' insufficient host cpu features")
+ endif()
+endforeach()
+
diff --git a/libc/test/src/string/memcpy_test.cpp b/libc/test/src/string/memcpy_test.cpp
new file mode 100644
index 000000000000..c83cdb60fc96
--- /dev/null
+++ b/libc/test/src/string/memcpy_test.cpp
@@ -0,0 +1,53 @@
+//===----------------------- Unittests for memcpy -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "utils/CPP/ArrayRef.h"
+#include "utils/UnitTest/Test.h"
+#include "src/string/memcpy.h"
+
+using __llvm_libc::cpp::Array;
+using __llvm_libc::cpp::ArrayRef;
+using __llvm_libc::cpp::MutableArrayRef;
+using Data = Array<char, 2048>;
+
+static const ArrayRef<char> kNumbers("0123456789", 10);
+static const ArrayRef<char> kDeadcode("DEADC0DE", 8);
+
+// Returns a Data object filled with a repetition of `filler`.
+Data getData(ArrayRef<char> filler) {
+ Data out;
+ for (size_t i = 0; i < out.size(); ++i)
+ out[i] = filler[i % filler.size()];
+ return out;
+}
+
+TEST(MemcpyTest, Thorough) {
+ const Data groundtruth = getData(kNumbers);
+ const Data dirty = getData(kDeadcode);
+ for (size_t count = 0; count < 1024; ++count) {
+ for (size_t align = 0; align < 64; ++align) {
+ auto buffer = dirty;
+ const char *const src = groundtruth.data();
+ char *const dst = &buffer[align];
+ __llvm_libc::memcpy(dst, src, count);
+ // Everything before copy is untouched.
+ for (size_t i = 0; i < align; ++i)
+ ASSERT_EQ(buffer[i], dirty[i]);
+ // Everything in between is copied.
+ for (size_t i = 0; i < count; ++i)
+ ASSERT_EQ(buffer[align + i], groundtruth[i]);
+ // Everything after copy is untouched.
+ for (size_t i = align + count; i < dirty.size(); ++i)
+ ASSERT_EQ(buffer[i], dirty[i]);
+ }
+ }
+}
+
+// FIXME: Add tests with reads and writes on the boundary of a read/write
+// protected page to check we're not reading nor writing prior/past the allowed
+// regions.
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index e3ec8eb40cf3..c02fa865fcaa 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -4,7 +4,14 @@ add_libc_unittest(
libc_string_unittests
SRCS
utils_test.cpp
+ memcpy_utils_test.cpp
DEPENDS
memory_utils
standalone_cpp
)
+
+target_compile_definitions(
+ utils_test
+ PRIVATE
+ LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor
+)
diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
new file mode 100644
index 000000000000..813e86fe65db
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -0,0 +1,208 @@
+//===-------------------- Unittests for memory_utils ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memory_utils/memcpy_utils.h"
+#include "utils/CPP/Array.h"
+#include "utils/UnitTest/Test.h"
+
+#include <assert.h>
+#include <stdint.h> // uintptr_t
+
+#ifndef LLVM_LIBC_MEMCPY_MONITOR
+#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test.
+#endif
+
+namespace __llvm_libc {
+
+// Counts accesses per offset. Each counter is stored as an ASCII digit
+// starting at '0', so the whole record can be compared with a string literal.
+struct Buffer {
+  static constexpr size_t kMaxBuffer = 1024;
+  char buffer[kMaxBuffer + 1];
+  size_t last = 0;
+
+  // Resets every counter to '0', restores the final terminator and forgets the
+  // highest offset seen so far.
+  void Clear() {
+    for (char *cursor = buffer; cursor != buffer + kMaxBuffer; ++cursor)
+      *cursor = '0';
+    buffer[kMaxBuffer] = '\0';
+    last = 0;
+  }
+
+  // Bumps the counter selected by the integer value of `ptr` and remembers the
+  // highest offset touched.
+  void Increment(const void *ptr) {
+    const uintptr_t index = reinterpret_cast<uintptr_t>(ptr);
+    assert(index < kMaxBuffer);
+    ++buffer[index];
+    if (last < index)
+      last = index;
+  }
+
+  // Terminates the record right after the highest offset touched and returns
+  // it as a C string.
+  char *Finish() {
+    assert(last < kMaxBuffer);
+    char *const terminator = buffer + last + 1;
+    *terminator = '\0';
+    return buffer;
+  }
+};
+
+// Pairs a read record with a write record so that one copy can be checked
+// from both the source and the destination side.
+struct Trace {
+  Buffer read;
+  Buffer write;
+
+  // Records a copy of `count` bytes: every source byte is counted as read once
+  // and every destination byte as written once.
+  void Add(char *__restrict dst, const char *__restrict src, size_t count) {
+    for (size_t n = 0; n != count; ++n)
+      read.Increment(src + n);
+    for (size_t n = 0; n != count; ++n)
+      write.Increment(dst + n);
+  }
+
+  // Resets both records.
+  void Clear() {
+    write.Clear();
+    read.Clear();
+  }
+
+  // Returns the read record as a C string.
+  char *Read() {
+    char *const record = read.Finish();
+    return record;
+  }
+
+  // Returns the write record as a C string.
+  char *Write() {
+    char *const record = write.Finish();
+    return record;
+  }
+};
+
+// Returns the calling thread's Trace instance, created on first use.
+static Trace &GetTrace() {
+  static thread_local Trace trace;
+  return trace;
+}
+
+// Hook whose symbol name is supplied by LLVM_LIBC_MEMCPY_MONITOR. It moves no
+// bytes; it only records the requested copy in the calling thread's trace.
+// NOTE(review): presumably called by memcpy_utils.h for each block copy when
+// the macro is defined - confirm against that header.
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst,
+                                         const char *__restrict src,
+                                         size_t count) {
+  Trace &trace = GetTrace();
+  trace.Add(dst, src, count);
+}
+
+// Forges a pointer whose address is `offset`. The tests pass such pointers to
+// the copy helpers and compare the offsets recorded by the monitor.
+char *I(uintptr_t offset) {
+  char *const forged = reinterpret_cast<char *>(offset);
+  return forged;
+}
+
+// Copy<N> from offset 0 to offset 0 is expected to read and write each of the
+// first N bytes exactly once, for N = 1, 2, 4, ..., 64.
+TEST(MemcpyUtilsTest, CopyTrivial) {
+ auto &trace = GetTrace();
+
+ trace.Clear();
+ Copy<1>(I(0), I(0));
+ EXPECT_STREQ(trace.Write(), "1");
+ EXPECT_STREQ(trace.Read(), "1");
+
+ trace.Clear();
+ Copy<2>(I(0), I(0));
+ EXPECT_STREQ(trace.Write(), "11");
+ EXPECT_STREQ(trace.Read(), "11");
+
+ trace.Clear();
+ Copy<4>(I(0), I(0));
+ EXPECT_STREQ(trace.Write(), "1111");
+ EXPECT_STREQ(trace.Read(), "1111");
+
+ trace.Clear();
+ Copy<8>(I(0), I(0));
+ EXPECT_STREQ(trace.Write(), "11111111");
+ EXPECT_STREQ(trace.Read(), "11111111");
+
+ trace.Clear();
+ Copy<16>(I(0), I(0));
+ EXPECT_STREQ(trace.Write(), "1111111111111111");
+ EXPECT_STREQ(trace.Read(), "1111111111111111");
+
+ trace.Clear();
+ Copy<32>(I(0), I(0));
+ EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
+ EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
+
+ trace.Clear();
+ Copy<64>(I(0), I(0));
+ EXPECT_STREQ(
+ trace.Write(),
+ "1111111111111111111111111111111111111111111111111111111111111111");
+ EXPECT_STREQ(
+ trace.Read(),
+ "1111111111111111111111111111111111111111111111111111111111111111");
+}
+
+// With distinct destination and source offsets, the write trace must be
+// shifted by the destination offset (first argument) and the read trace by the
+// source offset (second argument).
+TEST(MemcpyUtilsTest, CopyOffset) {
+ auto &trace = GetTrace();
+
+ trace.Clear();
+ Copy<1>(I(3), I(1));
+ EXPECT_STREQ(trace.Write(), "0001");
+ EXPECT_STREQ(trace.Read(), "01");
+
+ trace.Clear();
+ Copy<1>(I(2), I(1));
+ EXPECT_STREQ(trace.Write(), "001");
+ EXPECT_STREQ(trace.Read(), "01");
+}
+
+// The expected traces correspond to two block-sized copies, one anchored at
+// the start of the range and one at its end: bytes covered by both blocks are
+// counted twice, all others once.
+TEST(MemcpyUtilsTest, CopyOverlap) {
+ auto &trace = GetTrace();
+
+ // count == block size: both blocks cover the same two bytes.
+ trace.Clear();
+ CopyOverlap<2>(I(0), I(0), 2);
+ EXPECT_STREQ(trace.Write(), "22");
+ EXPECT_STREQ(trace.Read(), "22");
+
+ // count == block size + 1: the blocks share the middle byte.
+ trace.Clear();
+ CopyOverlap<2>(I(0), I(0), 3);
+ EXPECT_STREQ(trace.Write(), "121");
+ EXPECT_STREQ(trace.Read(), "121");
+
+ // count == 2 * block size: the blocks are adjacent and do not overlap.
+ trace.Clear();
+ CopyOverlap<2>(I(0), I(0), 4);
+ EXPECT_STREQ(trace.Write(), "1111");
+ EXPECT_STREQ(trace.Read(), "1111");
+
+ // Non-zero offsets: destination starts at 2, source at 1; one byte overlaps.
+ trace.Clear();
+ CopyOverlap<4>(I(2), I(1), 7);
+ EXPECT_STREQ(trace.Write(), "001112111");
+ EXPECT_STREQ(trace.Read(), "01112111");
+}
+
+// Checks the access pattern of CopyAligned<4> over 13 bytes. Each diagram
+// below lists the individual 4-byte blocks expected in the write trace and
+// their sum, which is the string the trace is compared against.
+TEST(MemcpyUtilsTest, CopyAligned) {
+ auto &trace = GetTrace();
+ // Destination is aligned already.
+ // "1111000000000"
+ // + "0000111100000"
+ // + "0000000011110"
+ // + "0000000001111"
+ // = "1111111112221"
+ trace.Clear();
+ CopyAligned<4>(I(0), I(0), 13);
+ EXPECT_STREQ(trace.Write(), "1111111112221");
+ EXPECT_STREQ(trace.Read(), "1111111112221");
+
+ // Misaligned destination
+ // "01111000000000"
+ // + "00001111000000"
+ // + "00000000111100"
+ // + "00000000001111"
+ // = "01112111112211"
+ // The read trace is the same pattern without the leading '0' because the
+ // source starts at offset 0 while the destination starts at offset 1.
+ trace.Clear();
+ CopyAligned<4>(I(1), I(0), 13);
+ EXPECT_STREQ(trace.Write(), "01112111112211");
+ EXPECT_STREQ(trace.Read(), "1112111112211");
+}
+
+// For every destination misalignment in [0, 32) and every size in [64, 768),
+// checks that CopyAligned<32> leaves the bytes before the destination alone
+// and writes each destination byte at least once and at most twice.
+TEST(MemcpyUtilsTest, MaxReloads) {
+  auto &trace = GetTrace();
+  for (size_t alignment = 0; alignment < 32; ++alignment) {
+    for (size_t count = 64; count < 768; ++count) {
+      trace.Clear();
+      // We should never reload more than twice when copying from count = 2x32.
+      CopyAligned<32>(I(alignment), I(0), count);
+      const char *const written = trace.Write();
+      // First bytes are untouched.
+      for (size_t i = 0; i < alignment; ++i)
+        EXPECT_EQ(written[i], '0');
+      // The destination spans [alignment, alignment + count), so the check has
+      // to run up to alignment + count and not just count. Every byte in that
+      // range is written once or twice but no more.
+      for (size_t i = alignment; i < alignment + count; ++i) {
+        EXPECT_GE(written[i], '1');
+        EXPECT_LE(written[i], '2');
+      }
+    }
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
index 5a14cb1df8e4..c1564334c640 100644
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -87,6 +87,14 @@ TEST(UtilsTest, OffsetToNextAligned) {
EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
}
+// For the sampled addresses, offset_from_last_aligned<N> is expected to equal
+// the address modulo N, i.e. the distance from the previous N-byte boundary.
+TEST(UtilsTest, OffsetFromLastAligned) {
+ EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
+ EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
+ EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
+ EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
+ EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
+}
+
TEST(UtilsTest, OffsetToNextCacheLine) {
EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));
More information about the libc-commits
mailing list