[compiler-rt] 3112578 - [AArch64][compiler-rt] Add memcpy, memset, memmove, memchr builtins. (#77496)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 23 03:26:44 PST 2024


Author: Dinar Temirbulatov
Date: 2024-01-23T11:26:20Z
New Revision: 3112578597c031e6f00c4b126182bd0d8582c729

URL: https://github.com/llvm/llvm-project/commit/3112578597c031e6f00c4b126182bd0d8582c729
DIFF: https://github.com/llvm/llvm-project/commit/3112578597c031e6f00c4b126182bd0d8582c729.diff

LOG: [AArch64][compiler-rt] Add memcpy, memset, memmove, memchr builtins. (#77496)

Add naive implementations of memcpy, memset, memmove and memchr for SME
targets.
Co-authored-by: David Sherwood <david.sherwood at arm.com>

Added: 
    compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
    compiler-rt/test/builtins/Unit/sme-string-test.cpp

Modified: 
    compiler-rt/cmake/builtin-config-ix.cmake
    compiler-rt/lib/builtins/CMakeLists.txt
    compiler-rt/test/CMakeLists.txt
    compiler-rt/test/lit.common.cfg.py
    compiler-rt/test/lit.common.configured.in
    compiler-rt/unittests/lit.common.unit.configured.in

Removed: 
    


################################################################################
diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index b40138aa011f8f..b17c43bf6a68b8 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -35,10 +35,12 @@ asm(\".arch armv8-a+lse\");
 asm(\"cas w0, w1, [x2]\");
 ")
 
-builtin_check_c_compiler_source(COMPILER_RT_HAS_ASM_SME
+builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
 "
-asm(\".arch armv9-a+sme\");
-asm(\"smstart\");
+void foo(void)  __arm_streaming_compatible {
+  asm(\".arch armv9-a+sme\");
+  asm(\"smstart\");
+}
 ")
 
 if(ANDROID)

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 378884bcaf2e52..28ded8766f2533 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -560,9 +560,10 @@ set(aarch64_SOURCES
   aarch64/fp_mode.c
 )
 
-if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
-  list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c)
+if(COMPILER_RT_HAS_AARCH64_SME AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
+  list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
   message(STATUS "AArch64 SME ABI routines enabled")
+  set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
 else()
   message(STATUS "AArch64 SME ABI routines disabled")
 endif()

diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
new file mode 100644
index 00000000000000..cd73025a19cc1a
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -0,0 +1,87 @@
+#include <stdlib.h>
+
+// WARNING: When building the scalar versions of these functions you need to
+// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
+// from recognising a loop idiom and planting calls to memcpy!
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = srcp[i];
+
+  return dest;
+}
+
+// If dest and src overlap then behaviour is undefined, hence we can add the
+// restrict keywords here. This also matches the definition of the libc memcpy
+// according to the man page.
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
+                      size_t n) __arm_streaming_compatible {
+  return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  unsigned char c8 = (unsigned char)c;
+  for (size_t i = 0; i < n; ++i)
+    destp[i] = c8;
+
+  return dest;
+}
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+  // TODO: Improve performance by copying larger chunks in reverse, or by
+  // using SVE.
+  while (n > 0) {
+    --n;
+    destp[n] = srcp[n];
+  }
+  return dest;
+}
+
+// Semantically a memmove is equivalent to the following:
+//   1. Copy the entire contents of src to a temporary array that does not
+//      overlap with src or dest.
+//   2. Copy the contents of the temporary array into dest.
+void *__arm_sc_memmove(void *dest, const void *src,
+                       size_t n) __arm_streaming_compatible {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  // If src and dest don't overlap then just invoke memcpy
+  if ((srcp > (destp + n)) || (destp > (srcp + n)))
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 1:
+  //     src: Low     |   ->   |     High
+  //    dest: Low  |   ->   |        High
+  // Here src is always ahead of dest at a higher address. If we first read a
+  // chunk of data from src we can safely write the same chunk to dest without
+  // corrupting future reads of src.
+  if (srcp > destp)
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 2:
+  //     src: Low  |   ->   |        High
+  //    dest: Low     |   ->   |     High
+  // While we're in the overlap region we're always corrupting future reads of
+  // src when writing to dest. An efficient way to do this is to copy the data
+  // in reverse by starting at the highest address.
+  return __arm_sc_memcpy_rev(dest, src, n);
+}
+
+const void *__arm_sc_memchr(const void *src, int c,
+                            size_t n) __arm_streaming_compatible {
+  const unsigned char *srcp = (const unsigned char *)src;
+  unsigned char c8 = (unsigned char)c;
+  for (size_t i = 0; i < n; ++i)
+    if (srcp[i] == c8)
+      return &srcp[i];
+
+  return NULL;
+}

diff --git a/compiler-rt/test/CMakeLists.txt b/compiler-rt/test/CMakeLists.txt
index 7357604b1f651e..ee2ef907bcae45 100644
--- a/compiler-rt/test/CMakeLists.txt
+++ b/compiler-rt/test/CMakeLists.txt
@@ -18,6 +18,8 @@ pythonize_bool(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC)
 
 pythonize_bool(COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER)
 
+pythonize_bool(COMPILER_RT_HAS_AARCH64_SME)
+
 configure_compiler_rt_lit_site_cfg(
   ${CMAKE_CURRENT_SOURCE_DIR}/lit.common.configured.in
   ${CMAKE_CURRENT_BINARY_DIR}/lit.common.configured)

diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.cpp b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
new file mode 100644
index 00000000000000..3bc4559f9ae047
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
@@ -0,0 +1,120 @@
+// REQUIRES: aarch64-target-arch, aarch64-sme-available
+// RUN: %clangxx_builtins %s %librt -o %t && %run %t
+
+#include <cassert>
+#include <initializer_list>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+void *__arm_sc_memcpy(void *, const void *, size_t);
+void *__arm_sc_memset(void *, int, size_t);
+void *__arm_sc_memmove(void *, const void *, size_t);
+void *__arm_sc_memchr(const void *, int, size_t);
+}
+
+template <unsigned N> class Memory {
+public:
+  uint8_t ptr[N];
+  unsigned size;
+
+  Memory(unsigned stride = 0) {
+    size = N;
+    if (stride == 0)
+      return;
+    for (unsigned i = 0; i < N; i++)
+      ptr[i] = i * stride;
+  }
+
+  void assert_equal(const Memory &other) {
+    assert(N == other.size);
+    assert(memcmp(ptr, other.ptr, N) == 0);
+  }
+
+  void assert_equal(std::initializer_list<uint8_t> s) {
+    assert(N == s.size());
+    auto it = s.begin();
+    for (unsigned i = 0; i < N; ++i)
+      assert(ptr[i] == *it++);
+  }
+
+  void assert_elemt_equal_at(unsigned I, uint8_t elem) {
+    assert(ptr[I] == elem);
+  }
+};
+
+int main() {
+
+  // Testing memcpy from src to dst.
+  {
+    Memory<8> src(1);
+    Memory<8> dst;
+    if (!__arm_sc_memcpy(dst.ptr, src.ptr, 8))
+      abort();
+    dst.assert_equal(src);
+    dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7});
+  }
+
+  // Testing memcpy from src to dst with pointer offset.
+  {
+    Memory<8> src(1);
+    Memory<8> dst(1);
+    if (!__arm_sc_memcpy(dst.ptr + 1, src.ptr, 6))
+      abort();
+    dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+  }
+
+  // Testing memchr.
+  {
+    Memory<8> src(4);
+    for (unsigned i = 0; i < 8; ++i) {
+      uint8_t e = src.ptr[i];
+      uint8_t *elem = (uint8_t *)memchr(src.ptr, e, 8);
+      if (!elem)
+        abort();
+      src.assert_elemt_equal_at(elem - src.ptr, *elem);
+      for (unsigned i = 0; i < 8; ++i)
+        assert(__arm_sc_memchr(src.ptr, src.ptr[i], 8) ==
+               memchr(src.ptr, src.ptr[i], 8));
+    }
+  }
+
+  // Testing memset.
+  {
+    Memory<8> array;
+    if (!__arm_sc_memset(array.ptr, 2, 8))
+      abort();
+    array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2});
+  }
+
+  // Testing memset with pointer offset.
+  {
+    Memory<8> array(1);
+    if (!__arm_sc_memset(array.ptr + 1, 2, 6))
+      abort();
+    array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7});
+  }
+
+  // Testing memmove with a simple non-overlap case.
+  {
+    Memory<8> src(1);
+    Memory<8> dst(1);
+    if (!__arm_sc_memmove(dst.ptr + 1, src.ptr, 6))
+      abort();
+    dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+  }
+
+  // Testing memmove with overlapping pointers: dst > src, then dst < src.
+  {
+    Memory<8> srcdst(1);
+    if (!__arm_sc_memmove(srcdst.ptr + 1, srcdst.ptr, 6))
+      abort();
+    srcdst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+    if (!__arm_sc_memmove(srcdst.ptr, srcdst.ptr + 1, 6))
+      abort();
+    srcdst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7});
+  }
+
+  return 0;
+}

diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 1753a55508c7cf..113777b0ea8a19 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -454,6 +454,9 @@ def get_ios_commands_dir():
 if config.has_lld:
     config.available_features.add("lld-available")
 
+if config.aarch64_sme:
+    config.available_features.add("aarch64-sme-available")
+
 if config.use_lld:
     config.available_features.add("lld")
 

diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in
index 7c2d53520099a1..b93e20e80a6ed5 100644
--- a/compiler-rt/test/lit.common.configured.in
+++ b/compiler-rt/test/lit.common.configured.in
@@ -50,6 +50,7 @@ set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@)
 set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@)
 set_default("test_standalone_build_libs", @COMPILER_RT_TEST_STANDALONE_BUILD_LIBS_PYBOOL@)
 set_default("has_compiler_rt_libatomic", @COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL@)
+set_default("aarch64_sme", @COMPILER_RT_HAS_AARCH64_SME@)
 # True iff the test suite supports ignoring the test compiler's runtime library path
 # and using `config.compiler_rt_libdir` instead. This only matters when the runtime
 # library paths differ.

diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in
index 3e42e83c9e70a2..23ec222697712e 100644
--- a/compiler-rt/unittests/lit.common.unit.configured.in
+++ b/compiler-rt/unittests/lit.common.unit.configured.in
@@ -7,6 +7,7 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
 config.compiler_rt_src_root = "@COMPILER_RT_SOURCE_DIR@"
 config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@")
+config.aarch64_sme = @COMPILER_RT_HAS_AARCH64_SME@
 config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@
 config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@")
 config.host_arch = "@HOST_ARCH@"


        


More information about the llvm-commits mailing list