[compiler-rt] [AArch64][compiler-rt] Add memcpy, memset, memmove, memchr simple imp… (PR #77496)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 9 08:44:15 PST 2024


https://github.com/dtemirbulatov created https://github.com/llvm/llvm-project/pull/77496

…lementation RT builtins.

Add naive implementations of memcpy, memset, memmove and memchr for SME targets. Patch co-authored by David Sherwood <david.sherwood at arm.com>.

>From 1845737de1a2bbdac168bf4c93700a5402362d43 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Tue, 9 Jan 2024 16:37:12 +0000
Subject: [PATCH] [AArch64][compiler-rt] Add memcpy, memset, memmove, memchr
 simple implementation RT builtins.

Add naive implementations of memcpy, memset, memmove and memchr for SME targets.
Patch co-authored by David Sherwood <david.sherwood at arm.com>.
---
 compiler-rt/lib/builtins/CMakeLists.txt       |   2 +-
 .../lib/builtins/aarch64/sme-libc-routines.c  | 106 ++++++++++
 .../test/builtins/Unit/sme-string-test.c      | 183 ++++++++++++++++++
 3 files changed, 290 insertions(+), 1 deletion(-)
 create mode 100644 compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
 create mode 100644 compiler-rt/test/builtins/Unit/sme-string-test.c

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index e5b52db175d960..79d3af1a4fd8d8 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -561,7 +561,7 @@ set(aarch64_SOURCES
 )
 
 if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
-  list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c)
+  list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
   message(STATUS "AArch64 SME ABI routines enabled")
 else()
   message(STATUS "AArch64 SME ABI routines disabled")
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
new file mode 100644
index 00000000000000..b28f3bbb985d33
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -0,0 +1,106 @@
+#include <stdlib.h>
+
+// WARNING: When building the scalar versions of these functions you need to
+// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
+// from recognising a loop idiom and planting calls to memcpy!
+
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src, size_t n)
+    __arm_streaming_compatible __arm_preserves_za {
+  unsigned char *destp = (unsigned char *) dest;
+  const unsigned char *srcp = (const unsigned char *) src;
+
+  for (size_t i = 0; i < n; i++) {
+    destp[i] = srcp[i];
+  }
+
+  return dest;
+}
+
+
+// If dest and src overlap then behaviour is undefined, hence we can add the
+// restrict keywords here. This also matches the definition of the libc memcpy
+// according to the man page.
+void *__arm_sc_memcpy(void * __restrict__ dest, const void * __restrict__ src,
+                      size_t n)
+    __arm_streaming_compatible __arm_preserves_za {
+  return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+
+void *__arm_sc_memset(void *dest, int c, size_t n)
+    __arm_streaming_compatible __arm_preserves_za {
+  unsigned char *destp = (unsigned char *) dest;
+  unsigned char c8 = (unsigned char) c;
+
+  for (size_t i = 0; i < n; i++) {
+    destp[i] = c8;
+  }
+
+  return dest;
+}
+
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src, size_t n)
+    __arm_streaming_compatible __arm_preserves_za {
+  unsigned char *destp = (unsigned char *) dest;
+  const unsigned char *srcp = (const unsigned char *) src;
+
+  // TODO: Improve performance by copying larger chunks in reverse, or by
+  // using SVE.
+  while (n > 0) {
+    n--;
+    destp[n] = srcp[n];
+  }
+  return dest;
+}
+
+
+// Semantically a memmove is equivalent to the following:
+//   1. Copy the entire contents of src to a temporary array that does not
+//      overlap with src or dest.
+//   2. Copy the contents of the temporary array into dest.
+void *__arm_sc_memmove(void *dest, const void *src, size_t n)
+    __arm_streaming_compatible __arm_preserves_za {
+  unsigned char *destp = (unsigned char *) dest;
+  const unsigned char *srcp = (const unsigned char *) src;
+
+  // If src and dest are identical, or n is zero, there is nothing to do!
+  if ((destp == srcp) || (n == 0))
+    return destp;
+
+  // If src and dest don't overlap then a plain forward copy is sufficient
+  if ((srcp > (destp + n)) || (destp > (srcp + n)))
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 1:
+  //     src: Low     |   ->   |     High
+  //    dest: Low  |   ->   |        High
+  // Here src is always ahead of dest at a higher address. If we first read a
+  // chunk of data from src we can safely write the same chunk to dest without
+  // corrupting future reads of src.
+  if (srcp > destp)
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 2:
+  //     src: Low  |   ->   |        High
+  //    dest: Low     |   ->   |     High
+  // While we're in the overlap region we're always corrupting future reads of
+  // src when writing to dest. An efficient way to do this is to copy the data
+  // in reverse by starting at the highest address.
+  return __arm_sc_memcpy_rev(dest, src, n);
+}
+
+
+const void *__arm_sc_memchr(const void *src, int c, size_t n)
+    __arm_streaming_compatible __arm_preserves_za {
+  const unsigned char *srcp = (const unsigned char *) src;
+  unsigned char c8 = (unsigned char) c;
+
+  for (size_t i = 0; i < n; i++) {
+    if (srcp[i] == c8)
+      return &srcp[i];
+  }
+
+  return NULL;
+}
diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.c b/compiler-rt/test/builtins/Unit/sme-string-test.c
new file mode 100644
index 00000000000000..3401dcdbc31901
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/sme-string-test.c
@@ -0,0 +1,183 @@
+// REQUIRES: linux, aarch64-target-arch
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+
+#define N 1024
+#define NREPS 1234
+
+static uint8_t dst[N], src[N];
+
+extern void *__arm_sc_memcpy(void *, const void *, size_t);
+extern void *__arm_sc_memset(void *, int, size_t);
+extern void *__arm_sc_memmove(void *, const void *, size_t);
+extern void *__arm_sc_memchr(const void *, int, size_t);
+
+void init(void) {
+  for (int i = 0; i < N; i++) {
+    src[i] = i * 2;
+    dst[i] = i + 1;
+  }
+}
+
+void reinit_dst(int n) {
+  for (int i = 0; i < n; i++) {
+    dst[i] = i + 1;
+  }
+}
+
+int sum(uint8_t *dest, int n) {
+  int t = 0;
+  for (int i = 0; i < n; i++) {
+    t += dest[i];
+  }
+  return t;
+}
+
+long get_time_diff(struct timespec tv[2]) {
+  long us0 = (tv[0].tv_sec * 1000000) + (tv[0].tv_nsec / 1000);
+  long us1 = (tv[1].tv_sec * 1000000) + (tv[1].tv_nsec / 1000);
+  return us1 - us0;
+}
+
+int main() {
+  struct timespec tv[2];
+
+  init();
+
+  // Test correctness of memcpy
+  for (int i = 0; i < 67; i++) {
+    int t[2];
+    if (!__arm_sc_memcpy(dst, src, i)) {
+      fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
+      abort();
+    }
+    t[0] = sum(dst, N);
+    reinit_dst(i);
+    memcpy(dst, src, i);
+    t[1] = sum(dst, N);
+    reinit_dst(i);
+    if (t[0] != t[1]) {
+      fprintf(stderr, "__arm_sc_memcpy doesn't match memcpy behaviour!\n");
+      abort();
+    }
+  }
+
+#ifdef TEST_PERF
+  // Collect perf data for memcpy
+  clock_gettime(CLOCK_REALTIME, &tv[0]);
+  for (int r = 0; r < NREPS; r++) {
+    for (int i = 0; i < 67; i++) {
+      int t[2];
+      if (!__arm_sc_memcpy(dst, src, i)) {
+        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
+        abort();
+      }
+    }
+  }
+  reinit_dst(67);
+  clock_gettime(CLOCK_REALTIME, &tv[1]);
+  printf("memcpy time = %ld\n", get_time_diff(tv));
+#endif
+
+  // Test correctness of memset
+  for (int i = 0; i < 67; i++) {
+    int t[2];
+    if (!__arm_sc_memset(dst, src[i], i)) {
+      fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
+      abort();
+    }
+    t[0] = sum(dst, N);
+    reinit_dst(i);
+    memset(dst, src[i], i);
+    t[1] = sum(dst, N);
+    reinit_dst(i);
+    if (t[0] != t[1]) {
+      fprintf(stderr, "__arm_sc_memset doesn't match memset behaviour!\n");
+      abort();
+    }
+  }
+
+#ifdef TEST_PERF
+  // Collect perf data for memset
+  clock_gettime(CLOCK_REALTIME, &tv[0]);
+  for (int r = 0; r < NREPS; r++) {
+    for (int i = 0; i < 67; i++) {
+      if (!__arm_sc_memset(dst, src[i], i)) {
+        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
+        abort();
+      }
+    }
+  }
+  reinit_dst(67);
+  clock_gettime(CLOCK_REALTIME, &tv[1]);
+  printf("memset time = %ld\n", get_time_diff(tv));
+#endif
+
+  // Test correctness of memchr
+  for (int i = 0; i < 67; i++) {
+    for (int j = 0; j < 67; j++) {
+      uint8_t *t[2];
+      t[0] = __arm_sc_memchr(src, src[j], i);
+      t[1] = memchr(src, src[j], i);
+      if (t[0] != t[1]) {
+        fprintf(stderr, "__arm_sc_memchr doesn't match memchr behaviour!\n");
+        abort();
+      }
+    }
+  }
+
+#ifdef TEST_PERF
+  // Collect perf data for memchr
+  clock_gettime(CLOCK_REALTIME, &tv[0]);
+  for (int r = 0; r < NREPS; r++) {
+    for (int i = 0; i < 67; i++) {
+      for (int j = 0; j < 67; j++) {
+        __arm_sc_memchr(src, src[j], i);
+      }
+    }
+  }
+  clock_gettime(CLOCK_REALTIME, &tv[1]);
+  printf("memchr time = %ld\n", get_time_diff(tv));
+#endif
+
+  // Test correctness for memmove
+  for (int i = 0; i < 67; i++) {
+    for (int j = 0; j < 67; j++) {
+      int t[2];
+      if (!__arm_sc_memmove(&dst[66 - j], &dst[j], i)) {
+        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memmove!\n");
+        abort();
+      }
+      t[0] = sum(dst, N);
+      reinit_dst(200);
+      memmove(&dst[66 - j], &dst[j], i);
+      t[1] = sum(dst, N);
+      reinit_dst(200);
+      if (t[0] != t[1]) {
+        fprintf(stderr, "__arm_sc_memmove doesn't match memmove behaviour!\n");
+        abort();
+      }
+    }
+  }
+
+#ifdef TEST_PERF
+  // Collect perf data for memmove
+  clock_gettime(CLOCK_REALTIME, &tv[0]);
+  for (int r = 0; r < NREPS; r++) {
+    for (int i = 0; i < 67; i++) {
+      for (int j = 0; j < 67; j++) {
+        __arm_sc_memmove(&dst[66 - j], &dst[j], i);
+      }
+    }
+  }
+  clock_gettime(CLOCK_REALTIME, &tv[1]);
+  printf("memmove time = %ld\n", get_time_diff(tv));
+#endif
+
+  return 0;
+}



More information about the llvm-commits mailing list