[compiler-rt] [AArch64][compiler-rt] Add memcpy, memset, memmove, memchr simple implementation RT builtins. (PR #77496)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 22 02:41:53 PST 2024


https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/77496

>From 2db7921d5bedb3d5f834a7520dc3fee1e9903646 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Tue, 9 Jan 2024 16:37:12 +0000
Subject: [PATCH 1/6] [AArch64][compiler-rt] Add memcpy, memset, memmove,
 memchr simple implementation RT builtins.

Add naive implementations of memcpy, memset, memmove and memchr for SME targets.
Patch co-authored by David Sherwood <david.sherwood at arm.com>
---
 compiler-rt/lib/builtins/CMakeLists.txt       |   2 +-
 .../lib/builtins/aarch64/sme-libc-routines.c  | 102 ++++++++++
 .../test/builtins/Unit/sme-string-test.c      | 183 ++++++++++++++++++
 3 files changed, 286 insertions(+), 1 deletion(-)
 create mode 100644 compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
 create mode 100644 compiler-rt/test/builtins/Unit/sme-string-test.c

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index e5b52db175d960f..79d3af1a4fd8d86 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -561,7 +561,7 @@ set(aarch64_SOURCES
 )
 
 if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
-  list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c)
+  list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
   message(STATUS "AArch64 SME ABI routines enabled")
 else()
   message(STATUS "AArch64 SME ABI routines disabled")
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
new file mode 100644
index 000000000000000..c846daa51cc91fa
--- /dev/null
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -0,0 +1,102 @@
+#include <stdlib.h>
+
+// WARNING: When building the scalar versions of these functions you need to
+// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
+// from recognising a loop idiom and planting calls to memcpy!
+
+static void *
+__arm_sc_memcpy_fwd(void *dest, const void *src,
+                    size_t n) __arm_streaming_compatible __arm_preserves_za {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  for (size_t i = 0; i < n; i++) {
+    destp[i] = srcp[i];
+  }
+
+  return dest;
+}
+
+// If dest and src overlap then behaviour is undefined, hence we can add the
+// restrict keywords here. This also matches the definition of the libc memcpy
+// according to the man page.
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
+                      size_t n) __arm_streaming_compatible __arm_preserves_za {
+  return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+void *__arm_sc_memset(void *dest, int c,
+                      size_t n) __arm_streaming_compatible __arm_preserves_za {
+  unsigned char *destp = (unsigned char *)dest;
+  unsigned char c8 = (unsigned char)c;
+
+  for (size_t i = 0; i < n; i++) {
+    destp[i] = c8;
+  }
+
+  return dest;
+}
+
+static void *
+__arm_sc_memcpy_rev(void *dest, const void *src,
+                    size_t n) __arm_streaming_compatible __arm_preserves_za {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  // TODO: Improve performance by copying larger chunks in reverse, or by
+  // using SVE.
+  while (n > 0) {
+    n--;
+    destp[n] = srcp[n];
+  }
+  return dest;
+}
+
+// Semantically a memmove is equivalent to the following:
+//   1. Copy the entire contents of src to a temporary array that does not
+//      overlap with src or dest.
+//   2. Copy the contents of the temporary array into dest.
+void *__arm_sc_memmove(void *dest, const void *src,
+                       size_t n) __arm_streaming_compatible __arm_preserves_za {
+  unsigned char *destp = (unsigned char *)dest;
+  const unsigned char *srcp = (const unsigned char *)src;
+
+  // If src and dest are identical, or n is zero, there is nothing to do!
+  if ((destp == srcp) || (n == 0))
+    return destp;
+
+  // If src and dest don't overlap then just invoke memcpy
+  if ((srcp > (destp + n)) || (destp > (srcp + n)))
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 1:
+  //     src: Low     |   ->   |     High
+  //    dest: Low  |   ->   |        High
+  // Here src is always ahead of dest at a higher address. If we first read a
+  // chunk of data from src we can safely write the same chunk to dest without
+  // corrupting future reads of src.
+  if (srcp > destp)
+    return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 2:
+  //     src: Low  |   ->   |        High
+  //    dest: Low     |   ->   |     High
+  // While we're in the overlap region we're always corrupting future reads of
+  // src when writing to dest. An efficient way to avoid this is to copy the
+  // data in reverse by starting at the highest address.
+  return __arm_sc_memcpy_rev(dest, src, n);
+}
+
+const void *
+__arm_sc_memchr(const void *src, int c,
+                size_t n) __arm_streaming_compatible __arm_preserves_za {
+  const unsigned char *srcp = (const unsigned char *)src;
+  unsigned char c8 = (unsigned char)c;
+
+  for (size_t i = 0; i < n; i++) {
+    if (srcp[i] == c8)
+      return &srcp[i];
+  }
+
+  return NULL;
+}
diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.c b/compiler-rt/test/builtins/Unit/sme-string-test.c
new file mode 100644
index 000000000000000..0b066c3c179c95f
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/sme-string-test.c
@@ -0,0 +1,183 @@
+// REQUIRES: linux, aarch64-target-arch
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define N 1024
+#define NREPS 1234
+
+static uint8_t dst[N], src[N];
+
+extern void *__arm_sc_memcpy(void *, const void *, size_t);
+extern void *__arm_sc_memset(void *, int, size_t);
+extern void *__arm_sc_memmove(void *, const void *, size_t);
+extern void *__arm_sc_memchr(const void *, int, size_t);
+
+void init(void) {
+  for (int i = 0; i < N; i++) {
+    src[i] = i * 2;
+    dst[i] = i + 1;
+  }
+}
+
+void reinit_dst(int n) {
+  for (int i = 0; i < n; i++) {
+    dst[i] = i + 1;
+  }
+}
+
+int sum(uint8_t *dest, int n) {
+  int t = 0;
+  for (int i = 0; i < n; i++) {
+    t += dest[i];
+  }
+  return t;
+}
+
+long get_time_diff(struct timespec tv[2]) {
+  long us0 = (tv[0].tv_sec * 1000000) + (tv[0].tv_nsec / 1000);
+  long us1 = (tv[1].tv_sec * 1000000) + (tv[1].tv_nsec / 1000);
+  return us1 - us0;
+}
+
+int main() {
+  struct timespec tv[2];
+
+  init();
+
+  // Test correctness of memcpy
+  for (int i = 0; i < 67; i++) {
+    int t[2];
+    if (!__arm_sc_memcpy(dst, src, i)) {
+      fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
+      abort();
+    }
+    t[0] = sum(dst, N);
+    reinit_dst(i);
+    memcpy(dst, src, i);
+    t[1] = sum(dst, N);
+    reinit_dst(i);
+    if (t[0] != t[1]) {
+      fprintf(stderr, "__arm_sc_memcpy doesn't match memcpy behaviour!\n");
+      abort();
+    }
+  }
+
+#ifdef TEST_PERF
+  // Collect perf data for memcpy
+  clock_gettime(CLOCK_REALTIME, &tv[0]);
+  for (int r = 0; r < NREPS; r++) {
+    for (int i = 0; i < 67; i++) {
+      int t[2];
+      if (!__arm_sc_memcpy(dst, src, i)) {
+        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
+        abort();
+      }
+    }
+  }
+  reinit_dst(67);
+  clock_gettime(CLOCK_REALTIME, &tv[1]);
+  printf("memcpy time = %ld\n", get_time_diff(tv));
+#endif
+
+  // Test correctness of memset
+  for (int i = 0; i < 67; i++) {
+    int t[2];
+    if (!__arm_sc_memset(dst, src[i], i)) {
+      fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
+      abort();
+    }
+    t[0] = sum(dst, N);
+    reinit_dst(i);
+    memset(dst, src[i], i);
+    t[1] = sum(dst, N);
+    reinit_dst(i);
+    if (t[0] != t[1]) {
+      fprintf(stderr, "__arm_sc_memcpy doesn't match memset behaviour!\n");
+      abort();
+    }
+  }
+
+#ifdef TEST_PERF
+  // Collect perf data for memset
+  clock_gettime(CLOCK_REALTIME, &tv[0]);
+  for (int r = 0; r < NREPS; r++) {
+    for (int i = 0; i < 67; i++) {
+      if (!__arm_sc_memset(dst, src[i], i)) {
+        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
+        abort();
+      }
+    }
+  }
+  reinit_dst(67);
+  clock_gettime(CLOCK_REALTIME, &tv[1]);
+  printf("memset time = %ld\n", get_time_diff(tv));
+#endif
+
+  // Test correctness of memchr
+  for (int i = 0; i < 67; i++) {
+    for (int j = 0; j < 67; j++) {
+      uint8_t *t[2];
+      t[0] = __arm_sc_memchr(src, src[j], i);
+      t[1] = memchr(src, src[j], i);
+      if (t[0] != t[1]) {
+        fprintf(stderr, "__arm_sc_memchr doesn't match memchr behaviour!\n");
+        abort();
+      }
+    }
+  }
+
+#ifdef TEST_PERF
+  // Collect perf data for memchr
+  clock_gettime(CLOCK_REALTIME, &tv[0]);
+  for (int r = 0; r < NREPS; r++) {
+    for (int i = 0; i < 67; i++) {
+      for (int j = 0; j < 67; j++) {
+        __arm_sc_memchr(src, src[j], i);
+      }
+    }
+  }
+  clock_gettime(CLOCK_REALTIME, &tv[1]);
+  printf("memchr time = %ld\n", get_time_diff(tv));
+#endif
+
+  // Test correctness for memmove
+  for (int i = 0; i < 67; i++) {
+    for (int j = 0; j < 67; j++) {
+      int t[2];
+      if (!__arm_sc_memmove(&dst[66 - j], &dst[j], i)) {
+        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memmove!\n");
+        abort();
+      }
+      t[0] = sum(dst, N);
+      reinit_dst(200);
+      memmove(&dst[66 - j], &dst[j], i);
+      t[1] = sum(dst, N);
+      reinit_dst(200);
+      if (t[0] != t[1]) {
+        fprintf(stderr, "__arm_sc_memmove doesn't match memmove behaviour!\n");
+        abort();
+      }
+    }
+  }
+
+#ifdef TEST_PERF
+  // Collect perf data for memmove
+  clock_gettime(CLOCK_REALTIME, &tv[0]);
+  for (int r = 0; r < NREPS; r++) {
+    for (int i = 0; i < 67; i++) {
+      for (int j = 0; j < 67; j++) {
+        __arm_sc_memmove(&dst[66 - j], &dst[j], i);
+      }
+    }
+  }
+  clock_gettime(CLOCK_REALTIME, &tv[1]);
+  printf("memmove time = %ld\n", get_time_diff(tv));
+#endif
+
+  return 0;
+}

>From 7e2df5bdaf7517730c7023273dc86ec5b099fa04 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 12 Jan 2024 08:49:11 +0000
Subject: [PATCH 2/6] Addressed comments.

---
 .../lib/builtins/aarch64/sme-libc-routines.c  |  22 ++--
 .../test/builtins/Unit/sme-string-test.c      | 101 ++----------------
 compiler-rt/test/lit.common.cfg.py            |   3 +
 compiler-rt/test/lit.common.configured.in     |   1 +
 .../unittests/lit.common.unit.configured.in   |   1 +
 5 files changed, 22 insertions(+), 106 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
index c846daa51cc91fa..0e26a3ab030c85c 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -4,9 +4,8 @@
 // use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
 // from recognising a loop idiom and planting calls to memcpy!
 
-static void *
-__arm_sc_memcpy_fwd(void *dest, const void *src,
-                    size_t n) __arm_streaming_compatible __arm_preserves_za {
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
   unsigned char *destp = (unsigned char *)dest;
   const unsigned char *srcp = (const unsigned char *)src;
 
@@ -21,12 +20,11 @@ __arm_sc_memcpy_fwd(void *dest, const void *src,
 // restrict keywords here. This also matches the definition of the libc memcpy
 // according to the man page.
 void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
-                      size_t n) __arm_streaming_compatible __arm_preserves_za {
+                      size_t n) __arm_streaming_compatible {
   return __arm_sc_memcpy_fwd(dest, src, n);
 }
 
-void *__arm_sc_memset(void *dest, int c,
-                      size_t n) __arm_streaming_compatible __arm_preserves_za {
+void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
   unsigned char *destp = (unsigned char *)dest;
   unsigned char c8 = (unsigned char)c;
 
@@ -37,9 +35,8 @@ void *__arm_sc_memset(void *dest, int c,
   return dest;
 }
 
-static void *
-__arm_sc_memcpy_rev(void *dest, const void *src,
-                    size_t n) __arm_streaming_compatible __arm_preserves_za {
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+                                 size_t n) __arm_streaming_compatible {
   unsigned char *destp = (unsigned char *)dest;
   const unsigned char *srcp = (const unsigned char *)src;
 
@@ -57,7 +54,7 @@ __arm_sc_memcpy_rev(void *dest, const void *src,
 //      overlap with src or dest.
 //   2. Copy the contents of the temporary array into dest.
 void *__arm_sc_memmove(void *dest, const void *src,
-                       size_t n) __arm_streaming_compatible __arm_preserves_za {
+                       size_t n) __arm_streaming_compatible {
   unsigned char *destp = (unsigned char *)dest;
   const unsigned char *srcp = (const unsigned char *)src;
 
@@ -87,9 +84,8 @@ void *__arm_sc_memmove(void *dest, const void *src,
   return __arm_sc_memcpy_rev(dest, src, n);
 }
 
-const void *
-__arm_sc_memchr(const void *src, int c,
-                size_t n) __arm_streaming_compatible __arm_preserves_za {
+const void *__arm_sc_memchr(const void *src, int c,
+                            size_t n) __arm_streaming_compatible {
   const unsigned char *srcp = (const unsigned char *)src;
   unsigned char c8 = (unsigned char)c;
 
diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.c b/compiler-rt/test/builtins/Unit/sme-string-test.c
index 0b066c3c179c95f..51c1ad9ed02a7c8 100644
--- a/compiler-rt/test/builtins/Unit/sme-string-test.c
+++ b/compiler-rt/test/builtins/Unit/sme-string-test.c
@@ -1,14 +1,11 @@
-// REQUIRES: linux, aarch64-target-arch
+// REQUIRES: linux, aarch64-target-arch, sme-available
 // RUN: %clang_builtins %s %librt -o %t && %run %t
 
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <time.h>
 
 #define N 1024
-#define NREPS 1234
 
 static uint8_t dst[N], src[N];
 
@@ -38,146 +35,64 @@ int sum(uint8_t *dest, int n) {
   return t;
 }
 
-long get_time_diff(struct timespec tv[2]) {
-  long us0 = (tv[0].tv_sec * 1000000) + (tv[0].tv_nsec / 1000);
-  long us1 = (tv[1].tv_sec * 1000000) + (tv[1].tv_nsec / 1000);
-  return us1 - us0;
-}
-
 int main() {
-  struct timespec tv[2];
 
   init();
 
   // Test correctness of memcpy
   for (int i = 0; i < 67; i++) {
     int t[2];
-    if (!__arm_sc_memcpy(dst, src, i)) {
-      fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
+    if (!__arm_sc_memcpy(dst, src, i))
       abort();
-    }
     t[0] = sum(dst, N);
     reinit_dst(i);
     memcpy(dst, src, i);
     t[1] = sum(dst, N);
     reinit_dst(i);
-    if (t[0] != t[1]) {
-      fprintf(stderr, "__arm_sc_memcpy doesn't match memcpy behaviour!\n");
+    if (t[0] != t[1])
       abort();
-    }
   }
 
-#ifdef TEST_PERF
-  // Collect perf data for memcpy
-  clock_gettime(CLOCK_REALTIME, &tv[0]);
-  for (int r = 0; r < NREPS; r++) {
-    for (int i = 0; i < 67; i++) {
-      int t[2];
-      if (!__arm_sc_memcpy(dst, src, i)) {
-        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memcpy!\n");
-        abort();
-      }
-    }
-  }
-  reinit_dst(67);
-  clock_gettime(CLOCK_REALTIME, &tv[1]);
-  printf("memcpy time = %ld\n", get_time_diff(tv));
-#endif
-
   // Test correctness of memset
   for (int i = 0; i < 67; i++) {
     int t[2];
-    if (!__arm_sc_memset(dst, src[i], i)) {
-      fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
+    if (!__arm_sc_memset(dst, src[i], i))
       abort();
-    }
     t[0] = sum(dst, N);
     reinit_dst(i);
     memset(dst, src[i], i);
     t[1] = sum(dst, N);
     reinit_dst(i);
-    if (t[0] != t[1]) {
-      fprintf(stderr, "__arm_sc_memcpy doesn't match memset behaviour!\n");
+    if (t[0] != t[1])
       abort();
-    }
   }
 
-#ifdef TEST_PERF
-  // Collect perf data for memset
-  clock_gettime(CLOCK_REALTIME, &tv[0]);
-  for (int r = 0; r < NREPS; r++) {
-    for (int i = 0; i < 67; i++) {
-      if (!__arm_sc_memset(dst, src[i], i)) {
-        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memset!\n");
-        abort();
-      }
-    }
-  }
-  reinit_dst(67);
-  clock_gettime(CLOCK_REALTIME, &tv[1]);
-  printf("memset time = %ld\n", get_time_diff(tv));
-#endif
-
   // Test correctness of memchr
   for (int i = 0; i < 67; i++) {
     for (int j = 0; j < 67; j++) {
       uint8_t *t[2];
       t[0] = __arm_sc_memchr(src, src[j], i);
       t[1] = memchr(src, src[j], i);
-      if (t[0] != t[1]) {
-        fprintf(stderr, "__arm_sc_memchr doesn't match memchr behaviour!\n");
+      if (t[0] != t[1])
         abort();
-      }
-    }
-  }
-
-#ifdef TEST_PERF
-  // Collect perf data for memchr
-  clock_gettime(CLOCK_REALTIME, &tv[0]);
-  for (int r = 0; r < NREPS; r++) {
-    for (int i = 0; i < 67; i++) {
-      for (int j = 0; j < 67; j++) {
-        __arm_sc_memchr(src, src[j], i);
-      }
     }
   }
-  clock_gettime(CLOCK_REALTIME, &tv[1]);
-  printf("memchr time = %ld\n", get_time_diff(tv));
-#endif
 
   // Test correctness for memmove
   for (int i = 0; i < 67; i++) {
     for (int j = 0; j < 67; j++) {
       int t[2];
-      if (!__arm_sc_memmove(&dst[66 - j], &dst[j], i)) {
-        fprintf(stderr, "Unexpected NULL pointer from __arm_sc_memmove!\n");
+      if (!__arm_sc_memmove(&dst[66 - j], &dst[j], i))
         abort();
-      }
       t[0] = sum(dst, N);
       reinit_dst(200);
       memmove(&dst[66 - j], &dst[j], i);
       t[1] = sum(dst, N);
       reinit_dst(200);
-      if (t[0] != t[1]) {
-        fprintf(stderr, "__arm_sc_memmove doesn't match memmove behaviour!\n");
+      if (t[0] != t[1])
         abort();
-      }
-    }
-  }
-
-#ifdef TEST_PERF
-  // Collect perf data for memmove
-  clock_gettime(CLOCK_REALTIME, &tv[0]);
-  for (int r = 0; r < NREPS; r++) {
-    for (int i = 0; i < 67; i++) {
-      for (int j = 0; j < 67; j++) {
-        __arm_sc_memmove(&dst[66 - j], &dst[j], i);
-      }
     }
   }
-  clock_gettime(CLOCK_REALTIME, &tv[1]);
-  printf("memmove time = %ld\n", get_time_diff(tv));
-#endif
 
   return 0;
 }
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 1753a55508c7cff..d4960b3aabc2cc0 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -454,6 +454,9 @@ def get_ios_commands_dir():
 if config.has_lld:
     config.available_features.add("lld-available")
 
+if config.sme:
+    config.available_features.add("sme-available")
+
 if config.use_lld:
     config.available_features.add("lld")
 
diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in
index 7c2d53520099a19..8e5de4eb5b081d2 100644
--- a/compiler-rt/test/lit.common.configured.in
+++ b/compiler-rt/test/lit.common.configured.in
@@ -50,6 +50,7 @@ set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@)
 set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@)
 set_default("test_standalone_build_libs", @COMPILER_RT_TEST_STANDALONE_BUILD_LIBS_PYBOOL@)
 set_default("has_compiler_rt_libatomic", @COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL@)
+set_default("sme", @COMPILER_RT_HAS_ASM_SME@)
 # True iff the test suite supports ignoring the test compiler's runtime library path
 # and using `config.compiler_rt_libdir` instead. This only matters when the runtime
 # library paths differ.
diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in
index 3e42e83c9e70a2b..b26817246284788 100644
--- a/compiler-rt/unittests/lit.common.unit.configured.in
+++ b/compiler-rt/unittests/lit.common.unit.configured.in
@@ -7,6 +7,7 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
 config.compiler_rt_src_root = "@COMPILER_RT_SOURCE_DIR@"
 config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@")
+config.sme = "@COMPILER_RT_HAS_ASM_SME@"
 config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@
 config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@")
 config.host_arch = "@HOST_ARCH@"

>From 2504e890c8d557690059312bd872afbf3adbed76 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Tue, 16 Jan 2024 07:25:10 +0000
Subject: [PATCH 3/6] Addressed comments.

---
 compiler-rt/cmake/builtin-config-ix.cmake     |   6 +
 compiler-rt/lib/builtins/CMakeLists.txt       |   2 +-
 .../test/builtins/Unit/sme-string-test.c      |  98 ----------------
 .../test/builtins/Unit/sme-string-test.cpp    | 108 ++++++++++++++++++
 compiler-rt/test/lit.common.cfg.py            |   4 +-
 compiler-rt/test/lit.common.configured.in     |   2 +-
 .../unittests/lit.common.unit.configured.in   |   2 +-
 7 files changed, 119 insertions(+), 103 deletions(-)
 delete mode 100644 compiler-rt/test/builtins/Unit/sme-string-test.c
 create mode 100644 compiler-rt/test/builtins/Unit/sme-string-test.cpp

diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index b40138aa011f8f2..57ed4db7da6602a 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -41,6 +41,12 @@ asm(\".arch armv9-a+sme\");
 asm(\"smstart\");
 ")
 
+builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
+"
+void foo(int a)  __arm_streaming_compatible {
+}
+")
+
 if(ANDROID)
   set(OS_NAME "Android")
 else()
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 79d3af1a4fd8d86..f9b6cac678c4940 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -560,7 +560,7 @@ set(aarch64_SOURCES
   aarch64/fp_mode.c
 )
 
-if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
+if(COMPILER_RT_HAS_ASM_SME AND COMPILER_RT_HAS_AARCH64_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
   list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
   message(STATUS "AArch64 SME ABI routines enabled")
 else()
diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.c b/compiler-rt/test/builtins/Unit/sme-string-test.c
deleted file mode 100644
index 51c1ad9ed02a7c8..000000000000000
--- a/compiler-rt/test/builtins/Unit/sme-string-test.c
+++ /dev/null
@@ -1,98 +0,0 @@
-// REQUIRES: linux, aarch64-target-arch, sme-available
-// RUN: %clang_builtins %s %librt -o %t && %run %t
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-#define N 1024
-
-static uint8_t dst[N], src[N];
-
-extern void *__arm_sc_memcpy(void *, const void *, size_t);
-extern void *__arm_sc_memset(void *, int, size_t);
-extern void *__arm_sc_memmove(void *, const void *, size_t);
-extern void *__arm_sc_memchr(const void *, int, size_t);
-
-void init(void) {
-  for (int i = 0; i < N; i++) {
-    src[i] = i * 2;
-    dst[i] = i + 1;
-  }
-}
-
-void reinit_dst(int n) {
-  for (int i = 0; i < n; i++) {
-    dst[i] = i + 1;
-  }
-}
-
-int sum(uint8_t *dest, int n) {
-  int t = 0;
-  for (int i = 0; i < n; i++) {
-    t += dest[i];
-  }
-  return t;
-}
-
-int main() {
-
-  init();
-
-  // Test correctness of memcpy
-  for (int i = 0; i < 67; i++) {
-    int t[2];
-    if (!__arm_sc_memcpy(dst, src, i))
-      abort();
-    t[0] = sum(dst, N);
-    reinit_dst(i);
-    memcpy(dst, src, i);
-    t[1] = sum(dst, N);
-    reinit_dst(i);
-    if (t[0] != t[1])
-      abort();
-  }
-
-  // Test correctness of memset
-  for (int i = 0; i < 67; i++) {
-    int t[2];
-    if (!__arm_sc_memset(dst, src[i], i))
-      abort();
-    t[0] = sum(dst, N);
-    reinit_dst(i);
-    memset(dst, src[i], i);
-    t[1] = sum(dst, N);
-    reinit_dst(i);
-    if (t[0] != t[1])
-      abort();
-  }
-
-  // Test correctness of memchr
-  for (int i = 0; i < 67; i++) {
-    for (int j = 0; j < 67; j++) {
-      uint8_t *t[2];
-      t[0] = __arm_sc_memchr(src, src[j], i);
-      t[1] = memchr(src, src[j], i);
-      if (t[0] != t[1])
-        abort();
-    }
-  }
-
-  // Test correctness for memmove
-  for (int i = 0; i < 67; i++) {
-    for (int j = 0; j < 67; j++) {
-      int t[2];
-      if (!__arm_sc_memmove(&dst[66 - j], &dst[j], i))
-        abort();
-      t[0] = sum(dst, N);
-      reinit_dst(200);
-      memmove(&dst[66 - j], &dst[j], i);
-      t[1] = sum(dst, N);
-      reinit_dst(200);
-      if (t[0] != t[1])
-        abort();
-    }
-  }
-
-  return 0;
-}
diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.cpp b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
new file mode 100644
index 000000000000000..d207db67e3b9a87
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
@@ -0,0 +1,108 @@
+// REQUIRES: linux, aarch64-target-arch, aarch64-sme-available
+// RUN: %clangxx_builtins %s %librt -o %t && %run %t
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define N 16
+
+extern "C" {
+void *__arm_sc_memcpy(void *, const void *, size_t);
+void *__arm_sc_memset(void *, int, size_t);
+void *__arm_sc_memmove(void *, const void *, size_t);
+void *__arm_sc_memchr(const void *, int, size_t);
+}
+
+class MemoryArea {
+
+  uint8_t dst[N], src[N];
+
+  int sum_and_reset_dst(uint8_t *dest, int n, int j) {
+    int t = 0;
+    for (int i = 0; i < n; i++) {
+      t += dest[i];
+    }
+    for (int i = 0; i < j; i++) {
+      dst[i] = i + 1;
+    }
+    return t;
+  }
+
+public:
+  MemoryArea() {
+    for (int i = 0; i < N; i++) {
+      src[i] = i * 2;
+      dst[i] = i + 1;
+    }
+  }
+
+  // Test correctness of memcpy
+  void test_memcpy() {
+    for (int i = 0; i < 8; i++) {
+      int t[2];
+      if (!__arm_sc_memcpy(dst, src, i))
+        abort();
+      t[0] = sum_and_reset_dst(dst, N, i);
+      memcpy(dst, src, i);
+      t[1] = sum_and_reset_dst(dst, N, i);
+      if (t[0] != t[1])
+        abort();
+    }
+  }
+
+  // Test correctness of memset
+  void test_memset() {
+    for (int i = 0; i < 8; i++) {
+      int t[2];
+      if (!__arm_sc_memset(dst, src[i], i))
+        abort();
+      t[0] = sum_and_reset_dst(dst, N, i);
+      __arm_sc_memset(dst, src[i], i);
+      t[1] = sum_and_reset_dst(dst, N, i);
+      if (t[0] != t[1])
+        abort();
+    }
+  }
+
+  // Test correctness of memchr
+  void test_memchr() {
+    for (int i = 0; i < 8; i++) {
+      for (int j = 0; j < 8; j++) {
+        uint8_t *t[2];
+        t[0] = (uint8_t *)__arm_sc_memchr(src, src[j], i);
+        t[1] = (uint8_t *)__arm_sc_memchr(src, src[j], i);
+        if (t[0] != t[1])
+          abort();
+      }
+    }
+  }
+
+  // Test correctness for memmove
+  void test_memmove() {
+    for (int i = 0; i < 8; i++) {
+      for (int j = 0; j < 8; j++) {
+        int t[2];
+        if (!__arm_sc_memmove(&dst[8 - j], &dst[j], i))
+          abort();
+        t[0] = sum_and_reset_dst(dst, N, 16);
+        __arm_sc_memmove(&dst[8 - j], &dst[j], i);
+        t[1] = sum_and_reset_dst(dst, N, 16);
+        if (t[0] != t[1])
+          abort();
+      }
+    }
+  }
+};
+
+int main() {
+
+  MemoryArea MA = MemoryArea();
+
+  MA.test_memcpy();
+  MA.test_memset();
+  MA.test_memchr();
+  MA.test_memmove();
+
+  return 0;
+}
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index d4960b3aabc2cc0..113777b0ea8a196 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -454,8 +454,8 @@ def get_ios_commands_dir():
 if config.has_lld:
     config.available_features.add("lld-available")
 
-if config.sme:
-    config.available_features.add("sme-available")
+if config.aarch64_sme:
+    config.available_features.add("aarch64-sme-available")
 
 if config.use_lld:
     config.available_features.add("lld")
diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in
index 8e5de4eb5b081d2..b93e20e80a6ed5f 100644
--- a/compiler-rt/test/lit.common.configured.in
+++ b/compiler-rt/test/lit.common.configured.in
@@ -50,7 +50,7 @@ set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@)
 set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@)
 set_default("test_standalone_build_libs", @COMPILER_RT_TEST_STANDALONE_BUILD_LIBS_PYBOOL@)
 set_default("has_compiler_rt_libatomic", @COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL@)
-set_default("sme", @COMPILER_RT_HAS_ASM_SME@)
+set_default("aarch64_sme", @COMPILER_RT_HAS_AARCH64_SME@)
 # True iff the test suite supports ignoring the test compiler's runtime library path
 # and using `config.compiler_rt_libdir` instead. This only matters when the runtime
 # library paths differ.
diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in
index b26817246284788..0d4785e0f038752 100644
--- a/compiler-rt/unittests/lit.common.unit.configured.in
+++ b/compiler-rt/unittests/lit.common.unit.configured.in
@@ -7,7 +7,7 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@")
 config.compiler_rt_src_root = "@COMPILER_RT_SOURCE_DIR@"
 config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@")
-config.sme = "@COMPILER_RT_HAS_ASM_SME@"
+config.aarch64_sme = "@COMPILER_RT_HAS_AARCH64_SME@"
 config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@
 config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@")
 config.host_arch = "@HOST_ARCH@"

>From 5324c3f85a6f7597befd8e7916d5250eb483f42e Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 19 Jan 2024 07:39:11 +0000
Subject: [PATCH 4/6] Addressed comments.

---
 compiler-rt/cmake/builtin-config-ix.cmake     |   8 +-
 compiler-rt/lib/builtins/CMakeLists.txt       |   5 +-
 .../test/builtins/Unit/sme-string-test.cpp    | 174 +++++++++++-------
 3 files changed, 109 insertions(+), 78 deletions(-)

diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 57ed4db7da6602a..0aa789f5429dcfe 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -35,15 +35,11 @@ asm(\".arch armv8-a+lse\");
 asm(\"cas w0, w1, [x2]\");
 ")
 
-builtin_check_c_compiler_source(COMPILER_RT_HAS_ASM_SME
-"
-asm(\".arch armv9-a+sme\");
-asm(\"smstart\");
-")
-
 builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
 "
 void foo(int a)  __arm_streaming_compatible {
+  asm(\".arch armv9-a+sme\");
+  asm(\"smstart\");
 }
 ")
 
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index f9b6cac678c4940..2f53afd7173c889 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -560,9 +560,12 @@ set(aarch64_SOURCES
   aarch64/fp_mode.c
 )
 
-if(COMPILER_RT_HAS_ASM_SME AND COMPILER_RT_HAS_AARCH64_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
+if(COMPILER_RT_HAS_AARCH64_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
   list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
   message(STATUS "AArch64 SME ABI routines enabled")
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-mllvm -disable-loop-idiom-all")
+  endif()
 else()
   message(STATUS "AArch64 SME ABI routines disabled")
 endif()
diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.cpp b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
index d207db67e3b9a87..69b6dc7631c6f14 100644
--- a/compiler-rt/test/builtins/Unit/sme-string-test.cpp
+++ b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
@@ -1,12 +1,12 @@
-// REQUIRES: linux, aarch64-target-arch, aarch64-sme-available
+// REQUIRES: aarch64-target-arch, aarch64-sme-available
 // RUN: %clangxx_builtins %s %librt -o %t && %run %t
 
+#include <cassert>
+#include <initializer_list>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
-#define N 16
-
 extern "C" {
 void *__arm_sc_memcpy(void *, const void *, size_t);
 void *__arm_sc_memset(void *, int, size_t);
@@ -14,95 +14,127 @@ void *__arm_sc_memmove(void *, const void *, size_t);
 void *__arm_sc_memchr(const void *, int, size_t);
 }
 
-class MemoryArea {
-
-  uint8_t dst[N], src[N];
+template <unsigned N> class Memory {
+public:
+  uint8_t ptr[N];
 
-  int sum_and_reset_dst(uint8_t *dest, int n, int j) {
-    int t = 0;
-    for (int i = 0; i < n; i++) {
-      t += dest[i];
-    }
-    for (int i = 0; i < j; i++) {
-      dst[i] = i + 1;
+  Memory(int Stride = 0) {
+    if (Stride != 0) {
+      for (unsigned I = 0, Elem = 0; I < N; I++, Elem += Stride) {
+        ptr[I] = Elem;
+      }
     }
-    return t;
   }
 
-public:
-  MemoryArea() {
-    for (int i = 0; i < N; i++) {
-      src[i] = i * 2;
-      dst[i] = i + 1;
+  void assert_equal(const Memory &Other) {
+    for (unsigned I = 0; I < N; I++) {
+      assert(ptr[I] == Other.ptr[I]);
     }
   }
 
-  // Test correctness of memcpy
-  void test_memcpy() {
-    for (int i = 0; i < 8; i++) {
-      int t[2];
-      if (!__arm_sc_memcpy(dst, src, i))
-        abort();
-      t[0] = sum_and_reset_dst(dst, N, i);
-      memcpy(dst, src, i);
-      t[1] = sum_and_reset_dst(dst, N, i);
-      if (t[0] != t[1])
-        abort();
+  void assert_equal(std::initializer_list<uint8_t> S) {
+    assert(S.size() == N);
+    auto It = S.begin();
+    for (unsigned I = 0; I < N; I++) {
+      assert(ptr[I] == *It++);
     }
   }
 
-  // Test correctness of memset
-  void test_memset() {
-    for (int i = 0; i < 8; i++) {
-      int t[2];
-      if (!__arm_sc_memset(dst, src[i], i))
-        abort();
-      t[0] = sum_and_reset_dst(dst, N, i);
-      __arm_sc_memset(dst, src[i], i);
-      t[1] = sum_and_reset_dst(dst, N, i);
-      if (t[0] != t[1])
+  void assert_elemt_equal_at(unsigned I, uint8_t elem) {
+    assert(ptr[I] == elem);
+  }
+};
+
+int main() {
+
+  // Testing memcpy from Src to Dst.
+  {
+    Memory<8> Src(1);
+    Memory<8> Dst;
+    if (!__arm_sc_memcpy(Dst.ptr, Src.ptr, 8))
+      abort();
+    Dst.assert_equal(Src);
+    Dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7});
+  }
+
+  // Testing memcpy from Src to Dst with pointer offset.
+  {
+    Memory<8> Src(1);
+    Memory<8> Dst(1);
+    if (!__arm_sc_memcpy(Dst.ptr + 1, Src.ptr, 6))
+      abort();
+    Dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+  }
+
+  // Testing memchr.
+  {
+    Memory<8> Src(4);
+    for (unsigned I = 0; I < 8; I++) {
+      uint8_t E = Src.ptr[I];
+      uint8_t *Elem = (uint8_t *)memchr(Src.ptr, E, 8);
+      if (!Elem)
         abort();
+      Src.assert_elemt_equal_at(Elem - Src.ptr, *Elem);
+      assert(__arm_sc_memchr(Src.ptr, E, 8) == memchr(Src.ptr, E, 8));
     }
   }
 
-  // Test correctness of memchr
-  void test_memchr() {
-    for (int i = 0; i < 8; i++) {
-      for (int j = 0; j < 8; j++) {
-        uint8_t *t[2];
-        t[0] = (uint8_t *)__arm_sc_memchr(src, src[j], i);
-        t[1] = (uint8_t *)__arm_sc_memchr(src, src[j], i);
-        if (t[0] != t[1])
-          abort();
-      }
-    }
+  // Testing memset.
+  {
+    Memory<8> Array;
+    if (!__arm_sc_memset(Array.ptr, 2, 8))
+      abort();
+    Array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2});
+  }
+
+  // Testing memset with pointer offset.
+  {
+    Memory<8> Array(1);
+    if (!__arm_sc_memset(Array.ptr + 1, 2, 6))
+      abort();
+    Array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7});
   }
 
-  // Test correctness for memmove
-  void test_memmove() {
-    for (int i = 0; i < 8; i++) {
-      for (int j = 0; j < 8; j++) {
-        int t[2];
-        if (!__arm_sc_memmove(&dst[8 - j], &dst[j], i))
-          abort();
-        t[0] = sum_and_reset_dst(dst, N, 16);
-        __arm_sc_memmove(&dst[8 - j], &dst[j], i);
-        t[1] = sum_and_reset_dst(dst, N, 16);
-        if (t[0] != t[1])
-          abort();
+  // Testing memset with different pointer offset.
+  {
+    for (unsigned I = 0; I < 16; I++) {
+      Memory<16> Array(2);
+      if (!__arm_sc_memset(Array.ptr + I, I, 16 - I))
+        abort();
+
+      uint8_t OrigElem = 0;
+      for (unsigned J = 0; J < 16; J++) {
+        if (I == 0) {
+          Array.assert_elemt_equal_at(J, 0);
+        } else if (J < I) {
+          Array.assert_elemt_equal_at(J, OrigElem);
+        } else {
+          Array.assert_elemt_equal_at(J, (uint8_t)I);
+        }
+        OrigElem += 2;
       }
     }
   }
-};
 
-int main() {
-
-  MemoryArea MA = MemoryArea();
+  // Testing memmove with a simple non-overlap case.
+  {
+    Memory<8> Src(1);
+    Memory<8> Dst(1);
+    if (!__arm_sc_memmove(Dst.ptr + 1, Src.ptr, 6))
+      abort();
+    Dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+  }
 
-  MA.test_memcpy();
-  MA.test_memset();
-  MA.test_memchr();
-  MA.test_memmove();
+  // Testing memove with overlap pointers Dst > Src, Dst < Src.
+  {
+    Memory<8> SrcDst(1);
+    if (!__arm_sc_memmove(SrcDst.ptr + 1, SrcDst.ptr, 6))
+      abort();
+    SrcDst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+    if (!__arm_sc_memmove(SrcDst.ptr, SrcDst.ptr + 1, 6))
+      abort();
+    SrcDst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7});
+  }
 
   return 0;
 }

>From d31dcb1f6bbfc4f5034d06edc90dc624003e6327 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Fri, 19 Jan 2024 16:38:12 +0000
Subject: [PATCH 5/6] Addressed comments.

---
 compiler-rt/cmake/builtin-config-ix.cmake     |  2 +-
 compiler-rt/lib/builtins/CMakeLists.txt       |  6 +--
 .../test/builtins/Unit/sme-string-test.cpp    | 46 ++++++-------------
 3 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 0aa789f5429dcfe..b17c43bf6a68b89 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -37,7 +37,7 @@ asm(\"cas w0, w1, [x2]\");
 
 builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
 "
-void foo(int a)  __arm_streaming_compatible {
+void foo(void)  __arm_streaming_compatible {
   asm(\".arch armv9-a+sme\");
   asm(\"smstart\");
 }
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 2f53afd7173c889..645cf6c85d18b27 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -560,12 +560,10 @@ set(aarch64_SOURCES
   aarch64/fp_mode.c
 )
 
-if(COMPILER_RT_HAS_AARCH64_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
+if(COMPILER_RT_HAS_AARCH64_SME AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD))
   list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c)
   message(STATUS "AArch64 SME ABI routines enabled")
-  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-mllvm -disable-loop-idiom-all")
-  endif()
+  set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin")
 else()
   message(STATUS "AArch64 SME ABI routines disabled")
 endif()
diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.cpp b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
index 69b6dc7631c6f14..3446e5dd01144f9 100644
--- a/compiler-rt/test/builtins/Unit/sme-string-test.cpp
+++ b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
@@ -17,23 +17,24 @@ void *__arm_sc_memchr(const void *, int, size_t);
 template <unsigned N> class Memory {
 public:
   uint8_t ptr[N];
-
-  Memory(int Stride = 0) {
-    if (Stride != 0) {
-      for (unsigned I = 0, Elem = 0; I < N; I++, Elem += Stride) {
-        ptr[I] = Elem;
-      }
+  unsigned size;
+
+  Memory(unsigned Stride = 0) {
+    size = N;
+    if (Stride == 0)
+      return;
+    for (unsigned I = 0, Elem = 0; I < N; I++) {
+      ptr[I] = I * Stride;
     }
   }
 
   void assert_equal(const Memory &Other) {
-    for (unsigned I = 0; I < N; I++) {
-      assert(ptr[I] == Other.ptr[I]);
-    }
+    assert(N == Other.size);
+    assert(memcmp(ptr, Other.ptr, N) == 0);
   }
 
   void assert_equal(std::initializer_list<uint8_t> S) {
-    assert(S.size() == N);
+    assert(N == S.size());
     auto It = S.begin();
     for (unsigned I = 0; I < N; I++) {
       assert(ptr[I] == *It++);
@@ -75,7 +76,9 @@ int main() {
       if (!Elem)
         abort();
       Src.assert_elemt_equal_at(Elem - Src.ptr, *Elem);
-      assert(__arm_sc_memchr(Src.ptr, E, 8) == memchr(Src.ptr, E, 8));
+      for (unsigned I = 0; I < 8; ++I)
+        assert(__arm_sc_memchr(Src.ptr, Src.ptr[I], 8) ==
+               memchr(Src.ptr, Src.ptr[I], 8));
     }
   }
 
@@ -95,27 +98,6 @@ int main() {
     Array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7});
   }
 
-  // Testing memset with different pointer offset.
-  {
-    for (unsigned I = 0; I < 16; I++) {
-      Memory<16> Array(2);
-      if (!__arm_sc_memset(Array.ptr + I, I, 16 - I))
-        abort();
-
-      uint8_t OrigElem = 0;
-      for (unsigned J = 0; J < 16; J++) {
-        if (I == 0) {
-          Array.assert_elemt_equal_at(J, 0);
-        } else if (J < I) {
-          Array.assert_elemt_equal_at(J, OrigElem);
-        } else {
-          Array.assert_elemt_equal_at(J, (uint8_t)I);
-        }
-        OrigElem += 2;
-      }
-    }
-  }
-
   // Testing memmove with a simple non-overlap case.
   {
     Memory<8> Src(1);

>From 5d696f7142ec7f61ce52efb015300868e218f9ce Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <Dinar.Temirbulatov at arm.com>
Date: Mon, 22 Jan 2024 10:40:40 +0000
Subject: [PATCH 6/6] Resolved comments.

---
 .../lib/builtins/aarch64/sme-libc-routines.c  | 16 +--
 .../test/builtins/Unit/sme-string-test.cpp    | 98 +++++++++----------
 2 files changed, 52 insertions(+), 62 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
index 0e26a3ab030c85c..d974354dc40f8d3 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -8,10 +8,8 @@ static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
                                  size_t n) __arm_streaming_compatible {
   unsigned char *destp = (unsigned char *)dest;
   const unsigned char *srcp = (const unsigned char *)src;
-
-  for (size_t i = 0; i < n; i++) {
+  for (size_t i = 0; i < n; ++i)
     destp[i] = srcp[i];
-  }
 
   return dest;
 }
@@ -27,10 +25,8 @@ void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
 void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
   unsigned char *destp = (unsigned char *)dest;
   unsigned char c8 = (unsigned char)c;
-
-  for (size_t i = 0; i < n; i++) {
+  for (size_t i = 0; i < n; ++i)
     destp[i] = c8;
-  }
 
   return dest;
 }
@@ -39,11 +35,10 @@ static void *__arm_sc_memcpy_rev(void *dest, const void *src,
                                  size_t n) __arm_streaming_compatible {
   unsigned char *destp = (unsigned char *)dest;
   const unsigned char *srcp = (const unsigned char *)src;
-
   // TODO: Improve performance by copying larger chunks in reverse, or by
   // using SVE.
   while (n > 0) {
-    n--;
+    --n;
     destp[n] = srcp[n];
   }
   return dest;
@@ -57,7 +52,6 @@ void *__arm_sc_memmove(void *dest, const void *src,
                        size_t n) __arm_streaming_compatible {
   unsigned char *destp = (unsigned char *)dest;
   const unsigned char *srcp = (const unsigned char *)src;
-
   // If src and dest are identical there is nothing to do!
   if ((destp == srcp) || (n == 0))
     return destp;
@@ -88,11 +82,9 @@ const void *__arm_sc_memchr(const void *src, int c,
                             size_t n) __arm_streaming_compatible {
   const unsigned char *srcp = (const unsigned char *)src;
   unsigned char c8 = (unsigned char)c;
-
-  for (size_t i = 0; i < n; i++) {
+  for (size_t i = 0; i < n; ++i)
     if (srcp[i] == c8)
       return &srcp[i];
-  }
 
   return NULL;
 }
diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.cpp b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
index 3446e5dd01144f9..3bc4559f9ae0479 100644
--- a/compiler-rt/test/builtins/Unit/sme-string-test.cpp
+++ b/compiler-rt/test/builtins/Unit/sme-string-test.cpp
@@ -19,26 +19,24 @@ template <unsigned N> class Memory {
   uint8_t ptr[N];
   unsigned size;
 
-  Memory(unsigned Stride = 0) {
+  Memory(unsigned stride = 0) {
     size = N;
-    if (Stride == 0)
+    if (stride == 0)
       return;
-    for (unsigned I = 0, Elem = 0; I < N; I++) {
-      ptr[I] = I * Stride;
-    }
+    for (unsigned i = 0; i < N; i++)
+      ptr[i] = i * stride;
   }
 
-  void assert_equal(const Memory &Other) {
-    assert(N == Other.size);
-    assert(memcmp(ptr, Other.ptr, N) == 0);
+  void assert_equal(const Memory &other) {
+    assert(N == other.size);
+    assert(memcmp(ptr, other.ptr, N) == 0);
   }
 
-  void assert_equal(std::initializer_list<uint8_t> S) {
-    assert(N == S.size());
-    auto It = S.begin();
-    for (unsigned I = 0; I < N; I++) {
-      assert(ptr[I] == *It++);
-    }
+  void assert_equal(std::initializer_list<uint8_t> s) {
+    assert(N == s.size());
+    auto it = s.begin();
+    for (unsigned i = 0; i < N; ++i)
+      assert(ptr[i] == *it++);
   }
 
   void assert_elemt_equal_at(unsigned I, uint8_t elem) {
@@ -48,74 +46,74 @@ template <unsigned N> class Memory {
 
 int main() {
 
-  // Testing memcpy from Src to Dst.
+  // Testing memcpy from src to dst.
   {
-    Memory<8> Src(1);
-    Memory<8> Dst;
-    if (!__arm_sc_memcpy(Dst.ptr, Src.ptr, 8))
+    Memory<8> src(1);
+    Memory<8> dst;
+    if (!__arm_sc_memcpy(dst.ptr, src.ptr, 8))
       abort();
-    Dst.assert_equal(Src);
-    Dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7});
+    dst.assert_equal(src);
+    dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7});
   }
 
-  // Testing memcpy from Src to Dst with pointer offset.
+  // Testing memcpy from src to dst with pointer offset.
   {
-    Memory<8> Src(1);
-    Memory<8> Dst(1);
-    if (!__arm_sc_memcpy(Dst.ptr + 1, Src.ptr, 6))
+    Memory<8> src(1);
+    Memory<8> dst(1);
+    if (!__arm_sc_memcpy(dst.ptr + 1, src.ptr, 6))
       abort();
-    Dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+    dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
   }
 
   // Testing memchr.
   {
-    Memory<8> Src(4);
-    for (unsigned I = 0; I < 8; I++) {
-      uint8_t E = Src.ptr[I];
-      uint8_t *Elem = (uint8_t *)memchr(Src.ptr, E, 8);
-      if (!Elem)
+    Memory<8> src(4);
+    for (unsigned i = 0; i < 8; ++i) {
+      uint8_t e = src.ptr[i];
+      uint8_t *elem = (uint8_t *)memchr(src.ptr, e, 8);
+      if (!elem)
         abort();
-      Src.assert_elemt_equal_at(Elem - Src.ptr, *Elem);
-      for (unsigned I = 0; I < 8; ++I)
-        assert(__arm_sc_memchr(Src.ptr, Src.ptr[I], 8) ==
-               memchr(Src.ptr, Src.ptr[I], 8));
+      src.assert_elemt_equal_at(elem - src.ptr, *elem);
+      for (unsigned i = 0; i < 8; ++i)
+        assert(__arm_sc_memchr(src.ptr, src.ptr[i], 8) ==
+               memchr(src.ptr, src.ptr[i], 8));
     }
   }
 
   // Testing memset.
   {
-    Memory<8> Array;
-    if (!__arm_sc_memset(Array.ptr, 2, 8))
+    Memory<8> array;
+    if (!__arm_sc_memset(array.ptr, 2, 8))
       abort();
-    Array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2});
+    array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2});
   }
 
   // Testing memset with pointer offset.
   {
-    Memory<8> Array(1);
-    if (!__arm_sc_memset(Array.ptr + 1, 2, 6))
+    Memory<8> array(1);
+    if (!__arm_sc_memset(array.ptr + 1, 2, 6))
       abort();
-    Array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7});
+    array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7});
   }
 
   // Testing memmove with a simple non-overlap case.
   {
-    Memory<8> Src(1);
-    Memory<8> Dst(1);
-    if (!__arm_sc_memmove(Dst.ptr + 1, Src.ptr, 6))
+    Memory<8> src(1);
+    Memory<8> dst(1);
+    if (!__arm_sc_memmove(dst.ptr + 1, src.ptr, 6))
       abort();
-    Dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+    dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
   }
 
-  // Testing memove with overlap pointers Dst > Src, Dst < Src.
+  // Testing memmove with overlapping pointers: dst > src, then dst < src.
   {
-    Memory<8> SrcDst(1);
-    if (!__arm_sc_memmove(SrcDst.ptr + 1, SrcDst.ptr, 6))
+    Memory<8> srcdst(1);
+    if (!__arm_sc_memmove(srcdst.ptr + 1, srcdst.ptr, 6))
       abort();
-    SrcDst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
-    if (!__arm_sc_memmove(SrcDst.ptr, SrcDst.ptr + 1, 6))
+    srcdst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7});
+    if (!__arm_sc_memmove(srcdst.ptr, srcdst.ptr + 1, 6))
       abort();
-    SrcDst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7});
+    srcdst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7});
   }
 
   return 0;



More information about the llvm-commits mailing list