[compiler-rt] [libunwind] [AArch64] Fix nofp regressions in compiler-rt and libunwind (PR #111235)
Keith Packard via cfe-commits
cfe-commits at lists.llvm.org
Sat Oct 5 16:58:26 PDT 2024
https://github.com/keith-packard updated https://github.com/llvm/llvm-project/pull/111235
From 048cb1b0ea65fb758e104376c0bff345eab67623 Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp at keithp.com>
Date: Fri, 4 Oct 2024 21:06:37 -0700
Subject: [PATCH 1/2] [libunwind] Support aarch64 without FPU
ldp and stp instructions both require an FPU. Use pairs of ldr or str
instructions when the target doesn't have one.
Signed-off-by: Keith Packard <keithp at keithp.com>
---
libunwind/src/UnwindRegistersRestore.S | 37 +++++++++++++++----------
libunwind/src/UnwindRegistersSave.S | 38 ++++++++++++++++----------
2 files changed, 46 insertions(+), 29 deletions(-)
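For context, the LDP() macro introduced below expands to a single ldp when the target has an FPU, and to two plain ldr instructions otherwise. A sketch of the no-FPU expansion (illustrative only, not part of the patch):

    // LDP(x2, x3, x0, #0x010, #0x018) expands to:
    ldr x3, [x0, #0x018]
    ldr x2, [x0, #0x010]

The second register is loaded first so that the final LDP(x0, x1, x0, #0x000, #0x008) reads both values before the base register x0 is overwritten.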
diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S
index 180a66582f41b5..9422a21503ebc0 100644
--- a/libunwind/src/UnwindRegistersRestore.S
+++ b/libunwind/src/UnwindRegistersRestore.S
@@ -633,6 +633,13 @@ Lnovec:
.arch_extension gcs
#endif
+#if defined(__ARM_FP) && __ARM_FP != 0
+#define LDP(a,b,r,o,p) ldp a, b, [r, o]
+#else
+/* In reverse order so that the final LDP(x0, x1, x0, ...) loads the base register x0 last. */
+#define LDP(a,b,r,o,p) ldr b, [r, p] ; ldr a, [r, o]
+#endif
+
//
// extern "C" void __libunwind_Registers_arm64_jumpto(Registers_arm64 *);
//
@@ -642,23 +649,24 @@ Lnovec:
.p2align 2
DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
// skip restore of x0,x1 for now
- ldp x2, x3, [x0, #0x010]
- ldp x4, x5, [x0, #0x020]
- ldp x6, x7, [x0, #0x030]
- ldp x8, x9, [x0, #0x040]
- ldp x10,x11, [x0, #0x050]
- ldp x12,x13, [x0, #0x060]
- ldp x14,x15, [x0, #0x070]
+ LDP(x2, x3, x0, #0x010, #0x018)
+ LDP(x4, x5, x0, #0x020, #0x028)
+ LDP(x6, x7, x0, #0x030, #0x038)
+ LDP(x8, x9, x0, #0x040, #0x048)
+ LDP(x10, x11, x0, #0x050, #0x058)
+ LDP(x12, x13, x0, #0x060, #0x068)
+ LDP(x14, x15, x0, #0x070, #0x078)
// x16 and x17 were clobbered by the call into the unwinder, so no point in
// restoring them.
- ldp x18,x19, [x0, #0x090]
- ldp x20,x21, [x0, #0x0A0]
- ldp x22,x23, [x0, #0x0B0]
- ldp x24,x25, [x0, #0x0C0]
- ldp x26,x27, [x0, #0x0D0]
- ldp x28,x29, [x0, #0x0E0]
+ LDP(x18, x19, x0, #0x090, #0x098)
+ LDP(x20, x21, x0, #0x0A0, #0x0A8)
+ LDP(x22, x23, x0, #0x0B0, #0x0B8)
+ LDP(x24, x25, x0, #0x0C0, #0x0C8)
+ LDP(x26, x27, x0, #0x0D0, #0x0D8)
+ LDP(x28, x29, x0, #0x0E0, #0x0E8)
ldr x30, [x0, #0x100] // restore pc into lr
+#if defined(__ARM_FP) && __ARM_FP != 0
ldp d0, d1, [x0, #0x110]
ldp d2, d3, [x0, #0x120]
ldp d4, d5, [x0, #0x130]
@@ -676,13 +684,14 @@ DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
ldp d28,d29, [x0, #0x1F0]
ldr d30, [x0, #0x200]
ldr d31, [x0, #0x208]
+#endif
// Finally, restore sp. This must be done after the last read from the
// context struct, because it is allocated on the stack, and an exception
// could clobber the de-allocated portion of the stack after sp has been
// restored.
ldr x16, [x0, #0x0F8]
- ldp x0, x1, [x0, #0x000] // restore x0,x1
+ LDP(x0, x1, x0, #0x000, #0x008) // restore x0,x1
mov sp,x16 // restore sp
#if defined(__ARM_FEATURE_GCS_DEFAULT)
// If GCS is enabled we need to push the address we're returning to onto the
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index fab234fcd6f318..19c7ed20a8cf67 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -718,6 +718,12 @@ LnoR2Fix:
#elif defined(__aarch64__)
+#if defined(__ARM_FP) && __ARM_FP != 0
+#define STP(a,b,r,o,p) stp a, b, [r, o]
+#else
+#define STP(a,b,r,o,p) str a, [r, o] ; str b, [r, p]
+#endif
+
//
// extern int __unw_getcontext(unw_context_t* thread_state)
//
@@ -726,26 +732,27 @@ LnoR2Fix:
//
.p2align 2
DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
- stp x0, x1, [x0, #0x000]
- stp x2, x3, [x0, #0x010]
- stp x4, x5, [x0, #0x020]
- stp x6, x7, [x0, #0x030]
- stp x8, x9, [x0, #0x040]
- stp x10,x11, [x0, #0x050]
- stp x12,x13, [x0, #0x060]
- stp x14,x15, [x0, #0x070]
- stp x16,x17, [x0, #0x080]
- stp x18,x19, [x0, #0x090]
- stp x20,x21, [x0, #0x0A0]
- stp x22,x23, [x0, #0x0B0]
- stp x24,x25, [x0, #0x0C0]
- stp x26,x27, [x0, #0x0D0]
- stp x28,x29, [x0, #0x0E0]
+ STP(x0, x1, x0, #0x000, #0x008)
+ STP(x2, x3, x0, #0x010, #0x018)
+ STP(x4, x5, x0, #0x020, #0x028)
+ STP(x6, x7, x0, #0x030, #0x038)
+ STP(x8, x9, x0, #0x040, #0x048)
+ STP(x10, x11, x0, #0x050, #0x058)
+ STP(x12, x13, x0, #0x060, #0x068)
+ STP(x14, x15, x0, #0x070, #0x078)
+ STP(x16, x17, x0, #0x080, #0x088)
+ STP(x18, x19, x0, #0x090, #0x098)
+ STP(x20, x21, x0, #0x0A0, #0x0A8)
+ STP(x22, x23, x0, #0x0B0, #0x0B8)
+ STP(x24, x25, x0, #0x0C0, #0x0C8)
+ STP(x26, x27, x0, #0x0D0, #0x0D8)
+ STP(x28, x29, x0, #0x0E0, #0x0E8)
str x30, [x0, #0x0F0]
mov x1,sp
str x1, [x0, #0x0F8]
str x30, [x0, #0x100] // store return address as pc
// skip cpsr
+#if defined(__ARM_FP) && __ARM_FP != 0
stp d0, d1, [x0, #0x110]
stp d2, d3, [x0, #0x120]
stp d4, d5, [x0, #0x130]
@@ -763,6 +770,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
stp d28,d29, [x0, #0x1F0]
str d30, [x0, #0x200]
str d31, [x0, #0x208]
+#endif
mov x0, #0 // return UNW_ESUCCESS
ret
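One detail worth noting: unlike the load case, the no-FPU STP() expansion in UnwindRegistersSave.S above can store in forward order, because str only reads the base register, so even STP(x0, x1, x0, #0x000, #0x008) has no clobber hazard (illustrative expansion, not part of the patch):

    // STP(x0, x1, x0, #0x000, #0x008) expands to:
    str x0, [x0, #0x000]
    str x1, [x0, #0x008]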
From 4177bf11aa1a85a20f3f9f644868e18b538b6907 Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp at keithp.com>
Date: Fri, 4 Oct 2024 21:08:17 -0700
Subject: [PATCH 2/2] [compiler-rt] Support aarch64 targets without FPU
Fall back to the old C implementations of various routines when
the target doesn't have an FPU.
Signed-off-by: Keith Packard <keithp at keithp.com>
---
.../builtins/aarch64/sme-libc-mem-routines.S | 2 +-
.../lib/builtins/aarch64/sme-libc-routines.c | 77 +++++++++++++++++++
2 files changed, 78 insertions(+), 1 deletion(-)
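The C fallbacks below are selected whenever __ARM_FP is zero or undefined. A hypothetical compile invocation for such a target might look like this (the target triple and -march flags are assumptions; the -mllvm flag comes from the warning comment in the source below):

    clang --target=aarch64-none-elf -march=armv8-a+nofp \
        -mllvm -disable-loop-idiom-all -c sme-libc-routines.c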
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
index 0318d9a6f1ebd2..72d87fb4fa8586 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -6,7 +6,7 @@
#include "../assembly.h"
-#ifdef __aarch64__
+#if defined(__aarch64__) && __ARM_FP != 0
#define L(l) .L ## l
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
index 315490e73ea2b1..92fb953c03a376 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -1,5 +1,82 @@
#include <stddef.h>
+#if __ARM_FP == 0
+// WARNING: When building the scalar versions of these functions you need to
+// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
+// from recognising a loop idiom and planting calls to memcpy!
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+ for (size_t i = 0; i < n; ++i)
+ destp[i] = srcp[i];
+
+ return dest;
+}
+
+// If dest and src overlap then behaviour is undefined, hence we can add the
+// restrict keywords here. This also matches the definition of the libc memcpy
+// according to the man page.
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
+ size_t n) __arm_streaming_compatible {
+ return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ unsigned char c8 = (unsigned char)c;
+ for (size_t i = 0; i < n; ++i)
+ destp[i] = c8;
+
+ return dest;
+}
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+ // TODO: Improve performance by copying larger chunks in reverse, or by
+ // using SVE.
+ while (n > 0) {
+ --n;
+ destp[n] = srcp[n];
+ }
+ return dest;
+}
+
+// Semantically a memmove is equivalent to the following:
+// 1. Copy the entire contents of src to a temporary array that does not
+// overlap with src or dest.
+// 2. Copy the contents of the temporary array into dest.
+void *__arm_sc_memmove(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+
+ // If src and dest don't overlap then just invoke memcpy
+ if ((srcp > (destp + n)) || (destp > (srcp + n)))
+ return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 1:
+  //   src :      Low |   ->   | High
+  //   dest: Low |   ->   | High
+  // Here src is always ahead of dest at a higher address. If we first read a
+  // chunk of data from src we can safely write the same chunk to dest without
+  // corrupting future reads of src.
+ if (srcp > destp)
+ return __arm_sc_memcpy_fwd(dest, src, n);
+
+  // Overlap case 2:
+  //   src : Low |   ->   | High
+  //   dest:      Low |   ->   | High
+  // While we're in the overlap region, writing to dest always corrupts future
+  // reads of src. A safe way to handle this is to copy the data in reverse,
+  // starting at the highest address.
+ return __arm_sc_memcpy_rev(dest, src, n);
+}
+#endif
+
const void *__arm_sc_memchr(const void *src, int c,
size_t n) __arm_streaming_compatible {
const unsigned char *srcp = (const unsigned char *)src;
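As a usage illustration of the overlap handling in __arm_sc_memmove above (hypothetical values, not part of the patch):

    unsigned char buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    // dest (buf + 2) overlaps src (buf) at a higher address, so the copy
    // must run in reverse via __arm_sc_memcpy_rev:
    __arm_sc_memmove(buf + 2, buf, 6);
    // buf now holds {1, 2, 1, 2, 3, 4, 5, 6}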