[compiler-rt] [libunwind] [AArch64] Fix nofp regressions in compiler-rt and libunwind (PR #111235)
Keith Packard via cfe-commits
cfe-commits at lists.llvm.org
Mon Oct 7 14:51:52 PDT 2024
https://github.com/keith-packard updated https://github.com/llvm/llvm-project/pull/111235
>From f0546a52bc25327007f75e7267654a858637a76d Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp at keithp.com>
Date: Fri, 4 Oct 2024 21:06:37 -0700
Subject: [PATCH 1/2] Support aarch64 without FPU
Skip save/restore of FPU registers on targets without them.
Signed-off-by: Keith Packard <keithp at keithp.com>
---
libunwind/src/UnwindRegistersRestore.S | 4 ++--
libunwind/src/UnwindRegistersSave.S | 2 ++
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S
index 180a66582f41b5..1702d016c368ba 100644
--- a/libunwind/src/UnwindRegistersRestore.S
+++ b/libunwind/src/UnwindRegistersRestore.S
@@ -658,7 +658,7 @@ DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
ldp x26,x27, [x0, #0x0D0]
ldp x28,x29, [x0, #0x0E0]
ldr x30, [x0, #0x100] // restore pc into lr
-
+#if defined(__ARM_FP) && __ARM_FP != 0
ldp d0, d1, [x0, #0x110]
ldp d2, d3, [x0, #0x120]
ldp d4, d5, [x0, #0x130]
@@ -676,7 +676,7 @@ DEFINE_LIBUNWIND_FUNCTION(__libunwind_Registers_arm64_jumpto)
ldp d28,d29, [x0, #0x1F0]
ldr d30, [x0, #0x200]
ldr d31, [x0, #0x208]
-
+#endif
// Finally, restore sp. This must be done after the last read from the
// context struct, because it is allocated on the stack, and an exception
// could clobber the de-allocated portion of the stack after sp has been
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index fab234fcd6f318..a489a8ba6df159 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -746,6 +746,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
str x1, [x0, #0x0F8]
str x30, [x0, #0x100] // store return address as pc
// skip cpsr
+#if defined(__ARM_FP) && __ARM_FP != 0
stp d0, d1, [x0, #0x110]
stp d2, d3, [x0, #0x120]
stp d4, d5, [x0, #0x130]
@@ -763,6 +764,7 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
stp d28,d29, [x0, #0x1F0]
str d30, [x0, #0x200]
str d31, [x0, #0x208]
+#endif
mov x0, #0 // return UNW_ESUCCESS
ret
>From 43aa8d2b859314fcee174febb24fe72f80bd241c Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp at keithp.com>
Date: Fri, 4 Oct 2024 21:08:17 -0700
Subject: [PATCH 2/2] Support aarch64 targets without FPU
Fall back to the old C implementations of various routines when
the target doesn't have an FPU.
Signed-off-by: Keith Packard <keithp at keithp.com>
---
.../builtins/aarch64/sme-libc-mem-routines.S | 2 +-
.../lib/builtins/aarch64/sme-libc-routines.c | 77 +++++++++++++++++++
2 files changed, 78 insertions(+), 1 deletion(-)
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
index 0318d9a6f1ebd2..72d87fb4fa8586 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -6,7 +6,7 @@
#include "../assembly.h"
-#ifdef __aarch64__
+#if defined(__aarch64__) && __ARM_FP != 0
#define L(l) .L ## l
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
index 315490e73ea2b1..92fb953c03a376 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c
@@ -1,5 +1,82 @@
#include <stddef.h>
+#if __ARM_FP == 0
+// WARNING: When building the scalar versions of these functions you need to
+// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang
+// from recognising a loop idiom and planting calls to memcpy!
+
+static void *__arm_sc_memcpy_fwd(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+ for (size_t i = 0; i < n; ++i)
+ destp[i] = srcp[i];
+
+ return dest;
+}
+
+// If dest and src overlap then behaviour is undefined, hence we can add the
+// restrict keywords here. This also matches the definition of the libc memcpy
+// according to the man page.
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src,
+ size_t n) __arm_streaming_compatible {
+ return __arm_sc_memcpy_fwd(dest, src, n);
+}
+
+void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ unsigned char c8 = (unsigned char)c;
+ for (size_t i = 0; i < n; ++i)
+ destp[i] = c8;
+
+ return dest;
+}
+
+static void *__arm_sc_memcpy_rev(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+ // TODO: Improve performance by copying larger chunks in reverse, or by
+ // using SVE.
+ while (n > 0) {
+ --n;
+ destp[n] = srcp[n];
+ }
+ return dest;
+}
+
+// Semantically a memmove is equivalent to the following:
+// 1. Copy the entire contents of src to a temporary array that does not
+// overlap with src or dest.
+// 2. Copy the contents of the temporary array into dest.
+void *__arm_sc_memmove(void *dest, const void *src,
+ size_t n) __arm_streaming_compatible {
+ unsigned char *destp = (unsigned char *)dest;
+ const unsigned char *srcp = (const unsigned char *)src;
+
+ // If src and dest don't overlap then just invoke memcpy
+ if ((srcp > (destp + n)) || (destp > (srcp + n)))
+ return __arm_sc_memcpy_fwd(dest, src, n);
+
+ // Overlap case 1:
+ // src: Low | -> | High
+ // dest: Low | -> | High
+ // Here src is always ahead of dest at a higher addres. If we first read a
+ // chunk of data from src we can safely write the same chunk to dest without
+ // corrupting future reads of src.
+ if (srcp > destp)
+ return __arm_sc_memcpy_fwd(dest, src, n);
+
+ // Overlap case 2:
+ // src: Low | -> | High
+ // dest: Low | -> | High
+ // While we're in the overlap region we're always corrupting future reads of
+ // src when writing to dest. An efficient way to do this is to copy the data
+ // in reverse by starting at the highest address.
+ return __arm_sc_memcpy_rev(dest, src, n);
+}
+#endif
+
const void *__arm_sc_memchr(const void *src, int c,
size_t n) __arm_streaming_compatible {
const unsigned char *srcp = (const unsigned char *)src;
More information about the cfe-commits
mailing list