[compiler-rt] 1b80990 - Reland "[compiler-rt][AArch64] Allow platform-specific mangling of SME routines. (#119864)"
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 03:49:14 PST 2024
Author: Sander de Smalen
Date: 2024-12-17T11:48:02Z
New Revision: 1b8099040e9a919794eba3854486d46fa9018b94
URL: https://github.com/llvm/llvm-project/commit/1b8099040e9a919794eba3854486d46fa9018b94
DIFF: https://github.com/llvm/llvm-project/commit/1b8099040e9a919794eba3854486d46fa9018b94.diff
LOG: Reland "[compiler-rt][AArch64] Allow platform-specific mangling of SME routines. (#119864)"
Avoid issues caused by the `.subsections_via_symbols` directive by using
numbered labels instead of named labels for the branch locations.
This reverts commit 4032ce3413d0230b0ccba1203536f9cb35e5c3b5.
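For reference (not part of the commit): a minimal sketch of the GNU-style numbered-label convention the diff switches to. A numbered local label may be defined any number of times; a branch names it with an `f` suffix (nearest following definition) or a `b` suffix (nearest preceding definition). Such labels never become symbols, so they presumably cannot introduce new subsection boundaries under `.subsections_via_symbols` the way named labels can.

    // Named label: one definition per name, emitted into the symbol table.
    copy16:
            subs  count, count, 16
            b.hi  copy16
            ret

    // Numbered label: reusable and purely local, never a symbol.
    0:                              // copy16
            subs  count, count, 16
            b.hi  0b                // back to the nearest preceding "0:"
            b.eq  1f                // forward to the nearest following "1:"
            ret
    1:
            mov   x0, xzr
            ret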
Added:
Modified:
compiler-rt/lib/builtins/aarch64/sme-abi.S
compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
Removed:
################################################################################
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index a6bb921bd9e6b9..45bd221655fd66 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -40,7 +40,7 @@ DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort)
.cfi_offset w30, -24
.cfi_offset w29, -32
.cfi_offset 46, -16
- bl __arm_sme_state
+ bl SYMBOL_NAME(__arm_sme_state)
tbz x0, #0, 2f
1:
smstop sm
@@ -54,7 +54,7 @@ END_COMPILERRT_FUNCTION(do_abort)
// __arm_sme_state fills the result registers based on a local
// that is set as part of the compiler-rt startup code.
// __aarch64_has_sme_and_tpidr2_el0
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state)
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state)
.variant_pcs __arm_sme_state
BTI_C
mov x0, xzr
@@ -70,9 +70,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state)
mrs x1, TPIDR2_EL0
1:
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state)
+END_COMPILERRT_FUNCTION(__arm_sme_state)
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore)
+DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
.variant_pcs __arm_tpidr2_restore
BTI_C
// If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific
@@ -106,9 +106,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore)
ret
2:
b SYMBOL_NAME(do_abort)
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore)
+END_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save)
+DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_save)
.variant_pcs __arm_tpidr2_save
BTI_C
// If the current thread does not have access to TPIDR2_EL0, the subroutine
@@ -147,9 +147,10 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save)
ret
2:
b SYMBOL_NAME(do_abort)
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save)
+END_COMPILERRT_FUNCTION(__arm_tpidr2_save)
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
+DEFINE_COMPILERRT_FUNCTION(__arm_za_disable)
+ .cfi_startproc
.variant_pcs __arm_za_disable
BTI_C
// If the current thread does not have access to SME, the subroutine does
@@ -166,7 +167,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
- bl __arm_tpidr2_save
+ bl SYMBOL_NAME(__arm_tpidr2_save)
// * Set TPIDR2_EL0 to null.
msr TPIDR2_EL0, xzr
@@ -181,9 +182,10 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
.cfi_restore w29
0:
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable)
+ .cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_za_disable)
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
+DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
.variant_pcs __arm_get_current_vg
BTI_C
@@ -200,7 +202,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
2:
mov x0, xzr
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg)
+END_COMPILERRT_FUNCTION(__arm_get_current_vg)
NO_EXEC_STACK_DIRECTIVE
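The sme-abi.S hunks above move the SME routines from the UNMANGLED definition macro to DEFINE_COMPILERRT_FUNCTION and route direct calls through SYMBOL_NAME(), which is what gives them the platform-specific mangling named in the title. A minimal paraphrased sketch of the kind of preprocessor logic involved, assuming a Darwin-style leading underscore (the real definitions live in compiler-rt/lib/builtins/assembly.h and may differ):

    #if defined(__APPLE__)
    #define SYMBOL_NAME(name) _##name     // Mach-O prefixes C-visible symbols with "_"
    #else
    #define SYMBOL_NAME(name) name        // ELF targets use the name unchanged
    #endif

    // With that in place, a mangled definition and call site resolve to, e.g.:
    //   DEFINE_COMPILERRT_FUNCTION(__arm_sme_state)   ->  ___arm_sme_state on Darwin,
    //                                                     __arm_sme_state elsewhere
    //   bl SYMBOL_NAME(__arm_tpidr2_save)             ->  bl ___arm_tpidr2_save on Darwin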
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
index 6e13a03691cfd6..e736829967c0cc 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -6,8 +6,6 @@
#include "../assembly.h"
-#define L(l) .L ## l
-
//
// __arm_sc_memcpy / __arm_sc_memmove
//
@@ -52,17 +50,17 @@
The loop tail is handled by always copying 64 bytes from the end.
*/
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
add srcend1, src, count
add dstend1, dstin, count
cmp count, 128
- b.hi L(copy_long)
+ b.hi 7f // copy_long
cmp count, 32
- b.hi L(copy32_128)
+ b.hi 4f // copy32_128
/* Small copies: 0..32 bytes. */
cmp count, 16
- b.lo L(copy16)
+ b.lo 0f // copy16
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend1, -16]
stp A_l, A_h, [dstin]
@@ -70,8 +68,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
ret
/* Copy 8-15 bytes. */
-L(copy16):
- tbz count, 3, L(copy8)
+0: // copy16
+ tbz count, 3, 1f // copy8
ldr A_l, [src]
ldr A_h, [srcend1, -8]
str A_l, [dstin]
@@ -80,8 +78,8 @@ L(copy16):
.p2align 3
/* Copy 4-7 bytes. */
-L(copy8):
- tbz count, 2, L(copy4)
+1: // copy8
+ tbz count, 2, 2f // copy4
ldr A_lw, [src]
ldr B_lw, [srcend1, -4]
str A_lw, [dstin]
@@ -89,8 +87,8 @@ L(copy8):
ret
/* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
- cbz count, L(copy0)
+2: // copy4
+ cbz count, 3f // copy0
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb C_lw, [srcend1, -1]
@@ -98,18 +96,18 @@ L(copy4):
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend1, -1]
-L(copy0):
+3: // copy0
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
-L(copy32_128):
+4: // copy32_128
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend1, -32]
ldp D_l, D_h, [srcend1, -16]
cmp count, 64
- b.hi L(copy128)
+ b.hi 5f // copy128
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend1, -32]
@@ -118,16 +116,16 @@ L(copy32_128):
.p2align 4
/* Copy 65..128 bytes. */
-L(copy128):
+5: // copy128
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
cmp count, 96
- b.ls L(copy96)
+ b.ls 6f // copy96
ldp G_l, G_h, [srcend1, -64]
ldp H_l, H_h, [srcend1, -48]
stp G_l, G_h, [dstend1, -64]
stp H_l, H_h, [dstend1, -48]
-L(copy96):
+6: // copy96
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp E_l, E_h, [dstin, 32]
@@ -138,12 +136,12 @@ L(copy96):
.p2align 4
/* Copy more than 128 bytes. */
-L(copy_long):
+7: // copy_long
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
- cbz tmp1, L(copy0)
+ cbz tmp1, 3b // copy0
cmp tmp1, count
- b.lo L(copy_long_backwards)
+ b.lo 10f //copy_long_backwards
/* Copy 16 bytes and then align dst to 16-byte alignment. */
@@ -158,8 +156,8 @@ L(copy_long):
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(copy64_from_end)
-L(loop64):
+ b.ls 9f // copy64_from_end
+8: // loop64
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
@@ -169,10 +167,10 @@ L(loop64):
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
- b.hi L(loop64)
+ b.hi 8b // loop64
/* Write the last iteration and copy 64 bytes from the end. */
-L(copy64_from_end):
+9: // copy64_from_end
ldp E_l, E_h, [srcend1, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend1, -48]
@@ -191,7 +189,7 @@ L(copy64_from_end):
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align dst to 16-byte alignment. */
-L(copy_long_backwards):
+10: // copy_long_backwards
ldp D_l, D_h, [srcend1, -16]
and tmp1, dstend1, 15
sub srcend1, srcend1, tmp1
@@ -203,9 +201,9 @@ L(copy_long_backwards):
ldp D_l, D_h, [srcend1, -64]!
sub dstend1, dstend1, tmp1
subs count, count, 128
- b.ls L(copy64_from_start)
+ b.ls 12f // copy64_from_start
-L(loop64_backwards):
+11: // loop64_backwards
stp A_l, A_h, [dstend1, -16]
ldp A_l, A_h, [srcend1, -16]
stp B_l, B_h, [dstend1, -32]
@@ -215,10 +213,10 @@ L(loop64_backwards):
stp D_l, D_h, [dstend1, -64]!
ldp D_l, D_h, [srcend1, -64]!
subs count, count, 64
- b.hi L(loop64_backwards)
+ b.hi 11b // loop64_backwards
/* Write the last iteration and copy 64 bytes from the start. */
-L(copy64_from_start):
+12: // copy64_from_start
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend1, -16]
ldp A_l, A_h, [src, 32]
@@ -232,7 +230,7 @@ L(copy64_from_start):
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)
+END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
@@ -250,7 +248,7 @@ DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
#define dstend2 x4
#define zva_val x5
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
mov z0.b, valw
#else
@@ -263,9 +261,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
add dstend2, dstin, count
cmp count, 96
- b.hi L(set_long)
+ b.hi 7f // set_long
cmp count, 16
- b.hs L(set_medium)
+ b.hs 4f // set_medium
mov val, v0.D[0]
/* Set 0..15 bytes. */
@@ -285,38 +283,38 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
3: ret
/* Set 17..96 bytes. */
-L(set_medium):
+4: // set_medium
str q0, [dstin]
- tbnz count, 6, L(set96)
+ tbnz count, 6, 6f // set96
str q0, [dstend2, -16]
- tbz count, 5, 1f
+ tbz count, 5, 5f
str q0, [dstin, 16]
str q0, [dstend2, -32]
-1: ret
+5: ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
-L(set96):
+6: // set96
str q0, [dstin, 16]
stp q0, q0, [dstin, 32]
stp q0, q0, [dstend2, -32]
ret
.p2align 4
-L(set_long):
+7: // set_long
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
cmp count, 160
ccmp valw, 0, 0, hs
- b.ne L(no_zva)
+ b.ne 9f // no_zva
#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(no_zva)
+ b.ne 9f // no_zva
#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
@@ -325,27 +323,27 @@ L(set_long):
sub count, count, 128 /* Adjust count and bias for loop. */
.p2align 4
-L(zva_loop):
+8: // zva_loop
add dst, dst, 64
dc zva, dst
subs count, count, 64
- b.hi L(zva_loop)
+ b.hi 8b // zva_loop
stp q0, q0, [dstend2, -64]
stp q0, q0, [dstend2, -32]
ret
-L(no_zva):
+9: // no_zva
sub count, dstend2, dst /* Count is 16 too large. */
sub dst, dst, 16 /* Dst is biased by -32. */
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-L(no_zva_loop):
+10: // no_zva_loop
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
- b.hi L(no_zva_loop)
+ b.hi 10b // no_zva_loop
stp q0, q0, [dstend2, -64]
stp q0, q0, [dstend2, -32]
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
+END_COMPILERRT_FUNCTION(__arm_sc_memset)
#endif // __aarch64__