[compiler-rt] 1b80990 - Reland "[compiler-rt][AArch64] Allow platform-specific mangling of SME routines. (#119864)"
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 03:49:14 PST 2024
Author: Sander de Smalen
Date: 2024-12-17T11:48:02Z
New Revision: 1b8099040e9a919794eba3854486d46fa9018b94
URL: https://github.com/llvm/llvm-project/commit/1b8099040e9a919794eba3854486d46fa9018b94
DIFF: https://github.com/llvm/llvm-project/commit/1b8099040e9a919794eba3854486d46fa9018b94.diff
LOG: Reland "[compiler-rt][AArch64] Allow platform-specific mangling of SME routines. (#119864)"
Avoid issues caused by the `.subsections_via_symbols` directive by using
numbered labels instead of named labels for the branch locations.
This reverts commit 4032ce3413d0230b0ccba1203536f9cb35e5c3b5.
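For reference (not part of the commit): a minimal sketch of the GNU-style numbered-label convention the diff switches to. A numbered local label may be defined any number of times; a branch names it with an `f` suffix (nearest following definition) or a `b` suffix (nearest preceding definition). Such labels never become symbols, so they presumably cannot introduce new subsection boundaries under `.subsections_via_symbols` the way named labels can.

    // Named label: one definition per name, emitted into the symbol table.
    copy16:
            subs  count, count, 16
            b.hi  copy16
            ret

    // Numbered label: reusable and purely local, never a symbol.
    0:                              // copy16
            subs  count, count, 16
            b.hi  0b                // back to the nearest preceding "0:"
            b.eq  1f                // forward to the nearest following "1:"
            ret
    1:
            mov   x0, xzr
            ret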
Added:
Modified:
compiler-rt/lib/builtins/aarch64/sme-abi.S
compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
Removed:
################################################################################
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index a6bb921bd9e6b9..45bd221655fd66 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -40,7 +40,7 @@ DEFINE_COMPILERRT_PRIVATE_FUNCTION(do_abort)
.cfi_offset w30, -24
.cfi_offset w29, -32
.cfi_offset 46, -16
- bl __arm_sme_state
+ bl SYMBOL_NAME(__arm_sme_state)
tbz x0, #0, 2f
1:
smstop sm
@@ -54,7 +54,7 @@ END_COMPILERRT_FUNCTION(do_abort)
// __arm_sme_state fills the result registers based on a local
// that is set as part of the compiler-rt startup code.
// __aarch64_has_sme_and_tpidr2_el0
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state)
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state)
.variant_pcs __arm_sme_state
BTI_C
mov x0, xzr
@@ -70,9 +70,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sme_state)
mrs x1, TPIDR2_EL0
1:
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sme_state)
+END_COMPILERRT_FUNCTION(__arm_sme_state)
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore)
+DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
.variant_pcs __arm_tpidr2_restore
BTI_C
// If TPIDR2_EL0 is nonnull, the subroutine aborts in some platform-specific
@@ -106,9 +106,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore)
ret
2:
b SYMBOL_NAME(do_abort)
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore)
+END_COMPILERRT_FUNCTION(__arm_tpidr2_restore)
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save)
+DEFINE_COMPILERRT_FUNCTION(__arm_tpidr2_save)
.variant_pcs __arm_tpidr2_save
BTI_C
// If the current thread does not have access to TPIDR2_EL0, the subroutine
@@ -147,9 +147,10 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save)
ret
2:
b SYMBOL_NAME(do_abort)
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save)
+END_COMPILERRT_FUNCTION(__arm_tpidr2_save)
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
+DEFINE_COMPILERRT_FUNCTION(__arm_za_disable)
+ .cfi_startproc
.variant_pcs __arm_za_disable
BTI_C
// If the current thread does not have access to SME, the subroutine does
@@ -166,7 +167,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
- bl __arm_tpidr2_save
+ bl SYMBOL_NAME(__arm_tpidr2_save)
// * Set TPIDR2_EL0 to null.
msr TPIDR2_EL0, xzr
@@ -181,9 +182,10 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
.cfi_restore w29
0:
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_za_disable)
+ .cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_za_disable)
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
+DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
.variant_pcs __arm_get_current_vg
BTI_C
@@ -200,7 +202,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_get_current_vg)
2:
mov x0, xzr
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_get_current_vg)
+END_COMPILERRT_FUNCTION(__arm_get_current_vg)
NO_EXEC_STACK_DIRECTIVE
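The sme-abi.S hunks above move the SME routines from the UNMANGLED definition macro to DEFINE_COMPILERRT_FUNCTION and route direct calls through SYMBOL_NAME(), which is what gives them the platform-specific mangling named in the title. A minimal paraphrased sketch of the kind of preprocessor logic involved, assuming a Darwin-style leading underscore (the real definitions live in compiler-rt/lib/builtins/assembly.h and may differ):

    #if defined(__APPLE__)
    #define SYMBOL_NAME(name) _##name     // Mach-O prefixes C-visible symbols with "_"
    #else
    #define SYMBOL_NAME(name) name        // ELF targets use the name unchanged
    #endif

    // With that in place, a mangled definition and call site resolve to, e.g.:
    //   DEFINE_COMPILERRT_FUNCTION(__arm_sme_state)   ->  ___arm_sme_state on Darwin,
    //                                                     __arm_sme_state elsewhere
    //   bl SYMBOL_NAME(__arm_tpidr2_save)             ->  bl ___arm_tpidr2_save on Darwin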
diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
index 6e13a03691cfd6..e736829967c0cc 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-libc-mem-routines.S
@@ -6,8 +6,6 @@
#include "../assembly.h"
-#define L(l) .L ## l
-
//
// __arm_sc_memcpy / __arm_sc_memmove
//
@@ -52,17 +50,17 @@
The loop tail is handled by always copying 64 bytes from the end.
*/
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memcpy)
add srcend1, src, count
add dstend1, dstin, count
cmp count, 128
- b.hi L(copy_long)
+ b.hi 7f // copy_long
cmp count, 32
- b.hi L(copy32_128)
+ b.hi 4f // copy32_128
/* Small copies: 0..32 bytes. */
cmp count, 16
- b.lo L(copy16)
+ b.lo 0f // copy16
ldp A_l, A_h, [src]
ldp D_l, D_h, [srcend1, -16]
stp A_l, A_h, [dstin]
@@ -70,8 +68,8 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memcpy)
ret
/* Copy 8-15 bytes. */
-L(copy16):
- tbz count, 3, L(copy8)
+0: // copy16
+ tbz count, 3, 1f // copy8
ldr A_l, [src]
ldr A_h, [srcend1, -8]
str A_l, [dstin]
@@ -80,8 +78,8 @@ L(copy16):
.p2align 3
/* Copy 4-7 bytes. */
-L(copy8):
- tbz count, 2, L(copy4)
+1: // copy8
+ tbz count, 2, 2f // copy4
ldr A_lw, [src]
ldr B_lw, [srcend1, -4]
str A_lw, [dstin]
@@ -89,8 +87,8 @@ L(copy8):
ret
/* Copy 0..3 bytes using a branchless sequence. */
-L(copy4):
- cbz count, L(copy0)
+2: // copy4
+ cbz count, 3f // copy0
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb C_lw, [srcend1, -1]
@@ -98,18 +96,18 @@ L(copy4):
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend1, -1]
-L(copy0):
+3: // copy0
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
-L(copy32_128):
+4: // copy32_128
ldp A_l, A_h, [src]
ldp B_l, B_h, [src, 16]
ldp C_l, C_h, [srcend1, -32]
ldp D_l, D_h, [srcend1, -16]
cmp count, 64
- b.hi L(copy128)
+ b.hi 5f // copy128
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstend1, -32]
@@ -118,16 +116,16 @@ L(copy32_128):
.p2align 4
/* Copy 65..128 bytes. */
-L(copy128):
+5: // copy128
ldp E_l, E_h, [src, 32]
ldp F_l, F_h, [src, 48]
cmp count, 96
- b.ls L(copy96)
+ b.ls 6f // copy96
ldp G_l, G_h, [srcend1, -64]
ldp H_l, H_h, [srcend1, -48]
stp G_l, G_h, [dstend1, -64]
stp H_l, H_h, [dstend1, -48]
-L(copy96):
+6: // copy96
stp A_l, A_h, [dstin]
stp B_l, B_h, [dstin, 16]
stp E_l, E_h, [dstin, 32]
@@ -138,12 +136,12 @@ L(copy96):
.p2align 4
/* Copy more than 128 bytes. */
-L(copy_long):
+7: // copy_long
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
- cbz tmp1, L(copy0)
+ cbz tmp1, 3b // copy0
cmp tmp1, count
- b.lo L(copy_long_backwards)
+ b.lo 10f //copy_long_backwards
/* Copy 16 bytes and then align dst to 16-byte alignment. */
@@ -158,8 +156,8 @@ L(copy_long):
ldp C_l, C_h, [src, 48]
ldp D_l, D_h, [src, 64]!
subs count, count, 128 + 16 /* Test and readjust count. */
- b.ls L(copy64_from_end)
-L(loop64):
+ b.ls 9f // copy64_from_end
+8: // loop64
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [src, 16]
stp B_l, B_h, [dst, 32]
@@ -169,10 +167,10 @@ L(loop64):
stp D_l, D_h, [dst, 64]!
ldp D_l, D_h, [src, 64]!
subs count, count, 64
- b.hi L(loop64)
+ b.hi 8b // loop64
/* Write the last iteration and copy 64 bytes from the end. */
-L(copy64_from_end):
+9: // copy64_from_end
ldp E_l, E_h, [srcend1, -64]
stp A_l, A_h, [dst, 16]
ldp A_l, A_h, [srcend1, -48]
@@ -191,7 +189,7 @@ L(copy64_from_end):
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align dst to 16-byte alignment. */
-L(copy_long_backwards):
+10: // copy_long_backwards
ldp D_l, D_h, [srcend1, -16]
and tmp1, dstend1, 15
sub srcend1, srcend1, tmp1
@@ -203,9 +201,9 @@ L(copy_long_backwards):
ldp D_l, D_h, [srcend1, -64]!
sub dstend1, dstend1, tmp1
subs count, count, 128
- b.ls L(copy64_from_start)
+ b.ls 12f // copy64_from_start
-L(loop64_backwards):
+11: // loop64_backwards
stp A_l, A_h, [dstend1, -16]
ldp A_l, A_h, [srcend1, -16]
stp B_l, B_h, [dstend1, -32]
@@ -215,10 +213,10 @@ L(loop64_backwards):
stp D_l, D_h, [dstend1, -64]!
ldp D_l, D_h, [srcend1, -64]!
subs count, count, 64
- b.hi L(loop64_backwards)
+ b.hi 11b // loop64_backwards
/* Write the last iteration and copy 64 bytes from the start. */
-L(copy64_from_start):
+12: // copy64_from_start
ldp G_l, G_h, [src, 48]
stp A_l, A_h, [dstend1, -16]
ldp A_l, A_h, [src, 32]
@@ -232,7 +230,7 @@ L(copy64_from_start):
stp B_l, B_h, [dstin, 16]
stp C_l, C_h, [dstin]
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memcpy)
+END_COMPILERRT_FUNCTION(__arm_sc_memcpy)
DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
@@ -250,7 +248,7 @@ DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
#define dstend2 x4
#define zva_val x5
-DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
+DEFINE_COMPILERRT_FUNCTION(__arm_sc_memset)
#ifdef __ARM_FEATURE_SVE
mov z0.b, valw
#else
@@ -263,9 +261,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
add dstend2, dstin, count
cmp count, 96
- b.hi L(set_long)
+ b.hi 7f // set_long
cmp count, 16
- b.hs L(set_medium)
+ b.hs 4f // set_medium
mov val, v0.D[0]
/* Set 0..15 bytes. */
@@ -285,38 +283,38 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
3: ret
/* Set 17..96 bytes. */
-L(set_medium):
+4: // set_medium
str q0, [dstin]
- tbnz count, 6, L(set96)
+ tbnz count, 6, 6f // set96
str q0, [dstend2, -16]
- tbz count, 5, 1f
+ tbz count, 5, 5f
str q0, [dstin, 16]
str q0, [dstend2, -32]
-1: ret
+5: ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
-L(set96):
+6: // set96
str q0, [dstin, 16]
stp q0, q0, [dstin, 32]
stp q0, q0, [dstend2, -32]
ret
.p2align 4
-L(set_long):
+7: // set_long
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
cmp count, 160
ccmp valw, 0, 0, hs
- b.ne L(no_zva)
+ b.ne 9f // no_zva
#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne L(no_zva)
+ b.ne 9f // no_zva
#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
@@ -325,27 +323,27 @@ L(set_long):
sub count, count, 128 /* Adjust count and bias for loop. */
.p2align 4
-L(zva_loop):
+8: // zva_loop
add dst, dst, 64
dc zva, dst
subs count, count, 64
- b.hi L(zva_loop)
+ b.hi 8b // zva_loop
stp q0, q0, [dstend2, -64]
stp q0, q0, [dstend2, -32]
ret
-L(no_zva):
+9: // no_zva
sub count, dstend2, dst /* Count is 16 too large. */
sub dst, dst, 16 /* Dst is biased by -32. */
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-L(no_zva_loop):
+10: // no_zva_loop
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
- b.hi L(no_zva_loop)
+ b.hi 10b // no_zva_loop
stp q0, q0, [dstend2, -64]
stp q0, q0, [dstend2, -32]
ret
-END_COMPILERRT_OUTLINE_FUNCTION(__arm_sc_memset)
+END_COMPILERRT_FUNCTION(__arm_sc_memset)
#endif // __aarch64__