[compiler-rt] [Compiler-rt] Add AArch64 routines for __arm_agnostic("sme_za_state") (PR #120059)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 17 07:53:21 PST 2024


https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/120059

>From 12e18154affece2c835dd83b1acf7b569efcc3f7 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 9 Sep 2024 14:06:51 +0100
Subject: [PATCH 1/4] [Compiler-rt] Add AArch64 routines for
 __arm_agnostic("sme_za_state")

The specification of these routines can be found here:

  https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines
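
As a reading aid (not part of this patch): per the AAPCS64 section linked
above, a compiler lowering a function marked __arm_agnostic("sme_za_state")
is expected to use these routines roughly as in the C sketch below. The C
prototypes and agnostic_fn are illustrative only; the real interface is the
assembly-level one specified in the AAPCS64.

  #include <stdint.h>

  // Provided by compiler-rt (this patch); C prototypes here are a sketch.
  uint64_t __arm_sme_state_size(void);
  void __arm_sme_save(void *blk);
  void __arm_sme_restore(void *blk);

  void agnostic_fn(void) /* __arm_agnostic("sme_za_state") */ {
    // The buffer must be 16-byte aligned (the routines abort otherwise);
    // __builtin_alloca_with_align takes its alignment in bits (128 = 16B).
    void *blk = __builtin_alloca_with_align(__arm_sme_state_size(), 128);
    __arm_sme_save(blk);     // set up a lazy-save of ZA (and save ZT0)
    // ... the body may use or clobber ZA/ZT0 freely ...
    __arm_sme_restore(blk);  // reinstate the caller's state before returning
  }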
---
 compiler-rt/cmake/builtin-config-ix.cmake     |   3 +-
 .../lib/builtins/aarch64/sme-abi-assert.c     |   1 +
 compiler-rt/lib/builtins/aarch64/sme-abi.S    | 159 +++++++++++++++++-
 3 files changed, 161 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 1f63e158409ca4..706a1ff7eeb6db 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -43,8 +43,9 @@ asm(\"cas w0, w1, [x2]\");
 builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
 "
 void foo(void)  __arm_streaming_compatible {
-  asm(\".arch armv9-a+sme\");
+  asm(\".arch armv9-a+sme2\");
   asm(\"smstart\");
+  asm(\"ldr zt0, [sp]\");
 }
 ")
 
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c b/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
index 4333353f8d2d1b..37305ceb39c50f 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
@@ -8,3 +8,4 @@
 #include "../cpu_model/AArch64CPUFeatures.inc"
 _Static_assert(FEAT_SVE == 30, "sme-abi.S assumes FEAT_SVE = 30");
 _Static_assert(FEAT_SME == 42, "sme-abi.S assumes FEAT_SME = 42");
+_Static_assert(FEAT_SME2 == 57, "sme-abi.S assumes FEAT_SME2 = 57");
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 45bd221655fd66..90b3f1bf180ffa 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -10,6 +10,8 @@
 
 .set FEAT_SVE_BIT, 30
 .set FEAT_SME_BIT, 42
+.set FEAT_SME2_BIT, 57
+.set FEAT_SME2_MASK, 1 << 57
 .set SVCR_PSTATE_SM_BIT, 0
 
 #if !defined(__APPLE__)
@@ -22,7 +24,7 @@
 #define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
 #endif
 
-.arch armv9-a+sme
+.arch armv9-a+sme2
 
 // Utility function which calls a system's abort() routine. Because the function
 // is streaming-compatible it should disable streaming-SVE mode before calling
@@ -204,6 +206,161 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
   ret
 END_COMPILERRT_FUNCTION(__arm_get_current_vg)
 
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
+  .variant_pcs __arm_sme_state_size
+  BTI_C
+
+  // Test if SME is available and PSTATE = 1.
+  adrp    x16, CPU_FEATS_SYMBOL
+  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz     x16, #FEAT_SME_BIT, 0f
+  mrs     x16, SVCR
+  tbz     x16, #1, 0f
+
+  // Size = HAS_FEAT_SME2 ? 32 : 96
+  adrp    x16, CPU_FEATS_SYMBOL
+  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tst     x16, #FEAT_SME2_MASK
+  mov     w17, #32
+  mov     w16, #96
+  csel    x16, x17, x16, eq
+
+  // Size = Size + (SVLB * SVLB)
+  rdsvl   x17, #1
+  madd    x0, x17, x17, x16
+  ret
+
+0:
+  // Default case: 16 bytes is the minimum size (enough to encode the VALID bit, kept a multiple of 16 bytes)
+  mov w0, #16
+  ret
+END_COMPILERRT_FUNCTION(__arm_sme_state_size)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
+  .variant_pcs __arm_sme_save
+  BTI_C
+
+  // Clear internal state bits
+  stp     xzr, xzr, [x0]
+
+  // If PTR is not 16-byte aligned, abort.
+  tst     x0, #0xF
+  b.ne    3f
+
+  // If SME is not available, PSTATE.ZA = 0, or TPIDR2_EL0 != 0, return.
+  adrp    x16, CPU_FEATS_SYMBOL
+  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz     x16, #FEAT_SME_BIT, 2f
+  mrs     x16, SVCR
+  tbz     x16, #1, 2f
+  mrs     x16, TPIDR2_EL0
+  cbnz    x16, 2f
+
+  // ZA or ZT0 need saving; we can now set the internal VALID bit to 1
+  mov     w16, #1
+  str     x16, [x0]
+
+  adrp    x16, CPU_FEATS_SYMBOL
+  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz     x16, #FEAT_SME2_BIT, 0f
+
+  // Store ZT0 and ZA
+  add     x16, x0, #32
+  str     zt0, [x16]
+  add     x18, x0, #96
+  b       1f
+
+0:
+  // Has SME only
+  add     x18, x0, #32
+
+1:
+  // Set up lazy-save (x18 = pointer to buffer)
+  rdsvl   x17, #1
+  str     x18, [x0, #16]!
+  strh    w17, [x0, #8]
+  stur    wzr, [x0, #10]
+  strh    wzr, [x0, #14]
+  msr     TPIDR2_EL0, x0
+  ret
+
+2:
+  // Do nothing
+  ret
+
+3:
+  b       SYMBOL_NAME(do_abort)
+END_COMPILERRT_FUNCTION(__arm_sme_save)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
+  .cfi_startproc
+  .variant_pcs __arm_sme_save
+  BTI_C
+
+  stp     x29, x30, [sp, #-16]!
+  .cfi_def_cfa_offset 16
+  mov     x29, sp
+  .cfi_def_cfa w29, 16
+  .cfi_offset w30, -8
+  .cfi_offset w29, -16
+
+  // If PTR is not 16-byte aligned, abort.
+  tst     x0, #0xF
+  b.ne    3f
+
+  // If the VALID bit is 0, return early.
+  ldr     x16, [x0]
+  tbz     x16, #0, 2f
+
+  // If SME is not available, abort.
+  adrp    x16, CPU_FEATS_SYMBOL
+  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz     x16, #FEAT_SME_BIT, 3f
+
+  // If TPIDR2_EL0 != nullptr, no lazy-save was committed; try to reload zt0.
+  mrs     x16, TPIDR2_EL0
+  cbnz    x16, 0f
+
+  // If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
+  // abort.
+  mrs     x16, SVCR
+  tbnz    x16, #1, 3f
+
+  // Restore za.
+  smstart za
+  mov     x16, x0
+  add     x0, x0, #16
+  bl      __arm_tpidr2_restore
+  mov     x0, x16
+  msr     TPIDR2_EL0, xzr
+
+0:
+  smstart za
+
+1:
+  // Check if zt0 needs restoring.
+  adrp    x16, CPU_FEATS_SYMBOL
+  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+  tbz     x16, #FEAT_SME2_BIT, 2f
+
+  // Restore zt0.
+  add     x16, x0, #32
+  ldr     zt0, [x16]
+
+2:
+  // Do nothing
+  .cfi_def_cfa wsp, 16
+  ldp     x29, x30, [sp], #16
+  .cfi_def_cfa_offset 0
+  .cfi_restore w30
+  .cfi_restore w29
+  ret
+
+3:
+  b       SYMBOL_NAME(do_abort)
+  .cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_sme_restore)
+
 NO_EXEC_STACK_DIRECTIVE
 
 // GNU property note for BTI and PAC
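
As a reading aid for the lazy-save setup in __arm_sme_save above: the 16
bytes the routine writes at offset 16 form a TPIDR2 block as defined by the
AAPCS64 lazy-save scheme. A hypothetical C view of that block (the struct
and field names are for illustration only):

  #include <stdint.h>

  // Sketch of the TPIDR2 block materialised by __arm_sme_save.
  struct tpidr2_block {
    void    *za_save_buffer;      // bytes 0-7: where ZA slices are spilled
    uint16_t num_za_save_slices;  // bytes 8-9: slice count = SVL in bytes
                                  //            (the rdsvl #1 result)
    uint8_t  reserved[6];         // bytes 10-15: must be zero
  };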

>From 88e7dc405d108daa5bb7511c0872837ae9a3f738 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 16 Dec 2024 12:13:47 +0000
Subject: [PATCH 2/4] Address comments

---
 compiler-rt/lib/builtins/aarch64/sme-abi.S | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 90b3f1bf180ffa..61c2fbbd801acb 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -210,14 +210,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
   .variant_pcs __arm_sme_state_size
   BTI_C
 
-  // Test if SME is available and PSTATE = 1.
+  // Test if SME is available and ZA state is 'active'.
   adrp    x16, CPU_FEATS_SYMBOL
   ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
   tbz     x16, #FEAT_SME_BIT, 0f
   mrs     x16, SVCR
   tbz     x16, #1, 0f
+  mrs     x16, TPIDR2_EL0
+  cbnz    x16, 0f
 
-  // Size = HAS_FEAT_SME2 ? 32 : 96
+  // Size = HAS_FEAT_SME2 ? 96 : 32
   adrp    x16, CPU_FEATS_SYMBOL
   ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
   tst     x16, #FEAT_SME2_MASK
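
With the comment corrected, the size computation reads as the following C
sketch (illustrative only; has_sme2 stands for the FEAT_SME2 bit in
__aarch64_cpu_features and svl_bytes for the rdsvl #1 result):

  #include <stdint.h>

  uint64_t state_size_sketch(int has_sme2, uint64_t svl_bytes) {
    // 16-byte internal header plus 16-byte TPIDR2 block, plus a 64-byte
    // ZT0 slot when FEAT_SME2 is implemented, followed by the ZA buffer
    // of SVL_B x SVL_B bytes.
    uint64_t size = has_sme2 ? 96 : 32;
    return size + svl_bytes * svl_bytes;
  }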

>From aaecb05f6bf926578c6f34efc67b79e23174c66c Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Tue, 17 Dec 2024 15:36:15 +0000
Subject: [PATCH 3/4] Address more comments

---
 compiler-rt/lib/builtins/aarch64/sme-abi.S | 78 ++++++++++++----------
 1 file changed, 43 insertions(+), 35 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 61c2fbbd801acb..855099427e2684 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -206,23 +206,41 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
   ret
 END_COMPILERRT_FUNCTION(__arm_get_current_vg)
 
+// The diagram below describes the layout used in the following routines:
+// * __arm_sme_state_size
+// * __arm_sme_save
+// * __arm_sme_restore
+//
+// +---------------------------------+
+// |             ...                 |
+// |           ZA buffer             |
+// |             ...                 |
+// +---------------------------------+ <- @96
+// |         ZT0 contents            |
+// +---------------------------------+ <- @32
+// | byte 15-10: zero (reserved)     |
+// | byte   9-8: num_za_save_slices  |           TPIDR2 block
+// | byte   7-0: za_save_buffer      |
+// +---------------------------------+ <- @16
+// | bit  127-1: zero (reserved)     |           Internal state for __arm_sme_save/restore
+// | bit      0: VALID               |
+// +---------------------------------+ <- @0
+
 DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
   .variant_pcs __arm_sme_state_size
   BTI_C
 
   // Test if SME is available and ZA state is 'active'.
-  adrp    x16, CPU_FEATS_SYMBOL
-  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz     x16, #FEAT_SME_BIT, 0f
+  adrp    x17, CPU_FEATS_SYMBOL
+  ldr     x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz     x17, #FEAT_SME_BIT, 0f
   mrs     x16, SVCR
   tbz     x16, #1, 0f
   mrs     x16, TPIDR2_EL0
   cbnz    x16, 0f
 
   // Size = HAS_FEAT_SME2 ? 96 : 32
-  adrp    x16, CPU_FEATS_SYMBOL
-  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tst     x16, #FEAT_SME2_MASK
+  tst     x17, #FEAT_SME2_MASK
   mov     w17, #32
   mov     w16, #96
   csel    x16, x17, x16, eq
@@ -242,17 +260,17 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   .variant_pcs __arm_sme_save
   BTI_C
 
-  // Clear internal state bits
-  stp     xzr, xzr, [x0]
-
   // If PTR is not 16-byte aligned, abort.
   tst     x0, #0xF
   b.ne    3f
 
+  // Clear internal state bits
+  stp     xzr, xzr, [x0]
+
   // If SME is not available, PSTATE.ZA = 0, or TPIDR2_EL0 != 0, return.
-  adrp    x16, CPU_FEATS_SYMBOL
-  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz     x16, #FEAT_SME_BIT, 2f
+  adrp    x17, CPU_FEATS_SYMBOL
+  ldr     x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz     x17, #FEAT_SME_BIT, 2f
   mrs     x16, SVCR
   tbz     x16, #1, 2f
   mrs     x16, TPIDR2_EL0
@@ -262,20 +280,15 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   mov     w16, #1
   str     x16, [x0]
 
-  adrp    x16, CPU_FEATS_SYMBOL
-  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz     x16, #FEAT_SME2_BIT, 0f
+  add     x18, x0, #32
+  tbz     x17, #FEAT_SME2_BIT, 1f
 
   // Store ZT0 and ZA
   add     x16, x0, #32
   str     zt0, [x16]
-  add     x18, x0, #96
+  add     x18, x18, #64
   b       1f
 
-0:
-  // Has SME only
-  add     x18, x0, #32
-
 1:
   // Set up lazy-save (x18 = pointer to buffer)
   rdsvl   x17, #1
@@ -284,7 +297,6 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   stur    wzr, [x0, #10]
   strh    wzr, [x0, #14]
   msr     TPIDR2_EL0, x0
-  ret
 
 2:
   // Do nothing
@@ -296,7 +308,7 @@ END_COMPILERRT_FUNCTION(__arm_sme_save)
 
 DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
   .cfi_startproc
-  .variant_pcs __arm_sme_save
+  .variant_pcs __arm_sme_restore
   BTI_C
 
   stp     x29, x30, [sp, #-16]!
@@ -312,16 +324,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
 
   // If the VALID bit is 0, return early.
   ldr     x16, [x0]
-  tbz     x16, #0, 2f
+  cbz     x16, 2f
 
   // If SME is not available, abort.
-  adrp    x16, CPU_FEATS_SYMBOL
-  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz     x16, #FEAT_SME_BIT, 3f
+  adrp    x17, CPU_FEATS_SYMBOL
+  ldr     x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+  tbz     x17, #FEAT_SME_BIT, 3f
 
   // If TPIDR2_EL0 != nullptr, no lazy-save was committed; try to reload zt0.
   mrs     x16, TPIDR2_EL0
-  cbnz    x16, 0f
+  cbnz    x16, 1f
 
   // If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
   // abort.
@@ -330,20 +342,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
 
   // Restore za.
   smstart za
-  mov     x16, x0
   add     x0, x0, #16
   bl      __arm_tpidr2_restore
-  mov     x0, x16
-  msr     TPIDR2_EL0, xzr
+  sub     x0, x0, #16
 
-0:
+1:
   smstart za
+  msr     TPIDR2_EL0, xzr
 
-1:
   // Check if zt0 needs restoring.
-  adrp    x16, CPU_FEATS_SYMBOL
-  ldr     x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
-  tbz     x16, #FEAT_SME2_BIT, 2f
+  tbz     x17, #FEAT_SME2_BIT, 2f
 
   // Restore zt0.
   add     x16, x0, #32
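
The layout diagram added in this patch maps onto the following hypothetical
C view of the whole buffer (SME2 case; without FEAT_SME2 there is no ZT0
slot and the ZA buffer starts at offset 32):

  #include <stdint.h>

  struct sme_state_buffer {     // must be 16-byte aligned
    uint64_t valid;             // @0:  bit 0 = VALID, bits 63-1 reserved
    uint64_t reserved;          // @8:  bits 127-64 of the state word, zero
    uint8_t  tpidr2_block[16];  // @16: za_save_buffer + num_za_save_slices
    uint8_t  zt0[64];           // @32: ZT0 contents (FEAT_SME2 only)
    uint8_t  za[];              // @96: SVL_B x SVL_B bytes of ZA
  };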

>From 7b50c7a3d740b3042c129fa3ef86ef9a40accdd8 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Tue, 17 Dec 2024 15:52:35 +0000
Subject: [PATCH 4/4] Further simplification

---
 compiler-rt/lib/builtins/aarch64/sme-abi.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 855099427e2684..b70c7ee1fe991a 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -283,9 +283,8 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
   add     x18, x0, #32
   tbz     x17, #FEAT_SME2_BIT, 1f
 
-  // Store ZT0 and ZA
-  add     x16, x0, #32
-  str     zt0, [x16]
+  // Store ZT0
+  str     zt0, [x18]
   add     x18, x18, #64
   b       1f
 

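Taken together, the final control flow of __arm_sme_save after these four
patches reads roughly like the C pseudocode below. The helper functions are
hypothetical stand-ins for the MRS accesses and __aarch64_cpu_features bit
tests in the assembly, not real APIs:

  #include <stdint.h>
  #include <stdlib.h>

  // Hypothetical helpers standing in for system-register reads/writes and
  // CPU-feature bit tests; declared here only to make the sketch complete.
  extern int cpu_has_sme(void), cpu_has_sme2(void), za_is_active(void);
  extern uint64_t read_tpidr2(void), read_svl_bytes(void);
  extern void store_zt0(void *zt0_slot);
  extern void setup_tpidr2_block(void *blk, void *za_buf, uint64_t slices);
  extern void write_tpidr2(void *blk);

  void arm_sme_save_sketch(uint64_t *buf) {
    if ((uintptr_t)buf & 0xF)
      abort();                             // must be 16-byte aligned
    buf[0] = buf[1] = 0;                   // clear VALID + reserved bits
    if (!cpu_has_sme() || !za_is_active() || read_tpidr2() != 0)
      return;                              // nothing to save
    buf[0] = 1;                            // set the VALID bit
    uint8_t *za_buffer = (uint8_t *)buf + 32;
    if (cpu_has_sme2()) {
      store_zt0(za_buffer);                // ZT0 goes at offset 32
      za_buffer += 64;                     // ZA buffer moves to offset 96
    }
    // Materialise the TPIDR2 block at offset 16 and commit the lazy-save.
    setup_tpidr2_block((uint8_t *)buf + 16, za_buffer, read_svl_bytes());
    write_tpidr2((uint8_t *)buf + 16);
  }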

