[compiler-rt] [Compiler-rt] Add AArch64 routines for __arm_agnostic("sme_za_state") (PR #120059)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 20 04:34:28 PST 2024
https://github.com/sdesmalen-arm updated https://github.com/llvm/llvm-project/pull/120059
>From 12e18154affece2c835dd83b1acf7b569efcc3f7 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 9 Sep 2024 14:06:51 +0100
Subject: [PATCH 1/5] [Compiler-rt] Add AArch64 routines for
__arm_agnostic("sme_za_state")
The specification of these routines can be found here:
https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#sme-support-routines
---
compiler-rt/cmake/builtin-config-ix.cmake | 3 +-
.../lib/builtins/aarch64/sme-abi-assert.c | 1 +
compiler-rt/lib/builtins/aarch64/sme-abi.S | 159 +++++++++++++++++-
3 files changed, 161 insertions(+), 2 deletions(-)
diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake
index 1f63e158409ca4..706a1ff7eeb6db 100644
--- a/compiler-rt/cmake/builtin-config-ix.cmake
+++ b/compiler-rt/cmake/builtin-config-ix.cmake
@@ -43,8 +43,9 @@ asm(\"cas w0, w1, [x2]\");
builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME
"
void foo(void) __arm_streaming_compatible {
- asm(\".arch armv9-a+sme\");
+ asm(\".arch armv9-a+sme2\");
asm(\"smstart\");
+ asm(\"ldr zt0, [sp]\");
}
")
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c b/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
index 4333353f8d2d1b..37305ceb39c50f 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi-assert.c
@@ -8,3 +8,4 @@
#include "../cpu_model/AArch64CPUFeatures.inc"
_Static_assert(FEAT_SVE == 30, "sme-abi.S assumes FEAT_SVE = 30");
_Static_assert(FEAT_SME == 42, "sme-abi.S assumes FEAT_SME = 42");
+_Static_assert(FEAT_SME2 == 57, "sme-abi.S assumes FEAT_SME2 = 57");
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 45bd221655fd66..90b3f1bf180ffa 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -10,6 +10,8 @@
.set FEAT_SVE_BIT, 30
.set FEAT_SME_BIT, 42
+.set FEAT_SME2_BIT, 57
+.set FEAT_SME2_MASK, 1 << 57
.set SVCR_PSTATE_SM_BIT, 0
#if !defined(__APPLE__)
@@ -22,7 +24,7 @@
#define CPU_FEATS_SYMBOL_OFFSET SYMBOL_NAME(__aarch64_cpu_features)@pageoff
#endif
-.arch armv9-a+sme
+.arch armv9-a+sme2
// Utility function which calls a system's abort() routine. Because the function
// is streaming-compatible it should disable streaming-SVE mode before calling
@@ -204,6 +206,161 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
ret
END_COMPILERRT_FUNCTION(__arm_get_current_vg)
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
+ .variant_pcs __arm_sme_state_size
+ BTI_C
+
+ // Test if SME is available and PSTATE = 1.
+ adrp x16, CPU_FEATS_SYMBOL
+ ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+ tbz x16, #FEAT_SME_BIT, 0f
+ mrs x16, SVCR
+ tbz x16, #1, 0f
+
+ // Size = HAS_FEAT_SME2 ? 32 : 96
+ adrp x16, CPU_FEATS_SYMBOL
+ ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+ tst x16, #FEAT_SME2_MASK
+ mov w17, #32
+ mov w16, #96
+ csel x16, x17, x16, eq
+
+ // Size = Size + (SVLB * SVLB)
+ rdsvl x17, #1
+ madd x0, x17, x17, x16
+ ret
+
+0:
+ // Default case, 16 bytes is minimum (to encode VALID bit, multiple of 16 bytes)
+ mov w0, #16
+ ret
+END_COMPILERRT_FUNCTION(__arm_sme_state_size)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
+ .variant_pcs __arm_sme_save
+ BTI_C
+
+ // Clear internal state bits
+ stp xzr, xzr, [x0]
+
+ // If PTR is not 16-byte aligned, abort.
+ tst x0, #0xF
+ b.ne 3f
+
+ // If SME is not available, PSTATE.ZA = 0 or TPIDR2_EL0 != 0, return.
+ adrp x16, CPU_FEATS_SYMBOL
+ ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+ tbz x16, #FEAT_SME_BIT, 2f
+ mrs x16, SVCR
+ tbz x16, #1, 2f
+ mrs x16, TPIDR2_EL0
+ cbnz x16, 2f
+
+ // ZA or ZT0 need saving, so we can now set the internal VALID bit to 1.
+ mov w16, #1
+ str x16, [x0]
+
+ adrp x16, CPU_FEATS_SYMBOL
+ ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+ tbz x16, #FEAT_SME2_BIT, 0f
+
+ // Store ZT0 and ZA
+ add x16, x0, #32
+ str zt0, [x16]
+ add x18, x0, #96
+ b 1f
+
+0:
+ // Has SME only
+ add x18, x0, #32
+
+1:
+ // Set up lazy-save (x18 = pointer to buffer)
+ rdsvl x17, #1
+ str x18, [x0, #16]!
+ strh w17, [x0, #8]
+ stur wzr, [x0, #10]
+ strh wzr, [x0, #14]
+ msr TPIDR2_EL0, x0
+ ret
+
+2:
+ // Do nothing
+ ret
+
+3:
+ b SYMBOL_NAME(do_abort)
+END_COMPILERRT_FUNCTION(__arm_sme_save)
+
+DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
+ .cfi_startproc
+ .variant_pcs __arm_sme_save
+ BTI_C
+
+ stp x29, x30, [sp, #-16]!
+ .cfi_def_cfa_offset 16
+ mov x29, sp
+ .cfi_def_cfa w29, 16
+ .cfi_offset w30, -8
+ .cfi_offset w29, -16
+
+ // If PTR is not 16-byte aligned, abort.
+ tst x0, #0xF
+ b.ne 3f
+
+ // If the VALID bit is 0, return early.
+ ldr x16, [x0]
+ tbz x16, #0, 2f
+
+ // If SME is not available, abort.
+ adrp x16, CPU_FEATS_SYMBOL
+ ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+ tbz x16, #FEAT_SME_BIT, 3f
+
+ // If TPIDR2_EL0 != nullptr, no lazy-save was committed, try to reload zt0.
+ mrs x16, TPIDR2_EL0
+ cbnz x16, 0f
+
+ // If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
+ // abort.
+ mrs x16, SVCR
+ tbnz x16, #1, 3f
+
+ // Restore za.
+ smstart za
+ mov x16, x0
+ add x0, x0, #16
+ bl __arm_tpidr2_restore
+ mov x0, x16
+ msr TPIDR2_EL0, xzr
+
+0:
+ smstart za
+
+1:
+ // Check if zt0 needs restoring.
+ adrp x16, CPU_FEATS_SYMBOL
+ ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
+ tbz x16, #FEAT_SME2_BIT, 2f
+
+ // Restore zt0.
+ add x16, x0, #32
+ ldr zt0, [x16]
+
+2:
+ // Common exit: tear down the frame record and return.
+ .cfi_def_cfa wsp, 16
+ ldp x29, x30, [sp], #16
+ .cfi_def_cfa_offset 0
+ .cfi_restore w30
+ .cfi_restore w29
+ ret
+
+3:
+ b SYMBOL_NAME(do_abort)
+ .cfi_endproc
+END_COMPILERRT_FUNCTION(__arm_sme_restore)
+
NO_EXEC_STACK_DIRECTIVE
// GNU property note for BTI and PAC
>From 88e7dc405d108daa5bb7511c0872837ae9a3f738 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Mon, 16 Dec 2024 12:13:47 +0000
Subject: [PATCH 2/5] Address comments
---
compiler-rt/lib/builtins/aarch64/sme-abi.S | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 90b3f1bf180ffa..61c2fbbd801acb 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -210,14 +210,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
.variant_pcs __arm_sme_state_size
BTI_C
- // Test if SME is available and PSTATE = 1.
+ // Test if SME is available and ZA state is 'active'.
adrp x16, CPU_FEATS_SYMBOL
ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
tbz x16, #FEAT_SME_BIT, 0f
mrs x16, SVCR
tbz x16, #1, 0f
+ mrs x16, TPIDR2_EL0
+ cbnz x16, 0f
- // Size = HAS_FEAT_SME2 ? 32 : 96
+ // Size = HAS_FEAT_SME2 ? 96 : 32
adrp x16, CPU_FEATS_SYMBOL
ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
tst x16, #FEAT_SME2_MASK
>From aaecb05f6bf926578c6f34efc67b79e23174c66c Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Tue, 17 Dec 2024 15:36:15 +0000
Subject: [PATCH 3/5] Address more comments
---
compiler-rt/lib/builtins/aarch64/sme-abi.S | 78 ++++++++++++----------
1 file changed, 43 insertions(+), 35 deletions(-)
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 61c2fbbd801acb..855099427e2684 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -206,23 +206,41 @@ DEFINE_COMPILERRT_FUNCTION(__arm_get_current_vg)
ret
END_COMPILERRT_FUNCTION(__arm_get_current_vg)
+// The diagram below describes the layout used in the following routines:
+// * __arm_sme_state_size
+// * __arm_sme_save
+// * __arm_sme_restore
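+//
+// The layout is shown for a target with SME2. Without SME2 there is no ZT0
+// slot and the ZA buffer starts at @32 instead of @96.
+//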
+// +---------------------------------+
+// | ... |
+// | ZA buffer |
+// | ... |
+// +---------------------------------+ <- @96
+// | ZT0 contents |
+// +---------------------------------+ <- @32
+// | byte 15-10: zero (reserved) |
+// | byte 9-8: num_za_save_slices | TPIDR2 block
+// | byte 7-0: za_save_buffer |
+// +---------------------------------+ <- @16
+// | bit 127-1: zero (reserved) | Internal state for __arm_sme_save/restore
+// | bit 0: VALID |
+// +---------------------------------+ <- @0
+
DEFINE_COMPILERRT_FUNCTION(__arm_sme_state_size)
.variant_pcs __arm_sme_state_size
BTI_C
// Test if SME is available and ZA state is 'active'.
- adrp x16, CPU_FEATS_SYMBOL
- ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
- tbz x16, #FEAT_SME_BIT, 0f
+ adrp x17, CPU_FEATS_SYMBOL
+ ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+ tbz x17, #FEAT_SME_BIT, 0f
mrs x16, SVCR
tbz x16, #1, 0f
mrs x16, TPIDR2_EL0
cbnz x16, 0f
// Size = HAS_FEAT_SME2 ? 96 : 32
- adrp x16, CPU_FEATS_SYMBOL
- ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
- tst x16, #FEAT_SME2_MASK
+ tst x17, #FEAT_SME2_MASK
mov w17, #32
mov w16, #96
csel x16, x17, x16, eq
@@ -242,17 +260,17 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
.variant_pcs __arm_sme_save
BTI_C
- // Clear internal state bits
- stp xzr, xzr, [x0]
-
// If PTR is not 16-byte aligned, abort.
tst x0, #0xF
b.ne 3f
+ // Clear internal state bits
+ stp xzr, xzr, [x0]
+
// If SME is not available, PSTATE.ZA = 0 or TPIDR2_EL0 != 0, return.
- adrp x16, CPU_FEATS_SYMBOL
- ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
- tbz x16, #FEAT_SME_BIT, 2f
+ adrp x17, CPU_FEATS_SYMBOL
+ ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+ tbz x17, #FEAT_SME_BIT, 2f
mrs x16, SVCR
tbz x16, #1, 2f
mrs x16, TPIDR2_EL0
@@ -262,20 +280,15 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
mov w16, #1
str x16, [x0]
- adrp x16, CPU_FEATS_SYMBOL
- ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
- tbz x16, #FEAT_SME2_BIT, 0f
+ add x18, x0, #32
+ tbz x17, #FEAT_SME2_BIT, 1f
// Store ZT0 and ZA
add x16, x0, #32
str zt0, [x16]
- add x18, x0, #96
+ add x18, x18, #64
b 1f
-0:
- // Has SME only
- add x18, x0, #32
-
1:
// Set up lazy-save (x18 = pointer to buffer)
rdsvl x17, #1
@@ -284,7 +297,6 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
stur wzr, [x0, #10]
strh wzr, [x0, #14]
msr TPIDR2_EL0, x0
- ret
2:
// Do nothing
@@ -296,7 +308,7 @@ END_COMPILERRT_FUNCTION(__arm_sme_save)
DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
.cfi_startproc
- .variant_pcs __arm_sme_save
+ .variant_pcs __arm_sme_restore
BTI_C
stp x29, x30, [sp, #-16]!
@@ -312,16 +324,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
// If the VALID bit is 0, return early.
ldr x16, [x0]
- tbz x16, #0, 2f
+ cbz x16, 2f
// If SME is not available, abort.
- adrp x16, CPU_FEATS_SYMBOL
- ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
- tbz x16, #FEAT_SME_BIT, 3f
+ adrp x17, CPU_FEATS_SYMBOL
+ ldr x17, [x17, CPU_FEATS_SYMBOL_OFFSET]
+ tbz x17, #FEAT_SME_BIT, 3f
// If TPIDR2_EL0 != nullptr, no lazy-save was committed, try to reload zt0.
mrs x16, TPIDR2_EL0
- cbnz x16, 0f
+ cbnz x16, 1f
// If TPIDR2_EL0 == nullptr and PSTATE.ZA = 1 (<=> ZA state is 'active'),
// abort.
@@ -330,20 +342,16 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_restore)
// Restore za.
smstart za
- mov x16, x0
add x0, x0, #16
bl __arm_tpidr2_restore
- mov x0, x16
- msr TPIDR2_EL0, xzr
+ sub x0, x0, #16
-0:
+1:
smstart za
+ msr TPIDR2_EL0, xzr
-1:
// Check if zt0 needs restoring.
- adrp x16, CPU_FEATS_SYMBOL
- ldr x16, [x16, CPU_FEATS_SYMBOL_OFFSET]
- tbz x16, #FEAT_SME2_BIT, 2f
+ tbz x17, #FEAT_SME2_BIT, 2f
// Restore zt0.
add x16, x0, #32
>From 7b50c7a3d740b3042c129fa3ef86ef9a40accdd8 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Tue, 17 Dec 2024 15:52:35 +0000
Subject: [PATCH 4/5] Further simplification
---
compiler-rt/lib/builtins/aarch64/sme-abi.S | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 855099427e2684..b70c7ee1fe991a 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -283,9 +283,8 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
add x18, x0, #32
tbz x17, #FEAT_SME2_BIT, 1f
- // Store ZT0 and ZA
- add x16, x0, #32
- str zt0, [x16]
+ // Store ZT0
+ str zt0, [x18]
add x18, x18, #64
b 1f
>From c0dc2a4522b234f19f332ca10ab91dde6918da36 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Fri, 20 Dec 2024 12:08:41 +0000
Subject: [PATCH 5/5] Further simplification
---
compiler-rt/lib/builtins/aarch64/sme-abi.S | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index b70c7ee1fe991a..8dbbe061edb9bf 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -286,15 +286,14 @@ DEFINE_COMPILERRT_FUNCTION(__arm_sme_save)
// Store ZT0
str zt0, [x18]
add x18, x18, #64
- b 1f
1:
// Set up lazy-save (x18 = pointer to buffer)
rdsvl x17, #1
str x18, [x0, #16]!
strh w17, [x0, #8]
- stur wzr, [x0, #10]
- strh wzr, [x0, #14]
+ strh wzr, [x0, #10]
+ str wzr, [x0, #12]
msr TPIDR2_EL0, x0
2:
More information about the llvm-commits mailing list