[llvm] [AArch64][SME2] Preserve ZT0 state around function calls (PR #76968)
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 8 06:21:13 PST 2024
================
@@ -0,0 +1,306 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+
+; Normal callee, no ZT state
+declare void @normal_callee();
+
+; Callees with ZT state
+declare void @za_shared_callee() "aarch64_pstate_za_shared" "aarch64_sme_pstate_zt0_shared";
+declare void @za_new_callee() "aarch64_pstate_za_new" "aarch64_sme_pstate_zt0_new";
+
+; Callee with preserved ZT state
+declare void @za_preserved_callee() "aarch64_pstate_za_preserved" "aarch64_sme_pstate_zt0_preserved";
+
+
+define void @za_zt_new_caller_normal_callee() "aarch64_pstate_za_new" "aarch64_sme_pstate_zt0_new" nounwind {
+; CHECK-LABEL: za_zt_new_caller_normal_callee:
+; CHECK: // %bb.0: // %prelude
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB0_2
+; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: zero { zt0 }
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: sub x19, x29, #80
+; CHECK-NEXT: sturh w8, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: bl normal_callee
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB0_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @normal_callee();
+ ret void;
+}
+
+define void @za_zt_new_caller_za_callee() "aarch64_pstate_za_new" "aarch64_sme_pstate_zt0_new" nounwind {
+; CHECK-LABEL: za_zt_new_caller_za_callee:
+; CHECK: // %bb.0: // %prelude
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #144
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB1_2
+; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB1_2:
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: zero { zt0 }
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: sub x19, x29, #80
+; CHECK-NEXT: sturh w8, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: bl za_new_callee
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB1_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: sub x8, x29, #144
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: str zt0, [x8]
+; CHECK-NEXT: bl za_shared_callee
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @za_new_callee();
+ call void @za_shared_callee();
+ ret void;
+}
+
+define void @za_zt_shared_caller_normal_callee() "aarch64_pstate_za_shared" "aarch64_sme_pstate_zt0_shared" nounwind {
+; CHECK-LABEL: za_zt_shared_caller_normal_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: sub x10, x29, #16
+; CHECK-NEXT: sub x19, x29, #80
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: sturh w8, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: bl normal_callee
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB2_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @normal_callee();
+ ret void;
+}
+
+define void @za_zt_shared_caller_za_callee() "aarch64_pstate_za_shared" "aarch64_sme_pstate_zt0_shared" nounwind {
+; CHECK-LABEL: za_zt_shared_caller_za_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #144
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: sub x10, x29, #16
+; CHECK-NEXT: sub x19, x29, #80
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: sturh w8, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: str zt0, [x19]
+; CHECK-NEXT: bl za_new_callee
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB3_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB3_2:
+; CHECK-NEXT: sub x8, x29, #144
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: str zt0, [x8]
+; CHECK-NEXT: bl za_shared_callee
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @za_new_callee();
+ call void @za_shared_callee();
+ ret void;
+}
+
+define void @za_zt_new_caller_za_preserved_callee() "aarch64_pstate_za_new" "aarch64_sme_pstate_zt0_new" nounwind {
+; CHECK-LABEL: za_zt_new_caller_za_preserved_callee:
+; CHECK: // %bb.0: // %prelude
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: cbz x8, .LBB4_2
+; CHECK-NEXT: // %bb.1: // %save.za
+; CHECK-NEXT: bl __arm_tpidr2_save
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB4_2:
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: zero {za}
+; CHECK-NEXT: zero { zt0 }
+; CHECK-NEXT: sturh wzr, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: bl za_preserved_callee
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: smstop za
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @za_preserved_callee();
+ ret void;
+}
+
+define void @za_zt_shared_caller_za_preserved_callee() "aarch64_pstate_za_shared" "aarch64_sme_pstate_zt0_shared" nounwind {
+; CHECK-LABEL: za_zt_shared_caller_za_preserved_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: stp x8, xzr, [x29, #-16]
+; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: bl za_preserved_callee
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @za_preserved_callee();
+ ret void;
+}
+
+define void @za_zt_preserved_caller_za_callee() "aarch64_pstate_za_preserved" "aarch64_sme_pstate_zt0_preserved" nounwind {
+; CHECK-LABEL: za_zt_preserved_caller_za_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl normal_callee
+; CHECK-NEXT: bl za_new_callee
+; CHECK-NEXT: bl za_shared_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @normal_callee();
+ call void @za_new_callee();
+ call void @za_shared_callee();
+ ret void;
+}
+
+define void @za_zt_preserved_caller_za_zt_preserved_callee() "aarch64_pstate_za_preserved" "aarch64_sme_pstate_zt0_preserved" nounwind {
+; CHECK-LABEL: za_zt_preserved_caller_za_zt_preserved_callee:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl za_preserved_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ call void @za_preserved_callee();
+ ret void;
+}
+
+define i32 @spill_fill_zt_load_start_chain(ptr %ptr) "aarch64_pstate_za_shared" "aarch64_sme_pstate_zt0_shared" {
+; CHECK-LABEL: spill_fill_zt_load_start_chain:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: sub x8, x29, #80
+; CHECK-NEXT: ldr w19, [x0]
+; CHECK-NEXT: str zt0, [x8]
+; CHECK-NEXT: bl za_shared_callee
+; CHECK-NEXT: mov w0, w19
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %loadval = load i32, ptr %ptr
+ call void @za_shared_callee()
+ ret i32 %loadval
+}
----------------
sdesmalen-arm wrote:
```
declare void @bar()
define void @foo() "aarch64_sme_pstate_zt0_shared" nounwind {
call void @bar()
ret void
}
```
This currently does not emit a fill of `zt0` after the call to `@bar`, even though `@bar` may clobber it:
```
foo: // @foo
// %bb.0:
sub sp, sp, #80
mov x8, sp
str x30, [sp, #64] // 8-byte Folded Spill
str zt0, [x8]
bl bar
ldr x30, [sp, #64] // 8-byte Folded Reload
add sp, sp, #80
ret
```
https://github.com/llvm/llvm-project/pull/76968
More information about the llvm-commits
mailing list