[llvm] [AArch64][SME] Add missing SMStartStop regmasks (PR #68458)

Jon Roelofs via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 6 16:27:13 PDT 2023


https://github.com/jroelofs created https://github.com/llvm/llvm-project/pull/68458

Without these regmasks, the register allocator doesn't know that the SMSTART/SMSTOP nodes clobber the callee-saved NEON registers (among other things).
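As a concrete illustration, the smallest case is the updated sme-toggle-pstateza.ll test at the end of the patch; its IR is essentially the following (intrinsic declarations reconstructed here to make the sketch self-contained):

  declare void @llvm.aarch64.sme.za.enable()
  declare void @llvm.aarch64.sme.za.disable()

  ; With the regmask attached to the SMSTART/SMSTOP nodes, the register
  ; allocator treats the callee-saved NEON registers as clobbered at the
  ; smstart/smstop, so d8-d15 are now spilled in the prologue and reloaded
  ; in the epilogue (see the updated CHECK lines in the test diff below).
  define void @toggle_pstate_za() {
    call void @llvm.aarch64.sme.za.enable()
    call void @llvm.aarch64.sme.za.disable()
    ret void
  }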

From 5042481ea786929841096e9a2c48cfe4b789a032 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Fri, 6 Oct 2023 16:15:09 -0700
Subject: [PATCH] [AArch64][SME] Add missing SMStartStop regmasks

Without these regmasks, the register allocator doesn't know that the SMSTART/SMSTOP
nodes clobber the callee-saved NEON registers (among other things).
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  10 +-
 .../AArch64/sme-disable-gisel-fisel.ll        | 112 ++++++++++++------
 .../CodeGen/AArch64/sme-lazy-save-call.ll     |  84 ++++++++-----
 .../AArch64/sme-shared-za-interface.ll        |  52 +++++---
 .../CodeGen/AArch64/sme-toggle-pstateza.ll    |  17 +++
 5 files changed, 191 insertions(+), 84 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9cda43e58d27a43..b073d0347dbcaf9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4819,6 +4819,7 @@ SDValue AArch64TargetLowering::getPStateSM(SelectionDAG &DAG, SDValue Chain,
 
 SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                                    SelectionDAG &DAG) const {
+  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   unsigned IntNo = Op.getConstantOperandVal(1);
   SDLoc DL(Op);
   switch (IntNo) {
@@ -4845,13 +4846,15 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         AArch64ISD::SMSTART, DL, MVT::Other,
         Op->getOperand(0), // Chain
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
+        DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
   case Intrinsic::aarch64_sme_za_disable:
     return DAG.getNode(
         AArch64ISD::SMSTOP, DL, MVT::Other,
         Op->getOperand(0), // Chain
         DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+        DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
+        DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
   }
 }
 
@@ -7850,7 +7853,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       Result = DAG.getNode(
           AArch64ISD::SMSTART, DL, MVT::Other, Result,
           DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
-          DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));
+          DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64),
+          DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask()));
 
       // Conditionally restore the lazy save using a pseudo node.
       unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 885cd7b0b0947da..fc1104412519583 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -212,14 +212,19 @@ declare double @za_shared_callee(double) "aarch64_pstate_za_shared"
 define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_pstate_za_new"{
 ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
 ; CHECK-COMMON:       // %bb.0: // %prelude
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    mov x29, sp
-; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    add x29, sp, #64
+; CHECK-COMMON-NEXT:    sub sp, sp, #32
+; CHECK-COMMON-NEXT:    stur d0, [x29, #-88] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
 ; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-80]
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-COMMON-NEXT:    cbz x8, .LBB6_2
 ; CHECK-COMMON-NEXT:    b .LBB6_1
@@ -230,13 +235,20 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:  .LBB6_2: // %entry
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    zero {za}
+; CHECK-COMMON-NEXT:    ldur d0, [x29, #-88] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    bl za_shared_callee
 ; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
 ; CHECK-COMMON-NEXT:    fmov d1, x8
 ; CHECK-COMMON-NEXT:    fadd d0, d0, d1
+; CHECK-COMMON-NEXT:    stur d0, [x29, #-96] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstop za
-; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldur d0, [x29, #-96] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    sub sp, x29, #64
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
 entry:
   %call = call double @za_shared_callee(double %x)
@@ -247,21 +259,26 @@ entry:
 define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_pstate_za_shared"{
 ; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee:
 ; CHECK-COMMON:       // %bb.0: // %entry
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    mov x29, sp
-; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    add x29, sp, #64
+; CHECK-COMMON-NEXT:    sub sp, sp, #32
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
 ; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
-; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    sub x8, x29, #16
+; CHECK-COMMON-NEXT:    stur x9, [x29, #-80]
+; CHECK-COMMON-NEXT:    sturh w8, [x29, #-72]
+; CHECK-COMMON-NEXT:    sub x8, x29, #80
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
 ; CHECK-COMMON-NEXT:    bl normal_callee
+; CHECK-COMMON-NEXT:    stur d0, [x29, #-88] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-COMMON-NEXT:    sub x0, x29, #16
+; CHECK-COMMON-NEXT:    sub x0, x29, #80
 ; CHECK-COMMON-NEXT:    cbz x8, .LBB7_1
 ; CHECK-COMMON-NEXT:    b .LBB7_2
 ; CHECK-COMMON-NEXT:  .LBB7_1: // %entry
@@ -270,10 +287,15 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:  .LBB7_2: // %entry
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
-; CHECK-COMMON-NEXT:    fmov d1, x8
-; CHECK-COMMON-NEXT:    fadd d0, d0, d1
-; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    fmov d0, x8
+; CHECK-COMMON-NEXT:    ldur d1, [x29, #-88] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    fadd d0, d1, d0
+; CHECK-COMMON-NEXT:    sub sp, x29, #64
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
 entry:
   %call = call double @normal_callee(double %x)
@@ -285,28 +307,38 @@ entry:
 define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwind {
 ; CHECK-COMMON-LABEL: f128_call_za:
 ; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    mov x29, sp
-; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    add x29, sp, #64
+; CHECK-COMMON-NEXT:    sub sp, sp, #32
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
 ; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
-; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
+; CHECK-COMMON-NEXT:    stur x9, [x29, #-80]
+; CHECK-COMMON-NEXT:    sub x9, x29, #80
+; CHECK-COMMON-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl __addtf3
+; CHECK-COMMON-NEXT:    stur q0, [x29, #-96] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-COMMON-NEXT:    sub x0, x29, #16
+; CHECK-COMMON-NEXT:    sub x0, x29, #80
 ; CHECK-COMMON-NEXT:    cbnz x8, .LBB8_2
 ; CHECK-COMMON-NEXT:  // %bb.1:
 ; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-COMMON-NEXT:  .LBB8_2:
+; CHECK-COMMON-NEXT:    ldur q0, [x29, #-96] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    sub sp, x29, #64
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
   %res = fadd fp128 %a, %b
   ret fp128 %res
@@ -345,28 +377,38 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
 define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nounwind {
 ; CHECK-COMMON-LABEL: frem_call_za:
 ; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    mov x29, sp
-; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    add x29, sp, #64
+; CHECK-COMMON-NEXT:    sub sp, sp, #32
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
 ; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
-; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
+; CHECK-COMMON-NEXT:    stur x9, [x29, #-80]
+; CHECK-COMMON-NEXT:    sub x9, x29, #80
+; CHECK-COMMON-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl fmod
+; CHECK-COMMON-NEXT:    stur d0, [x29, #-88] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-COMMON-NEXT:    sub x0, x29, #16
+; CHECK-COMMON-NEXT:    sub x0, x29, #80
 ; CHECK-COMMON-NEXT:    cbnz x8, .LBB10_2
 ; CHECK-COMMON-NEXT:  // %bb.1:
 ; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-COMMON-NEXT:  .LBB10_2:
+; CHECK-COMMON-NEXT:    ldur d0, [x29, #-88] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    sub sp, x29, #64
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
   %res = frem double %a, %b
   ret double %res
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 6757af01278bd9b..a46820b3a08ba05 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -9,28 +9,36 @@ declare float @llvm.cos.f32(float)
 define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" {
 ; CHECK-LABEL: test_lazy_save_1_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stur x9, [x29, #-16]
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    sub x9, x29, #80
+; CHECK-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    sub x0, x29, #80
 ; CHECK-NEXT:    cbnz x8, .LBB0_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @private_za_callee()
   ret void
@@ -40,41 +48,49 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" {
 define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" {
 ; CHECK-LABEL: test_lazy_save_2_callees:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x19, #1
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    msub x8, x19, x19, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    sub x20, x29, #16
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    sturh w19, [x29, #-8]
+; CHECK-NEXT:    sub x20, x29, #80
+; CHECK-NEXT:    stur x8, [x29, #-80]
+; CHECK-NEXT:    sturh w19, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x20
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    sub x0, x29, #80
 ; CHECK-NEXT:    cbnz x8, .LBB1_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    sturh w19, [x29, #-8]
+; CHECK-NEXT:    sturh w19, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x20
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    sub x0, x29, #80
 ; CHECK-NEXT:    cbnz x8, .LBB1_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @private_za_callee()
   call void @private_za_callee()
@@ -85,28 +101,38 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" {
 define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" {
 ; CHECK-LABEL: test_lazy_save_expanded_intrinsic:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stur x9, [x29, #-16]
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    sub x9, x29, #80
+; CHECK-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl cosf
+; CHECK-NEXT:    stur s0, [x29, #-84] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    sub x0, x29, #80
 ; CHECK-NEXT:    cbnz x8, .LBB2_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEXT:  .LBB2_2:
+; CHECK-NEXT:    ldur s0, [x29, #-84] // 4-byte Folded Reload
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call float @llvm.cos.f32(float %a)
   ret float %res
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 0ac2b21c6aba360..9dccec5a99da148 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -7,28 +7,36 @@ declare void @private_za_callee()
 define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind {
 ; CHECK-LABEL: disable_tailcallopt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stur x9, [x29, #-16]
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    sub x9, x29, #80
+; CHECK-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    sub x0, x29, #80
 ; CHECK-NEXT:    cbnz x8, .LBB0_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   tail call void @private_za_callee()
   ret void
@@ -38,28 +46,38 @@ define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind {
 define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwind {
 ; CHECK-LABEL: f128_call_za:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #64
+; CHECK-NEXT:    sub sp, sp, #32
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stur x9, [x29, #-16]
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    sub x9, x29, #80
+; CHECK-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __addtf3
+; CHECK-NEXT:    stur q0, [x29, #-96] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    sub x0, x29, #80
 ; CHECK-NEXT:    cbnz x8, .LBB1_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEXT:  .LBB1_2:
+; CHECK-NEXT:    ldur q0, [x29, #-96] // 16-byte Folded Reload
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = fadd fp128 %a, %b
   ret fp128 %res
diff --git a/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll b/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll
index 3c50ab54e561e69..6dfcf5989237f2b 100644
--- a/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll
+++ b/llvm/test/CodeGen/AArch64/sme-toggle-pstateza.ll
@@ -4,8 +4,25 @@
 define void @toggle_pstate_za() {
 ; CHECK-LABEL: toggle_pstate_za:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset b8, -8
+; CHECK-NEXT:    .cfi_offset b9, -16
+; CHECK-NEXT:    .cfi_offset b10, -24
+; CHECK-NEXT:    .cfi_offset b11, -32
+; CHECK-NEXT:    .cfi_offset b12, -40
+; CHECK-NEXT:    .cfi_offset b13, -48
+; CHECK-NEXT:    .cfi_offset b14, -56
+; CHECK-NEXT:    .cfi_offset b15, -64
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @llvm.aarch64.sme.za.enable()
   call void @llvm.aarch64.sme.za.disable()


