[llvm] [AArch64][SME] Support agnostic ZA functions in the MachineSMEABIPass (PR #149064)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 9 02:58:16 PDT 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/149064
>From 96872e0f90ec6b8ecfa1bb2c9e165f5bf2c76058 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 15 Jul 2025 11:47:39 +0000
Subject: [PATCH 01/11] [AArch64][SME] Support Windows/stack probes in
MachineSMEABIPass
On Windows or with stack probes on other targets, additional code needs
to be inserted after dynamic stack allocations to validate stack
accesses and/or ensure enough stack space has been allocated.
Rather than handle this case in the MachineSMEABIPass (like we do for
the standard case), we allocate the memory for the lazy save buffer in
SelectionDAG, which allows the existing expansions to emit the correct
code.
Note: This means in these cases, we may allocate a lazy save buffer when
there are no lazy saves present in the function (as we have to allocate
the buffer before the MachineSMEABIPass runs).
Change-Id: If89ab54c4de79f6fe5513a6b387e9e349f7bc7d1
---
.../Target/AArch64/AArch64ISelLowering.cpp | 11 ++++++-
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 3 +-
.../CodeGen/AArch64/sme-lazy-save-windows.ll | 31 ++++++++++++++++++-
3 files changed, 42 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 95695088bf19c..c73fcbbe94097 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8489,13 +8489,22 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- if (getTM().useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
+ if (getTM().useNewSMEABILowering()) {
if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
SDValue Size;
if (Attrs.hasZAState()) {
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));
Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+ } else if (Attrs.hasAgnosticZAInterface()) {
+ RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
+ SDValue Callee = DAG.getExternalSymbol(
+ getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+ auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ getLibcallCallingConv(LC), RetTy, Callee, {});
+ std::tie(Size, Chain) = LowerCallTo(CLI);
}
if (Size) {
SDValue Buffer = DAG.getNode(
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index d95d170a813ac..c9e31bf170417 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -303,8 +303,9 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
// buffer was allocated in SelectionDAG. It marks the end of the
// allocation -- which is a safe point for this pass to insert any TPIDR2
// block setup.
- if (MI.getOpcode() == AArch64::SMEStateAllocPseudo)
+ if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
State.AfterSMEProloguePt = MBBI;
+ }
auto [NeededState, InsertPt] = getZAStateBeforeInst(
*TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
assert((InsertPt == MBBI ||
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
index 1c341e8daf491..c20e028e9caf5 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
declare void @private_za_callee()
declare void @shared_za_callee() "aarch64_inout_za"
@@ -34,6 +34,35 @@ define void @test_lazy_save() nounwind "aarch64_inout_za" {
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: test_lazy_save:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x29, sp
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
+; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
+; CHECK-NEWLOWERING-NEXT: mul x9, x8, x8
+; CHECK-NEWLOWERING-NEXT: lsr x15, x9, #4
+; CHECK-NEWLOWERING-NEXT: bl __chkstk
+; CHECK-NEWLOWERING-NEXT: sub x9, sp, x15, lsl #4
+; CHECK-NEWLOWERING-NEXT: mov sp, x9
+; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
+; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEWLOWERING-NEXT: bl private_za_callee
+; CHECK-NEWLOWERING-NEXT: smstart za
+; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
+; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2
+; CHECK-NEWLOWERING-NEXT: // %bb.1:
+; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT: .LBB0_2:
+; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT: mov sp, x29
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
call void @private_za_callee()
ret void
}
>From f7d7fb88ce47cbbb897af213dcb0b0e331e82f9d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 27 Aug 2025 09:11:59 +0000
Subject: [PATCH 02/11] Update tests after rebase
Change-Id: I0c3703e432386814830492e46d896ac2395840fc
---
.../CodeGen/AArch64/sme-lazy-save-windows.ll | 31 +------------------
1 file changed, 1 insertion(+), 30 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
index c20e028e9caf5..1c341e8daf491 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s
declare void @private_za_callee()
declare void @shared_za_callee() "aarch64_inout_za"
@@ -34,35 +34,6 @@ define void @test_lazy_save() nounwind "aarch64_inout_za" {
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: test_lazy_save:
-; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x29, sp
-; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT: rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT: mul x9, x8, x8
-; CHECK-NEWLOWERING-NEXT: lsr x15, x9, #4
-; CHECK-NEWLOWERING-NEXT: bl __chkstk
-; CHECK-NEWLOWERING-NEXT: sub x9, sp, x15, lsl #4
-; CHECK-NEWLOWERING-NEXT: mov sp, x9
-; CHECK-NEWLOWERING-NEXT: sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT: bl private_za_callee
-; CHECK-NEWLOWERING-NEXT: smstart za
-; CHECK-NEWLOWERING-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT: sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT: cbnz x8, .LBB0_2
-; CHECK-NEWLOWERING-NEXT: // %bb.1:
-; CHECK-NEWLOWERING-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT: .LBB0_2:
-; CHECK-NEWLOWERING-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT: mov sp, x29
-; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp x30, x29, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ret
call void @private_za_callee()
ret void
}
>From b752ea52af8355d4d783ff6dff6b6d230f620f67 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 5 Sep 2025 12:15:28 +0000
Subject: [PATCH 03/11] Remove agnostic ZA handling
Change-Id: Id48a5e1cbf3f246165b41657372ff046b6ff0c84
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c73fcbbe94097..e65ea404f8e80 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8496,15 +8496,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));
Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
- } else if (Attrs.hasAgnosticZAInterface()) {
- RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
- SDValue Callee = DAG.getExternalSymbol(
- getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
- auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
- getLibcallCallingConv(LC), RetTy, Callee, {});
- std::tie(Size, Chain) = LowerCallTo(CLI);
}
if (Size) {
SDValue Buffer = DAG.getNode(
>From 40bfc78cfb06ec37a0ae1e448f7d535d85f682fd Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 5 Sep 2025 12:20:33 +0000
Subject: [PATCH 04/11] Remove braces
Change-Id: I6755e71c47d03fba8931876b8607289df30feaae
---
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index c9e31bf170417..d95d170a813ac 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -303,9 +303,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
// buffer was allocated in SelectionDAG. It marks the end of the
// allocation -- which is a safe point for this pass to insert any TPIDR2
// block setup.
- if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
+ if (MI.getOpcode() == AArch64::SMEStateAllocPseudo)
State.AfterSMEProloguePt = MBBI;
- }
auto [NeededState, InsertPt] = getZAStateBeforeInst(
*TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
assert((InsertPt == MBBI ||
>From c9f250b1da2a4483a2a6dc8577790111d474a9bc Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 15 Jul 2025 11:47:48 +0000
Subject: [PATCH 05/11] [AArch64][SME] Support agnostic ZA functions in the
MachineSMEABIPass
This extends the MachineSMEABIPass to handle agnostic ZA functions. This
case is currently handled like shared ZA functions, but we don't require
ZA state to be reloaded before agnostic ZA calls.
Note: This patch does not yet fully handle agnostic ZA functions that
can catch exceptions. E.g.:
```
__arm_agnostic("sme_za_state") void try_catch_agnostic_za_callee()
{
try {
agnostic_za_call();
} catch(...) {
noexcept_agnostic_za_call();
}
}
```
This is because, in this case, we won't commit a ZA save before the
`agnostic_za_call()`, which would be needed to restore ZA in the catch
block. This will be handled in a later patch.
Change-Id: I9cce7b42ec8b64d5442b35231b65dfaf9d149eed
---
.../Target/AArch64/AArch64ISelLowering.cpp | 11 +-
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 174 +++++++++++++++--
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 176 ++++++++++++++++--
3 files changed, 323 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e65ea404f8e80..4bde2d043c99b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9303,9 +9303,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
- // TODO: Handle agnostic ZA functions.
- if (!UseNewSMEABILowering || IsAgnosticZAFunction)
+ if (!UseNewSMEABILowering)
return std::nullopt;
+ if (IsAgnosticZAFunction) {
+ if (CallAttrs.requiresPreservingAllZAState())
+ return AArch64ISD::REQUIRES_ZA_SAVE;
+ return std::nullopt;
+ }
if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
return std::nullopt;
return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
@@ -9385,7 +9389,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
};
bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
- bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
+ bool RequiresSaveAllZA =
+ !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
if (RequiresLazySave) {
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index d95d170a813ac..49beefcb75495 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This pass implements the SME ABI requirements for ZA state. This includes
-// implementing the lazy ZA state save schemes around calls.
+// implementing the lazy (and agnostic) ZA state save schemes around calls.
//
//===----------------------------------------------------------------------===//
//
@@ -200,7 +200,7 @@ struct MachineSMEABI : public MachineFunctionPass {
/// Inserts code to handle changes between ZA states within the function.
/// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
- void insertStateChanges();
+ void insertStateChanges(bool IsAgnosticZA);
// Emission routines for private and shared ZA functions (using lazy saves).
void emitNewZAPrologue(MachineBasicBlock &MBB,
@@ -215,8 +215,41 @@ struct MachineSMEABI : public MachineFunctionPass {
void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
bool ClearTPIDR2);
+ // Emission routines for agnostic ZA functions.
+ void emitSetupFullZASave(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs);
+ void emitFullZASaveRestore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsSave);
+ void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs);
+
void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- ZAState From, ZAState To, LiveRegs PhysLiveRegs);
+ ZAState From, ZAState To, LiveRegs PhysLiveRegs,
+ bool IsAgnosticZA);
+
+ // Helpers for switching between lazy/full ZA save/restore routines.
+ void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+ if (IsAgnosticZA)
+ return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
+ return emitSetupLazySave(MBB, MBBI);
+ }
+ void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+ if (IsAgnosticZA)
+ return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
+ return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
+ }
+ void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+ if (IsAgnosticZA)
+ return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
+ return emitAllocateLazySaveBuffer(MBB, MBBI);
+ }
/// Save live physical registers to virtual registers.
PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
@@ -228,6 +261,8 @@ struct MachineSMEABI : public MachineFunctionPass {
/// Get or create a TPIDR2 block in this function.
TPIDR2State getTPIDR2Block();
+ Register getAgnosticZABufferPtr();
+
private:
/// Contains the needed ZA state (and live registers) at an instruction.
struct InstInfo {
@@ -241,6 +276,7 @@ struct MachineSMEABI : public MachineFunctionPass {
struct BlockInfo {
ZAState FixedEntryState{ZAState::ANY};
SmallVector<InstInfo> Insts;
+ LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
};
@@ -250,6 +286,9 @@ struct MachineSMEABI : public MachineFunctionPass {
SmallVector<ZAState> BundleStates;
std::optional<TPIDR2State> TPIDR2Block;
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
+ Register AgnosticZABufferPtr = AArch64::NoRegister;
+ LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
+ bool HasFullZASaveRestore = false;
} State;
MachineFunction *MF = nullptr;
@@ -261,7 +300,8 @@ struct MachineSMEABI : public MachineFunctionPass {
};
void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
- assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) &&
+ assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
+ SMEFnAttrs.hasZAState()) &&
"Expected function to have ZA/ZT0 state!");
State.Blocks.resize(MF->getNumBlockIDs());
@@ -295,6 +335,7 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
+ auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
for (MachineInstr &MI : reverse(MBB)) {
MachineBasicBlock::iterator MBBI(MI);
LiveUnits.stepBackward(MI);
@@ -303,8 +344,11 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
// buffer was allocated in SelectionDAG. It marks the end of the
// allocation -- which is a safe point for this pass to insert any TPIDR2
// block setup.
- if (MI.getOpcode() == AArch64::SMEStateAllocPseudo)
+ if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
State.AfterSMEProloguePt = MBBI;
+ State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
+ }
+ // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
auto [NeededState, InsertPt] = getZAStateBeforeInst(
*TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
assert((InsertPt == MBBI ||
@@ -313,6 +357,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
// TODO: Do something to avoid state changes where NZCV is live.
if (MBBI == FirstTerminatorInsertPt)
Block.PhysLiveRegsAtExit = PhysLiveRegs;
+ if (MBBI == FirstNonPhiInsertPt)
+ Block.PhysLiveRegsAtEntry = PhysLiveRegs;
if (NeededState != ZAState::ANY)
Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
}
@@ -379,7 +425,7 @@ void MachineSMEABI::assignBundleZAStates() {
}
}
-void MachineSMEABI::insertStateChanges() {
+void MachineSMEABI::insertStateChanges(bool IsAgnosticZA) {
for (MachineBasicBlock &MBB : *MF) {
const BlockInfo &Block = State.Blocks[MBB.getNumber()];
ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(),
@@ -392,7 +438,7 @@ void MachineSMEABI::insertStateChanges() {
for (auto &Inst : Block.Insts) {
if (CurrentState != Inst.NeededState)
emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
- Inst.PhysLiveRegs);
+ Inst.PhysLiveRegs, IsAgnosticZA);
CurrentState = Inst.NeededState;
}
@@ -403,7 +449,7 @@ void MachineSMEABI::insertStateChanges() {
State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)];
if (CurrentState != OutState)
emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
- Block.PhysLiveRegsAtExit);
+ Block.PhysLiveRegsAtExit, IsAgnosticZA);
}
}
@@ -617,10 +663,95 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
.addImm(1);
}
+Register MachineSMEABI::getAgnosticZABufferPtr() {
+ if (State.AgnosticZABufferPtr != AArch64::NoRegister)
+ return State.AgnosticZABufferPtr;
+ if (auto BufferPtr =
+ MF->getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
+ BufferPtr != AArch64::NoRegister)
+ State.AgnosticZABufferPtr = BufferPtr;
+ else
+ State.AgnosticZABufferPtr =
+ MF->getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ return State.AgnosticZABufferPtr;
+}
+
+void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs, bool IsSave) {
+ auto *TLI = Subtarget->getTargetLowering();
+ State.HasFullZASaveRestore = true;
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+ Register BufferPtr = AArch64::X0;
+
+ PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+ // Copy the buffer pointer into X0.
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
+ .addReg(getAgnosticZABufferPtr());
+
+ // Call __arm_sme_save/__arm_sme_restore.
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addReg(BufferPtr, RegState::Implicit)
+ .addExternalSymbol(TLI->getLibcallName(
+ IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE))
+ .addRegMask(TRI->getCallPreservedMask(
+ *MF,
+ CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+
+ restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
+void MachineSMEABI::emitAllocateFullZASaveBuffer(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ LiveRegs PhysLiveRegs) {
+ auto *AFI = MF->getInfo<AArch64FunctionInfo>();
+
+ // Buffer already allocated in SelectionDAG.
+ if (AFI->getEarlyAllocSMESaveBuffer())
+ return;
+
+ DebugLoc DL = getDebugLoc(MBB, MBBI);
+ Register BufferPtr = getAgnosticZABufferPtr();
+ Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+ PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
+
+ // Calculate the SME state size.
+ {
+ auto *TLI = Subtarget->getTargetLowering();
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+ .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .addRegMask(TRI->getCallPreservedMask(
+ *MF, CallingConv::
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
+ .addReg(AArch64::X0);
+ }
+
+  // Allocate a buffer object of the size given by __arm_sme_state_size.
+ {
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
+ .addReg(AArch64::SP)
+ .addReg(BufferSize)
+ .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
+ .addReg(AArch64::SP);
+
+    // We have just allocated a variable-sized object, tell this to PEI.
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+
+ restorePhyRegSave(RegSave, MBB, MBBI, DL);
+}
+
void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
ZAState From, ZAState To,
- LiveRegs PhysLiveRegs) {
+ LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
// ZA not used.
if (From == ZAState::ANY || To == ZAState::ANY)
@@ -652,12 +783,13 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
}
if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
- emitSetupLazySave(MBB, InsertPt);
+ emitZASave(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
- emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs);
+ emitZARestore(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
else if (To == ZAState::OFF) {
assert(From != ZAState::CALLER_DORMANT &&
"CALLER_DORMANT to OFF should have already been handled");
+ assert(!IsAgnosticZA && "Should not turn ZA off in agnostic ZA function");
emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
} else {
dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
@@ -677,7 +809,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
- if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State())
+ if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
+ !SMEFnAttrs.hasAgnosticZAInterface())
return false;
assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
@@ -691,20 +824,27 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
TRI = Subtarget->getRegisterInfo();
MRI = &MF.getRegInfo();
+ bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
+
collectNeededZAStates(SMEFnAttrs);
assignBundleZAStates();
- insertStateChanges();
+ insertStateChanges(/*IsAgnosticZA=*/IsAgnosticZA);
// Allocate save buffer (if needed).
- if (State.TPIDR2Block) {
+ if (State.HasFullZASaveRestore || State.TPIDR2Block) {
if (State.AfterSMEProloguePt) {
// Note: With inline stack probes the AfterSMEProloguePt may not be in the
// entry block (due to the probing loop).
- emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
- *State.AfterSMEProloguePt);
+ emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+ *State.AfterSMEProloguePt,
+ State.PhysLiveRegsAfterSMEPrologue,
+ /*IsAgnosticZA=*/IsAgnosticZA);
} else {
MachineBasicBlock &EntryBlock = MF.front();
- emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+ emitAllocateZASaveBuffer(
+ EntryBlock, EntryBlock.getFirstNonPHI(),
+ State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry,
+ /*IsAgnosticZA=*/IsAgnosticZA);
}
}
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index b31ae68e87ec8..c4b5a3ddabb38 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+sme2 < %s | FileCheck %s
-; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s
+; RUN: llc -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
+; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
target triple = "aarch64"
@@ -9,10 +9,10 @@ declare i64 @agnostic_decl(i64) "aarch64_za_state_agnostic"
; No calls. Test that no buffer is allocated.
define i64 @agnostic_caller_no_callees(ptr %ptr) nounwind "aarch64_za_state_agnostic" {
-; CHECK-LABEL: agnostic_caller_no_callees:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr x0, [x0]
-; CHECK-NEXT: ret
+; CHECK-COMMON-LABEL: agnostic_caller_no_callees:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: ldr x0, [x0]
+; CHECK-COMMON-NEXT: ret
%v = load i64, ptr %ptr
ret i64 %v
}
@@ -51,6 +51,29 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: agnostic_caller_private_za_callee:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x29, sp
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT: mov x19, sp
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov sp, x29
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@@ -60,12 +83,12 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
;
; Should not result in save/restore code.
define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_agnostic" {
-; CHECK-LABEL: agnostic_caller_agnostic_callee:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: bl agnostic_decl
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK-COMMON-LABEL: agnostic_caller_agnostic_callee:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: bl agnostic_decl
+; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ret
%res = call i64 @agnostic_decl(i64 %v)
ret i64 %res
}
@@ -74,12 +97,12 @@ define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_a
;
; Should not result in lazy-save or save of ZT0
define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "aarch64_inout_zt0" {
-; CHECK-LABEL: shared_caller_agnostic_callee:
-; CHECK: // %bb.0:
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: bl agnostic_decl
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK-COMMON-LABEL: shared_caller_agnostic_callee:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: bl agnostic_decl
+; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ret
%res = call i64 @agnostic_decl(i64 %v)
ret i64 %res
}
@@ -126,6 +149,45 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x9, x0
+; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: bl __arm_get_current_vg
+; CHECK-NEWLOWERING-NEXT: str x0, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x0, x9
+; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
+; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT: mov x20, sp
+; CHECK-NEWLOWERING-NEXT: mov x0, x20
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT: smstop sm
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: smstart sm
+; CHECK-NEWLOWERING-NEXT: smstop sm
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: smstart sm
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x0, x20
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
+; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@@ -186,6 +248,59 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x9, x0
+; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: bl __arm_get_current_vg
+; CHECK-NEWLOWERING-NEXT: str x0, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x0, x9
+; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
+; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT: mov x19, sp
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state
+; CHECK-NEWLOWERING-NEXT: mov x20, x0
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_2
+; CHECK-NEWLOWERING-NEXT: // %bb.1:
+; CHECK-NEWLOWERING-NEXT: smstop sm
+; CHECK-NEWLOWERING-NEXT: .LBB5_2:
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4
+; CHECK-NEWLOWERING-NEXT: // %bb.3:
+; CHECK-NEWLOWERING-NEXT: smstart sm
+; CHECK-NEWLOWERING-NEXT: .LBB5_4:
+; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6
+; CHECK-NEWLOWERING-NEXT: // %bb.5:
+; CHECK-NEWLOWERING-NEXT: smstop sm
+; CHECK-NEWLOWERING-NEXT: .LBB5_6:
+; CHECK-NEWLOWERING-NEXT: bl private_za_decl
+; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8
+; CHECK-NEWLOWERING-NEXT: // %bb.7:
+; CHECK-NEWLOWERING-NEXT: smstart sm
+; CHECK-NEWLOWERING-NEXT: .LBB5_8:
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
+; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
ret i64 %res2
@@ -222,6 +337,31 @@ define i64 @test_many_callee_arguments(
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x29, sp
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT: mov x19, sp
+; CHECK-NEWLOWERING-NEXT: ldp x9, x10, [x29, #32]
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT: stp x9, x10, [sp, #-16]!
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: bl many_args_private_za_callee
+; CHECK-NEWLOWERING-NEXT: add sp, sp, #16
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT: mov x0, x8
+; CHECK-NEWLOWERING-NEXT: mov sp, x29
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9
) nounwind "aarch64_za_state_agnostic" {
%ret = call i64 @many_args_private_za_callee(
>From e227409ee3a194a09c826ca1704221c8c0e46dc1 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 20 Aug 2025 13:24:43 +0000
Subject: [PATCH 06/11] Fixup checks
Change-Id: I94018ed55c302de670f7a0b25fd28605d9bed2b6
---
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 28 +++++++-------------
1 file changed, 10 insertions(+), 18 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index c4b5a3ddabb38..d3b7df92095e8 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -152,18 +152,14 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
;
; CHECK-NEWLOWERING-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x9, x0
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: bl __arm_get_current_vg
-; CHECK-NEWLOWERING-NEXT: str x0, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x0, x9
; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
; CHECK-NEWLOWERING-NEXT: mov x20, sp
@@ -181,12 +177,12 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
@@ -251,18 +247,14 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
;
; CHECK-NEWLOWERING-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x9, x0
+; CHECK-NEWLOWERING-NEXT: mov x8, x0
; CHECK-NEWLOWERING-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: bl __arm_get_current_vg
-; CHECK-NEWLOWERING-NEXT: str x0, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x0, x9
; CHECK-NEWLOWERING-NEXT: add x29, sp, #64
-; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: mov x8, x0
+; CHECK-NEWLOWERING-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
; CHECK-NEWLOWERING-NEXT: mov x19, sp
@@ -294,12 +286,12 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
; CHECK-NEWLOWERING-NEXT: mov x0, x8
; CHECK-NEWLOWERING-NEXT: sub sp, x29, #64
-; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
%res = call i64 @private_za_decl(i64 %v)
%res2 = call i64 @private_za_decl(i64 %res)
>From 24f7329d9fde428f9f658f4da29f4391176e3401 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 2 Sep 2025 15:47:44 +0000
Subject: [PATCH 07/11] Fixups
Change-Id: Ib920d9cd4c2baac8dc278e076806504747619d8b
---
.../Target/AArch64/AArch64ISelLowering.cpp | 24 ++++-----
.../AArch64/AArch64MachineFunctionInfo.h | 4 +-
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 53 ++++++++-----------
3 files changed, 35 insertions(+), 46 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4bde2d043c99b..dfe1e3a0dbd15 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9300,21 +9300,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Determine whether we need any streaming mode changes.
SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+
+ std::optional<unsigned> ZAMarkerNode;
bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
- bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
- auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
- if (!UseNewSMEABILowering)
- return std::nullopt;
- if (IsAgnosticZAFunction) {
- if (CallAttrs.requiresPreservingAllZAState())
- return AArch64ISD::REQUIRES_ZA_SAVE;
- return std::nullopt;
- }
- if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
- return std::nullopt;
- return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
- : AArch64ISD::INOUT_ZA_USE;
- }();
+ if (UseNewSMEABILowering) {
+ if (CallAttrs.requiresLazySave() ||
+ CallAttrs.requiresPreservingAllZAState())
+ ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
+ else if (CallAttrs.caller().hasZAState() ||
+ CallAttrs.caller().hasZT0State())
+ ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
+ }
if (IsTailCall) {
// Check if it's really possible to do a tail call.
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 5dca1861a6d51..993cff112ba84 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -261,7 +261,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
EarlyAllocSMESaveBuffer = Ptr;
}
- Register getEarlyAllocSMESaveBuffer() { return EarlyAllocSMESaveBuffer; }
+ Register getEarlyAllocSMESaveBuffer() const {
+ return EarlyAllocSMESaveBuffer;
+ }
// Old SME ABI lowering state getters/setters:
Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 49beefcb75495..58f80abf86155 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -200,7 +200,7 @@ struct MachineSMEABI : public MachineFunctionPass {
/// Inserts code to handle changes between ZA states within the function.
/// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
- void insertStateChanges(bool IsAgnosticZA);
+ void insertStateChanges();
// Emission routines for private and shared ZA functions (using lazy saves).
void emitNewZAPrologue(MachineBasicBlock &MBB,
@@ -227,26 +227,25 @@ struct MachineSMEABI : public MachineFunctionPass {
LiveRegs PhysLiveRegs);
void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- ZAState From, ZAState To, LiveRegs PhysLiveRegs,
- bool IsAgnosticZA);
+ ZAState From, ZAState To, LiveRegs PhysLiveRegs);
// Helpers for switching between lazy/full ZA save/restore routines.
void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
- if (IsAgnosticZA)
+ LiveRegs PhysLiveRegs) {
+ if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
return emitSetupLazySave(MBB, MBBI);
}
void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
- if (IsAgnosticZA)
+ LiveRegs PhysLiveRegs) {
+ if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
}
void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
- if (IsAgnosticZA)
+ LiveRegs PhysLiveRegs) {
+ if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
return emitAllocateLazySaveBuffer(MBB, MBBI);
}
@@ -288,13 +287,13 @@ struct MachineSMEABI : public MachineFunctionPass {
std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
Register AgnosticZABufferPtr = AArch64::NoRegister;
LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
- bool HasFullZASaveRestore = false;
} State;
MachineFunction *MF = nullptr;
EdgeBundles *Bundles = nullptr;
const AArch64Subtarget *Subtarget = nullptr;
const AArch64RegisterInfo *TRI = nullptr;
+ const AArch64FunctionInfo *AFI = nullptr;
const TargetInstrInfo *TII = nullptr;
MachineRegisterInfo *MRI = nullptr;
};
@@ -425,7 +424,7 @@ void MachineSMEABI::assignBundleZAStates() {
}
}
-void MachineSMEABI::insertStateChanges(bool IsAgnosticZA) {
+void MachineSMEABI::insertStateChanges() {
for (MachineBasicBlock &MBB : *MF) {
const BlockInfo &Block = State.Blocks[MBB.getNumber()];
ZAState InState = State.BundleStates[Bundles->getBundle(MBB.getNumber(),
@@ -438,7 +437,7 @@ void MachineSMEABI::insertStateChanges(bool IsAgnosticZA) {
for (auto &Inst : Block.Insts) {
if (CurrentState != Inst.NeededState)
emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
- Inst.PhysLiveRegs, IsAgnosticZA);
+ Inst.PhysLiveRegs);
CurrentState = Inst.NeededState;
}
@@ -449,7 +448,7 @@ void MachineSMEABI::insertStateChanges(bool IsAgnosticZA) {
State.BundleStates[Bundles->getBundle(MBB.getNumber(), /*Out=*/true)];
if (CurrentState != OutState)
emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
- Block.PhysLiveRegsAtExit, IsAgnosticZA);
+ Block.PhysLiveRegsAtExit);
}
}
@@ -582,8 +581,6 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
void MachineSMEABI::emitAllocateLazySaveBuffer(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
MachineFrameInfo &MFI = MF->getFrameInfo();
- auto *AFI = MF->getInfo<AArch64FunctionInfo>();
-
DebugLoc DL = getDebugLoc(MBB, MBBI);
Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
@@ -680,7 +677,6 @@ void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs, bool IsSave) {
auto *TLI = Subtarget->getTargetLowering();
- State.HasFullZASaveRestore = true;
DebugLoc DL = getDebugLoc(MBB, MBBI);
Register BufferPtr = AArch64::X0;
@@ -705,8 +701,6 @@ void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
void MachineSMEABI::emitAllocateFullZASaveBuffer(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs) {
- auto *AFI = MF->getInfo<AArch64FunctionInfo>();
-
// Buffer already allocated in SelectionDAG.
if (AFI->getEarlyAllocSMESaveBuffer())
return;
@@ -751,7 +745,7 @@ void MachineSMEABI::emitAllocateFullZASaveBuffer(
void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertPt,
ZAState From, ZAState To,
- LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+ LiveRegs PhysLiveRegs) {
// ZA not used.
if (From == ZAState::ANY || To == ZAState::ANY)
@@ -783,13 +777,14 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
}
if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
- emitZASave(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
+ emitZASave(MBB, InsertPt, PhysLiveRegs);
else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
- emitZARestore(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
+ emitZARestore(MBB, InsertPt, PhysLiveRegs);
else if (To == ZAState::OFF) {
assert(From != ZAState::CALLER_DORMANT &&
"CALLER_DORMANT to OFF should have already been handled");
- assert(!IsAgnosticZA && "Should not turn ZA off in agnostic ZA function");
+ assert(!AFI->getSMEFnAttrs().hasAgnosticZAInterface() &&
+ "Should not turn ZA off in agnostic ZA function");
emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
} else {
dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
@@ -807,7 +802,7 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
return false;
- auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+ AFI = MF.getInfo<AArch64FunctionInfo>();
SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
!SMEFnAttrs.hasAgnosticZAInterface())
@@ -824,27 +819,23 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
TRI = Subtarget->getRegisterInfo();
MRI = &MF.getRegInfo();
- bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
-
collectNeededZAStates(SMEFnAttrs);
assignBundleZAStates();
- insertStateChanges(/*IsAgnosticZA=*/IsAgnosticZA);
+ insertStateChanges();
// Allocate save buffer (if needed).
- if (State.HasFullZASaveRestore || State.TPIDR2Block) {
+ if (State.AgnosticZABufferPtr != AArch64::NoRegister || State.TPIDR2Block) {
if (State.AfterSMEProloguePt) {
// Note: With inline stack probes the AfterSMEProloguePt may not be in the
// entry block (due to the probing loop).
emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
*State.AfterSMEProloguePt,
- State.PhysLiveRegsAfterSMEPrologue,
- /*IsAgnosticZA=*/IsAgnosticZA);
+ State.PhysLiveRegsAfterSMEPrologue);
} else {
MachineBasicBlock &EntryBlock = MF.front();
emitAllocateZASaveBuffer(
EntryBlock, EntryBlock.getFirstNonPHI(),
- State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry,
- /*IsAgnosticZA=*/IsAgnosticZA);
+ State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry);
}
}
>From 08faee4c376aad9691a218b170256ac729c1e7ab Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 3 Sep 2025 13:25:02 +0000
Subject: [PATCH 08/11] Add comment
Change-Id: Ia4b65e49c9007ae6b13a10ce2ea8ee1411036dc0
---
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 58f80abf86155..f628f7bbb1bb3 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -219,6 +219,9 @@ struct MachineSMEABI : public MachineFunctionPass {
void emitSetupFullZASave(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs);
+ // Emit a "full" ZA save or restore. It is "full" in the sense that this
+ // function will emit a call to __arm_sme_save or __arm_sme_restore, which
+ // handles saving and restoring both ZA and ZT0.
void emitFullZASaveRestore(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
LiveRegs PhysLiveRegs, bool IsSave);
>From 255394737411e35f1d33c8bff8f5ac870606df7b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 3 Sep 2025 13:28:03 +0000
Subject: [PATCH 09/11] Fixups
Change-Id: I53a80b98290e4288aa22656811b85c1d8a3bddc9
---
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index f628f7bbb1bb3..e460e7a1e774f 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -647,8 +647,7 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
.addImm(AArch64SysReg::TPIDR2_EL0);
// If TPIDR2_EL0 is non-zero, commit the lazy save.
// NOTE: Functions that only use ZT0 don't need to zero ZA.
- bool ZeroZA =
- MF->getInfo<AArch64FunctionInfo>()->getSMEFnAttrs().hasZAState();
+ bool ZeroZA = AFI->getSMEFnAttrs().hasZAState();
auto CommitZASave =
BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
.addReg(TPIDR2EL0)
@@ -666,8 +665,7 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
Register MachineSMEABI::getAgnosticZABufferPtr() {
if (State.AgnosticZABufferPtr != AArch64::NoRegister)
return State.AgnosticZABufferPtr;
- if (auto BufferPtr =
- MF->getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
+ if (auto BufferPtr = AFI->getEarlyAllocSMESaveBuffer();
BufferPtr != AArch64::NoRegister)
State.AgnosticZABufferPtr = BufferPtr;
else
@@ -762,10 +760,7 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
// TODO: Avoid setting up the save buffer if there's no transition to
// LOCAL_SAVED.
if (From == ZAState::CALLER_DORMANT) {
- assert(MBB.getParent()
- ->getInfo<AArch64FunctionInfo>()
- ->getSMEFnAttrs()
- .hasPrivateZAInterface() &&
+ assert(AFI->getSMEFnAttrs().hasPrivateZAInterface() &&
"CALLER_DORMANT state requires private ZA interface");
assert(&MBB == &MBB.getParent()->front() &&
"CALLER_DORMANT state only valid in entry block");
>From 846aba4c6ae5b697daf2d883c550de78e2bc7255 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 5 Sep 2025 13:10:17 +0000
Subject: [PATCH 10/11] Add support for probed agnostic ZA allocas
Change-Id: I868b20fc09b7c971edf778924ac56a065df19772
---
.../Target/AArch64/AArch64ISelLowering.cpp | 11 +++-
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 66 +++++++++++++++++++
2 files changed, 76 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dfe1e3a0dbd15..5e11145ecd161 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8496,6 +8496,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));
Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+ } else if (Attrs.hasAgnosticZAInterface()) {
+ RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
+ SDValue Callee = DAG.getExternalSymbol(
+ getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+ auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ getLibcallCallingConv(LC), RetTy, Callee, {});
+ std::tie(Size, Chain) = LowerCallTo(CLI);
}
if (Size) {
SDValue Buffer = DAG.getNode(
@@ -8561,7 +8570,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
Register BufferPtr =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
FuncInfo->setSMESaveBufferAddr(BufferPtr);
- Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+ Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
}
}
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index d3b7df92095e8..0fda40ee0e6f8 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -5,6 +5,7 @@
target triple = "aarch64"
declare i64 @private_za_decl(i64)
+declare void @private_za()
declare i64 @agnostic_decl(i64) "aarch64_za_state_agnostic"
; No calls. Test that no buffer is allocated.
@@ -360,3 +361,68 @@ define i64 @test_many_callee_arguments(
i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9)
ret i64 %ret
}
+
+; FIXME: The new lowering should avoid saves/restores in the probing loop.
+define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{
+; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: bl __arm_sme_state_size
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: sub x19, x8, x0
+; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: cmp sp, x19
+; CHECK-NEXT: b.le .LBB7_3
+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB7_1
+; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: mov sp, x19
+; CHECK-NEXT: ldr xzr, [sp]
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_save
+; CHECK-NEXT: bl private_za
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_restore
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
+; CHECK-NEWLOWERING: // %bb.0:
+; CHECK-NEWLOWERING-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: mov x29, sp
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT: mov x8, sp
+; CHECK-NEWLOWERING-NEXT: sub x19, x8, x0
+; CHECK-NEWLOWERING-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEWLOWERING-NEXT: cmp sp, x19
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: mrs x8, NZCV
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT: msr NZCV, x8
+; CHECK-NEWLOWERING-NEXT: b.le .LBB7_3
+; CHECK-NEWLOWERING-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: str xzr, [sp]
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT: b .LBB7_1
+; CHECK-NEWLOWERING-NEXT: .LBB7_3:
+; CHECK-NEWLOWERING-NEXT: mov sp, x19
+; CHECK-NEWLOWERING-NEXT: ldr xzr, [sp]
+; CHECK-NEWLOWERING-NEXT: bl private_za
+; CHECK-NEWLOWERING-NEXT: mov x0, x19
+; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT: mov sp, x29
+; CHECK-NEWLOWERING-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: ret
+ call void @private_za()
+ ret void
+}
>From 4242cb3f06669ec7265d1101bb5a7e61d58ffd1d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 9 Sep 2025 09:55:36 +0000
Subject: [PATCH 11/11] Update checks after rebase
Change-Id: I474c67d0cd84711214491a260d16fc697e9294a3
---
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 0fda40ee0e6f8..a0a14f2ffae3f 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -259,8 +259,7 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state_size
; CHECK-NEWLOWERING-NEXT: sub sp, sp, x0
; CHECK-NEWLOWERING-NEXT: mov x19, sp
-; CHECK-NEWLOWERING-NEXT: bl __arm_sme_state
-; CHECK-NEWLOWERING-NEXT: mov x20, x0
+; CHECK-NEWLOWERING-NEXT: mrs x20, SVCR
; CHECK-NEWLOWERING-NEXT: mov x0, x19
; CHECK-NEWLOWERING-NEXT: bl __arm_sme_save
; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_2
More information about the llvm-commits
mailing list