[llvm-branch-commits] [llvm] release/22.x: [AArch64][SME] Limit where SME ABI optimizations apply (#179273) (PR #179473)
Benjamin Maxwell via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Feb 3 06:25:06 PST 2026
https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/179473
This cherry-picks 79eb804. These optimizations were added shortly before the branch, however, we're concerned they're not quite ready for production use. This commit limits the optimizations to the simplest cases.
>From 987494d9ba1f881a477667fd4a5154b6704c4b19 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 3 Feb 2026 14:21:20 +0000
Subject: [PATCH] [AArch64][SME] Limit where SME ABI optimizations apply
(#179273)
These were added recently with a fairly complex propagation step,
however, these optimizations can cause regressions in some cases.
This patch limits the cross-block optimizations to the simple case of
picking a state that matches all incoming blocks. If any block doesn't
match, we fall back to using "ACTIVE", the default state.
---
llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 168 ++----------------
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 82 +++------
.../CodeGen/AArch64/sme-new-za-function.ll | 21 +--
.../CodeGen/AArch64/sme-za-control-flow.ll | 26 +--
.../test/CodeGen/AArch64/sme-za-exceptions.ll | 69 +++++--
.../AArch64/sme-za-lazy-save-buffer.ll | 141 +++++----------
6 files changed, 162 insertions(+), 345 deletions(-)
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 823c754a0ac05..9b96bed823817 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -201,23 +201,6 @@ class EmitContext {
Register AgnosticZABufferPtr = AArch64::NoRegister;
};
-/// Checks if \p State is a legal edge bundle state. For a state to be a legal
-/// bundle state, it must be possible to transition from it to any other bundle
-/// state without losing any ZA state. This is the case for ACTIVE/LOCAL_SAVED,
-/// as you can transition between those states by saving/restoring ZA. The OFF
-/// state would not be legal, as transitioning to it drops the content of ZA.
-static bool isLegalEdgeBundleZAState(ZAState State) {
- switch (State) {
- case ZAState::ACTIVE: // ZA state within the accumulator/ZT0.
- case ZAState::ACTIVE_ZT0_SAVED: // ZT0 is saved (ZA is active).
- case ZAState::LOCAL_SAVED: // ZA state may be saved on the stack.
- case ZAState::LOCAL_COMMITTED: // ZA state is saved on the stack.
- return true;
- default:
- return false;
- }
-}
-
StringRef getZAStateString(ZAState State) {
#define MAKE_CASE(V) \
case V: \
@@ -325,11 +308,6 @@ struct MachineSMEABI : public MachineFunctionPass {
const EdgeBundles &Bundles,
ArrayRef<ZAState> BundleStates);
- /// Propagates desired states forwards (from predecessors -> successors) if
- /// \p Forwards, otherwise, propagates backwards (from successors ->
- /// predecessors).
- void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true);
-
void emitZT0SaveRestore(EmitContext &, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, bool IsSave);
@@ -526,110 +504,36 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
PhysLiveRegsAfterSMEPrologue};
}
-void MachineSMEABI::propagateDesiredStates(FunctionInfo &FnInfo,
- bool Forwards) {
- // If `Forwards`, this propagates desired states from predecessors to
- // successors, otherwise, this propagates states from successors to
- // predecessors.
- auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & {
- return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState;
- };
-
- SmallVector<MachineBasicBlock *> Worklist;
- for (auto [BlockID, BlockInfo] : enumerate(FnInfo.Blocks)) {
- if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards)))
- Worklist.push_back(MF->getBlockNumbered(BlockID));
- }
-
- while (!Worklist.empty()) {
- MachineBasicBlock *MBB = Worklist.pop_back_val();
- BlockInfo &Block = FnInfo.Blocks[MBB->getNumber()];
-
- // Pick a legal edge bundle state that matches the majority of
- // predecessors/successors.
- int StateCounts[ZAState::NUM_ZA_STATE] = {0};
- for (MachineBasicBlock *PredOrSucc :
- Forwards ? predecessors(MBB) : successors(MBB)) {
- BlockInfo &PredOrSuccBlock = FnInfo.Blocks[PredOrSucc->getNumber()];
- ZAState ZAState = GetBlockState(PredOrSuccBlock, !Forwards);
- if (isLegalEdgeBundleZAState(ZAState))
- StateCounts[ZAState]++;
- }
-
- ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts);
- ZAState &CurrentState = GetBlockState(Block, Forwards);
- if (PropagatedState != CurrentState) {
- CurrentState = PropagatedState;
- ZAState &OtherState = GetBlockState(Block, !Forwards);
- // Propagate to the incoming/outgoing state if that is also "ANY".
- if (OtherState == ZAState::ANY)
- OtherState = PropagatedState;
- // Push any successors/predecessors that may need updating to the
- // worklist.
- for (MachineBasicBlock *SuccOrPred :
- Forwards ? successors(MBB) : predecessors(MBB)) {
- BlockInfo &SuccOrPredBlock = FnInfo.Blocks[SuccOrPred->getNumber()];
- if (!isLegalEdgeBundleZAState(GetBlockState(SuccOrPredBlock, Forwards)))
- Worklist.push_back(SuccOrPred);
- }
- }
- }
-}
-
/// Assigns each edge bundle a ZA state based on the needed states of blocks
-/// that have incoming or outgoing edges in that bundle.
+/// that have incoming or outgoing edges in that bundle.
SmallVector<ZAState>
MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
const FunctionInfo &FnInfo) {
SmallVector<ZAState> BundleStates(Bundles.getNumBundles());
for (unsigned I = 0, E = Bundles.getNumBundles(); I != E; ++I) {
- LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');
-
- // Attempt to assign a ZA state for this bundle that minimizes state
- // transitions. Edges within loops are given a higher weight as we assume
- // they will be executed more than once.
- int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
+ std::optional<ZAState> BundleState;
for (unsigned BlockID : Bundles.getBlocks(I)) {
- LLVM_DEBUG(dbgs() << "- bb." << BlockID);
-
const BlockInfo &Block = FnInfo.Blocks[BlockID];
- bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I;
- bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I;
-
- bool LegalInEdge =
- InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
- bool LegalOutEgde =
- OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
- if (LegalInEdge) {
- LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
- << getZAStateString(Block.DesiredIncomingState));
- EdgeStateCounts[Block.DesiredIncomingState]++;
- }
- if (LegalOutEgde) {
- LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
- << getZAStateString(Block.DesiredOutgoingState));
- EdgeStateCounts[Block.DesiredOutgoingState]++;
- }
- if (!LegalInEdge && !LegalOutEgde)
- LLVM_DEBUG(dbgs() << " (no state preference)");
- LLVM_DEBUG(dbgs() << '\n');
+      // Check if the block is an incoming block in the bundle. Note: We skip
+      // blocks where Block.FixedEntryState != ANY to ignore EH pads (which
+      // are only reachable via exceptions).
+ if (Block.FixedEntryState != ZAState::ANY ||
+ Bundles.getBundle(BlockID, /*Out=*/false) != I)
+ continue;
+
+      // Pick a state that matches all incoming blocks. Fall back to "ACTIVE"
+      // if any block doesn't match. This will hoist the state from incoming
+      // blocks to outgoing blocks.
+ if (!BundleState)
+ BundleState = Block.DesiredIncomingState;
+ else if (BundleState != Block.DesiredIncomingState)
+ BundleState = ZAState::ACTIVE;
}
- ZAState BundleState =
- ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
-
- if (BundleState == ZAState::ANY)
+ if (!BundleState || BundleState == ZAState::ANY)
BundleState = ZAState::ACTIVE;
- LLVM_DEBUG({
- dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n'
- << "Edge counts:";
- for (auto [State, Count] : enumerate(EdgeStateCounts))
- dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count;
- dbgs() << "\n\n";
- });
-
- BundleStates[I] = BundleState;
+ BundleStates[I] = *BundleState;
}
return BundleStates;
@@ -1268,42 +1172,6 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs);
- if (OptLevel != CodeGenOptLevel::None) {
- // Propagate desired states forward, then backwards. Most of the propagation
- // should be done in the forward step, and backwards propagation is then
- // used to fill in the gaps. Note: Doing both in one step can give poor
- // results. For example, consider this subgraph:
- //
- // ┌─────┐
- // ┌─┤ BB0 ◄───┐
- // │ └─┬───┘ │
- // │ ┌─▼───◄──┐│
- // │ │ BB1 │ ││
- // │ └─┬┬──┘ ││
- // │ │└─────┘│
- // │ ┌─▼───┐ │
- // │ │ BB2 ├───┘
- // │ └─┬───┘
- // │ ┌─▼───┐
- // └─► BB3 │
- // └─────┘
- //
- // If:
- // - "BB0" and "BB2" (outer loop) has no state preference
- // - "BB1" (inner loop) desires the ACTIVE state on entry/exit
- // - "BB3" desires the LOCAL_SAVED state on entry
- //
- // If we propagate forwards first, ACTIVE is propagated from BB1 to BB2,
- // then from BB2 to BB0. Which results in the inner and outer loops having
- // the "ACTIVE" state. This avoids any state changes in the loops.
- //
- // If we propagate backwards first, we _could_ propagate LOCAL_SAVED from
- // BB3 to BB0, which would result in a transition from ACTIVE -> LOCAL_SAVED
- // in the outer loop.
- for (bool Forwards : {true, false})
- propagateDesiredStates(FnInfo, Forwards);
- }
-
SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo);
EmitContext Context;
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 344f1ef24b843..4a18b9f61d69f 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -352,61 +352,33 @@ define i64 @test_many_callee_arguments(
}
define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{
-; CHECK-SDAG-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
-; CHECK-SDAG: // %bb.0:
-; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-SDAG-NEXT: mov x29, sp
-; CHECK-SDAG-NEXT: bl __arm_sme_state_size
-; CHECK-SDAG-NEXT: mov x8, sp
-; CHECK-SDAG-NEXT: sub x19, x8, x0
-; CHECK-SDAG-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536
-; CHECK-SDAG-NEXT: cmp sp, x19
-; CHECK-SDAG-NEXT: b.le .LBB7_3
-; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-SDAG-NEXT: str xzr, [sp]
-; CHECK-SDAG-NEXT: b .LBB7_1
-; CHECK-SDAG-NEXT: .LBB7_3:
-; CHECK-SDAG-NEXT: mov sp, x19
-; CHECK-SDAG-NEXT: ldr xzr, [sp]
-; CHECK-SDAG-NEXT: mov x0, x19
-; CHECK-SDAG-NEXT: bl __arm_sme_save
-; CHECK-SDAG-NEXT: bl private_za
-; CHECK-SDAG-NEXT: mov x0, x19
-; CHECK-SDAG-NEXT: bl __arm_sme_restore
-; CHECK-SDAG-NEXT: mov sp, x29
-; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-SDAG-NEXT: ret
-;
-; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: bl __arm_sme_state_size
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: sub x19, x8, x0
-; CHECK-NEXT: mov x0, x19
-; CHECK-NEXT: bl __arm_sme_save
-; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
-; CHECK-NEXT: cmp sp, x19
-; CHECK-NEXT: b.le .LBB7_3
-; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-NEXT: str xzr, [sp]
-; CHECK-NEXT: b .LBB7_1
-; CHECK-NEXT: .LBB7_3:
-; CHECK-NEXT: mov sp, x19
-; CHECK-NEXT: ldr xzr, [sp]
-; CHECK-NEXT: bl private_za
-; CHECK-NEXT: mov x0, x19
-; CHECK-NEXT: bl __arm_sme_restore
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
-; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK-COMMON-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
+; CHECK-COMMON: // %bb.0:
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-COMMON-NEXT: mov x29, sp
+; CHECK-COMMON-NEXT: bl __arm_sme_state_size
+; CHECK-COMMON-NEXT: mov x8, sp
+; CHECK-COMMON-NEXT: sub x19, x8, x0
+; CHECK-COMMON-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-COMMON-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-COMMON-NEXT: cmp sp, x19
+; CHECK-COMMON-NEXT: b.le .LBB7_3
+; CHECK-COMMON-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-COMMON-NEXT: str xzr, [sp]
+; CHECK-COMMON-NEXT: b .LBB7_1
+; CHECK-COMMON-NEXT: .LBB7_3:
+; CHECK-COMMON-NEXT: mov sp, x19
+; CHECK-COMMON-NEXT: ldr xzr, [sp]
+; CHECK-COMMON-NEXT: mov x0, x19
+; CHECK-COMMON-NEXT: bl __arm_sme_save
+; CHECK-COMMON-NEXT: bl private_za
+; CHECK-COMMON-NEXT: mov x0, x19
+; CHECK-COMMON-NEXT: bl __arm_sme_restore
+; CHECK-COMMON-NEXT: mov sp, x29
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ret
call void @private_za()
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
index d2715b58439d8..6995cfae8e459 100644
--- a/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
+++ b/llvm/test/CodeGen/AArch64/sme-new-za-function.ll
@@ -51,7 +51,6 @@ define void @private_za() "aarch64_new_za" {
}
; Note: This test must run at -O0 as otherwise the multiple exits are optimized out.
-; TODO: We should be able to omit the ZA save here (as this function does not use ZA).
define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za" {
; CHECK-SDAG-LABEL: private_za_multiple_exit:
; CHECK-SDAG: // %bb.0: // %prelude
@@ -99,33 +98,21 @@ define i32 @private_za_multiple_exit(i32 %a, i32 %b, i64 %cond) "aarch64_new_za"
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-NEXT: cbnz x8, .LBB1_1
-; CHECK-NEXT: b .LBB1_2
-; CHECK-NEXT: .LBB1_1: // %entry
-; CHECK-NEXT: bl __arm_tpidr2_save
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: zero {za}
-; CHECK-NEXT: b .LBB1_2
-; CHECK-NEXT: .LBB1_2: // %entry
-; CHECK-NEXT: smstart za
; CHECK-NEXT: str w1, [sp, #8] // 4-byte Spill
; CHECK-NEXT: str w0, [sp, #12] // 4-byte Spill
; CHECK-NEXT: subs x8, x2, #1
-; CHECK-NEXT: b.ne .LBB1_4
-; CHECK-NEXT: b .LBB1_3
-; CHECK-NEXT: .LBB1_3: // %if.else
+; CHECK-NEXT: b.ne .LBB1_2
+; CHECK-NEXT: b .LBB1_1
+; CHECK-NEXT: .LBB1_1: // %if.else
; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Reload
; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Reload
; CHECK-NEXT: add w0, w8, w9
-; CHECK-NEXT: smstop za
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
-; CHECK-NEXT: .LBB1_4: // %if.end
+; CHECK-NEXT: .LBB1_2: // %if.end
; CHECK-NEXT: ldr w8, [sp, #12] // 4-byte Reload
; CHECK-NEXT: ldr w9, [sp, #8] // 4-byte Reload
; CHECK-NEXT: subs w0, w8, w9
-; CHECK-NEXT: smstop za
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index 50449172ce85b..aae1d3b756f4e 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -49,36 +49,40 @@ define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind {
; CHECK-LABEL: private_za_loop:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill
+; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
; CHECK-NEXT: cmp w0, #1
; CHECK-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
-; CHECK-NEXT: b.lt .LBB0_3
+; CHECK-NEXT: b.lt .LBB0_5
; CHECK-NEXT: // %bb.1: // %loop.preheader
; CHECK-NEXT: mov w19, w0
+; CHECK-NEXT: sub x20, x29, #16
+; CHECK-NEXT: b .LBB0_3
; CHECK-NEXT: .LBB0_2: // %loop
+; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: cbz w19, .LBB0_5
+; CHECK-NEXT: .LBB0_3: // %loop
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_call
-; CHECK-NEXT: subs w19, w19, #1
-; CHECK-NEXT: b.ne .LBB0_2
-; CHECK-NEXT: .LBB0_3: // %exit
+; CHECK-NEXT: sub w19, w19, #1
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB0_5
-; CHECK-NEXT: // %bb.4: // %exit
+; CHECK-NEXT: cbnz x8, .LBB0_2
+; CHECK-NEXT: // %bb.4: // %loop
+; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_5: // %exit
-; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index 5243b8d7203d8..19ea1e47f84ff 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -63,17 +63,25 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe
; CHECK-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr]
; CHECK-NEXT: bl __cxa_throw
; CHECK-NEXT: .Ltmp1: // EH_LABEL
-; CHECK-NEXT: // %bb.3: // %throw_fail
-; CHECK-NEXT: .LBB0_4: // %unwind_dtors
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB0_4
+; CHECK-NEXT: // %bb.3: // %throw_exception
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB0_4: // %throw_exception
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: // %bb.5: // %throw_fail
+; CHECK-NEXT: .LBB0_6: // %unwind_dtors
; CHECK-NEXT: .Ltmp2: // EH_LABEL
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB0_6
-; CHECK-NEXT: // %bb.5: // %unwind_dtors
+; CHECK-NEXT: cbnz x8, .LBB0_8
+; CHECK-NEXT: // %bb.7: // %unwind_dtors
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB0_6: // %unwind_dtors
+; CHECK-NEXT: .LBB0_8: // %unwind_dtors
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: bl shared_za_call
; CHECK-NEXT: sub x8, x29, #16
@@ -224,15 +232,15 @@ define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl may_throw
; CHECK-NEXT: .Ltmp4: // EH_LABEL
-; CHECK-NEXT: .LBB1_1: // %after_catch
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB1_3
-; CHECK-NEXT: // %bb.2: // %after_catch
+; CHECK-NEXT: cbnz x8, .LBB1_2
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB1_3: // %after_catch
+; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB1_3: // %after_catch
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: b shared_za_call
@@ -251,7 +259,15 @@ define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v
; CHECK-NEXT: sub x8, x29, #16
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl __cxa_end_catch
-; CHECK-NEXT: b .LBB1_1
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB1_8
+; CHECK-NEXT: // %bb.7: // %catch
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB1_8: // %catch
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: b .LBB1_3
;
; CHECK-SDAG-LABEL: try_catch:
; CHECK-SDAG: .Lfunc_begin1:
@@ -387,8 +403,8 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx
; CHECK-NEXT: .Ltmp6: // EH_LABEL
; CHECK-NEXT: bl shared_za_call
; CHECK-NEXT: .Ltmp7: // EH_LABEL
-; CHECK-NEXT: .LBB2_3: // %exit
; CHECK-NEXT: smstop za
+; CHECK-NEXT: .LBB2_3: // %exit
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
@@ -408,6 +424,7 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl __cxa_end_catch
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: smstop za
; CHECK-NEXT: b .LBB2_3
;
; CHECK-SDAG-LABEL: try_catch_shared_za_callee:
@@ -636,9 +653,9 @@ define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr
; CHECK-NEXT: bl __arm_sme_save
; CHECK-NEXT: bl may_throw
; CHECK-NEXT: .Ltmp13: // EH_LABEL
-; CHECK-NEXT: .LBB4_1: // %exit
; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: bl __arm_sme_restore
+; CHECK-NEXT: .LBB4_1: // %exit
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -647,6 +664,8 @@ define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr
; CHECK-NEXT: .Ltmp14: // EH_LABEL
; CHECK-NEXT: bl __cxa_begin_catch
; CHECK-NEXT: bl __cxa_end_catch
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_restore
; CHECK-NEXT: b .LBB4_1
;
; CHECK-SDAG-LABEL: try_catch_agnostic_za:
@@ -746,9 +765,9 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal
; CHECK-NEXT: bl __arm_sme_save
; CHECK-NEXT: bl agnostic_za_call
; CHECK-NEXT: .Ltmp16: // EH_LABEL
-; CHECK-NEXT: .LBB5_1: // %exit
; CHECK-NEXT: mov x0, x19
; CHECK-NEXT: bl __arm_sme_restore
+; CHECK-NEXT: .LBB5_1: // %exit
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
@@ -757,6 +776,8 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal
; CHECK-NEXT: .Ltmp17: // EH_LABEL
; CHECK-NEXT: bl __cxa_begin_catch
; CHECK-NEXT: bl __cxa_end_catch
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl __arm_sme_restore
; CHECK-NEXT: b .LBB5_1
;
; CHECK-SDAG-LABEL: try_catch_agnostic_za_invoke:
@@ -845,15 +866,15 @@ define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personal
; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl agnostic_za_call
; CHECK-NEXT: .Ltmp19: // EH_LABEL
-; CHECK-NEXT: .LBB6_1: // %exit
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB6_3
-; CHECK-NEXT: // %bb.2: // %exit
+; CHECK-NEXT: cbnz x8, .LBB6_2
+; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB6_3: // %exit
+; CHECK-NEXT: .LBB6_2: // %entry
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB6_3: // %exit
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
@@ -861,7 +882,15 @@ define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personal
; CHECK-NEXT: .Ltmp20: // EH_LABEL
; CHECK-NEXT: bl __cxa_begin_catch
; CHECK-NEXT: bl __cxa_end_catch
-; CHECK-NEXT: b .LBB6_1
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB6_6
+; CHECK-NEXT: // %bb.5: // %catch
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB6_6: // %catch
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: b .LBB6_3
;
; CHECK-SDAG-LABEL: try_catch_inout_za_agnostic_za_callee:
; CHECK-SDAG: .Lfunc_begin6:
@@ -967,9 +996,9 @@ define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_pe
; CHECK-NEXT: smstop za
; CHECK-NEXT: bl may_throw
; CHECK-NEXT: .Ltmp22: // EH_LABEL
-; CHECK-NEXT: .LBB7_1: // %exit
; CHECK-NEXT: smstart za
; CHECK-NEXT: ldr zt0, [x19]
+; CHECK-NEXT: .LBB7_1: // %exit
; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
@@ -977,6 +1006,8 @@ define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_pe
; CHECK-NEXT: .Ltmp23: // EH_LABEL
; CHECK-NEXT: bl __cxa_begin_catch
; CHECK-NEXT: bl __cxa_end_catch
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: ldr zt0, [x19]
; CHECK-NEXT: b .LBB7_1
;
; CHECK-SDAG-LABEL: try_catch_inout_zt0:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index d4840f77c5392..f5c11146a7ca6 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -1,52 +1,52 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-SDAG
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi=false < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
define i32 @no_tpidr2_save_required() "aarch64_inout_za" {
-; CHECK-COMMON-LABEL: no_tpidr2_save_required:
-; CHECK-COMMON: // %bb.0: // %entry
-; CHECK-COMMON-NEXT: mov w0, #42 // =0x2a
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: no_tpidr2_save_required:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w0, #42 // =0x2a
+; CHECK-NEXT: ret
entry:
ret i32 42
}
define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" {
-; CHECK-COMMON-LABEL: multi_bb_stpidr2_save_required:
-; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: mov x29, sp
-; CHECK-COMMON-NEXT: sub sp, sp, #16
-; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16
-; CHECK-COMMON-NEXT: .cfi_offset w30, -8
-; CHECK-COMMON-NEXT: .cfi_offset w29, -16
-; CHECK-COMMON-NEXT: rdsvl x8, #1
-; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT: mov sp, x9
-; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-COMMON-NEXT: cbz w0, .LBB1_2
-; CHECK-COMMON-NEXT: // %bb.1: // %use_b
-; CHECK-COMMON-NEXT: fmov s1, #4.00000000
-; CHECK-COMMON-NEXT: fadd s0, s0, s1
-; CHECK-COMMON-NEXT: b .LBB1_5
-; CHECK-COMMON-NEXT: .LBB1_2: // %use_c
-; CHECK-COMMON-NEXT: fmov s0, s1
-; CHECK-COMMON-NEXT: sub x8, x29, #16
-; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
-; CHECK-COMMON-NEXT: bl cosf
-; CHECK-COMMON-NEXT: smstart za
-; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-COMMON-NEXT: sub x0, x29, #16
-; CHECK-COMMON-NEXT: cbnz x8, .LBB1_4
-; CHECK-COMMON-NEXT: // %bb.3: // %use_c
-; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore
-; CHECK-COMMON-NEXT: .LBB1_4: // %use_c
-; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-COMMON-NEXT: .LBB1_5: // %exit
-; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: ret
+; CHECK-LABEL: multi_bb_stpidr2_save_required:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x9, x8, x8, x9
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: stp x9, x8, [x29, #-16]
+; CHECK-NEXT: cbz w0, .LBB1_2
+; CHECK-NEXT: // %bb.1: // %use_b
+; CHECK-NEXT: fmov s1, #4.00000000
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: b .LBB1_5
+; CHECK-NEXT: .LBB1_2: // %use_c
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
+; CHECK-NEXT: bl cosf
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB1_4
+; CHECK-NEXT: // %bb.3: // %use_c
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB1_4: // %use_c
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB1_5: // %exit
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%cmp = icmp ne i32 %a, 0
br i1 %cmp, label %use_b, label %use_c
@@ -64,51 +64,6 @@ exit:
}
define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
-; CHECK-SDAG-LABEL: multi_bb_stpidr2_save_required_stackprobe:
-; CHECK-SDAG: // %bb.0:
-; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-SDAG-NEXT: mov x29, sp
-; CHECK-SDAG-NEXT: str xzr, [sp, #-16]!
-; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 16
-; CHECK-SDAG-NEXT: .cfi_offset w30, -8
-; CHECK-SDAG-NEXT: .cfi_offset w29, -16
-; CHECK-SDAG-NEXT: rdsvl x8, #1
-; CHECK-SDAG-NEXT: mov x9, sp
-; CHECK-SDAG-NEXT: msub x9, x8, x8, x9
-; CHECK-SDAG-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
-; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536
-; CHECK-SDAG-NEXT: cmp sp, x9
-; CHECK-SDAG-NEXT: b.le .LBB2_3
-; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
-; CHECK-SDAG-NEXT: str xzr, [sp]
-; CHECK-SDAG-NEXT: b .LBB2_1
-; CHECK-SDAG-NEXT: .LBB2_3:
-; CHECK-SDAG-NEXT: mov sp, x9
-; CHECK-SDAG-NEXT: ldr xzr, [sp]
-; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16]
-; CHECK-SDAG-NEXT: cbz w0, .LBB2_5
-; CHECK-SDAG-NEXT: // %bb.4: // %use_b
-; CHECK-SDAG-NEXT: fmov s1, #4.00000000
-; CHECK-SDAG-NEXT: fadd s0, s0, s1
-; CHECK-SDAG-NEXT: b .LBB2_8
-; CHECK-SDAG-NEXT: .LBB2_5: // %use_c
-; CHECK-SDAG-NEXT: fmov s0, s1
-; CHECK-SDAG-NEXT: sub x8, x29, #16
-; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8
-; CHECK-SDAG-NEXT: bl cosf
-; CHECK-SDAG-NEXT: smstart za
-; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0
-; CHECK-SDAG-NEXT: sub x0, x29, #16
-; CHECK-SDAG-NEXT: cbnz x8, .LBB2_7
-; CHECK-SDAG-NEXT: // %bb.6: // %use_c
-; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore
-; CHECK-SDAG-NEXT: .LBB2_7: // %use_c
-; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-SDAG-NEXT: .LBB2_8: // %exit
-; CHECK-SDAG-NEXT: mov sp, x29
-; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-SDAG-NEXT: ret
-;
; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
@@ -119,9 +74,7 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: sub x10, x29, #16
; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT: cmp sp, x9
@@ -137,19 +90,21 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
; CHECK-NEXT: // %bb.4: // %use_b
; CHECK-NEXT: fmov s1, #4.00000000
; CHECK-NEXT: fadd s0, s0, s1
-; CHECK-NEXT: b .LBB2_6
+; CHECK-NEXT: b .LBB2_8
; CHECK-NEXT: .LBB2_5: // %use_c
; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: sub x8, x29, #16
+; CHECK-NEXT: msr TPIDR2_EL0, x8
; CHECK-NEXT: bl cosf
-; CHECK-NEXT: .LBB2_6: // %exit
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: sub x0, x29, #16
-; CHECK-NEXT: cbnz x8, .LBB2_8
-; CHECK-NEXT: // %bb.7: // %exit
+; CHECK-NEXT: cbnz x8, .LBB2_7
+; CHECK-NEXT: // %bb.6: // %use_c
; CHECK-NEXT: bl __arm_tpidr2_restore
-; CHECK-NEXT: .LBB2_8: // %exit
+; CHECK-NEXT: .LBB2_7: // %use_c
; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB2_8: // %exit
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
More information about the llvm-branch-commits
mailing list