[llvm] [AArch64][SME] Support Windows/stack probes in MachineSMEABIPass (PR #149063)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 5 05:20:50 PDT 2025


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/149063

>From a62e9e0bb68f5491a1eca03f392e1d819701468c Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 15 Jul 2025 11:47:39 +0000
Subject: [PATCH 1/5] [AArch64][SME] Support Windows/stack probes in
 MachineSMEABIPass

On Windows or with stack probes on other targets, additional code needs
to be inserted after dynamic stack allocations to validate stack
accesses and/or ensure enough stack space has been allocated.

Rather than handle this case in the MachineSMEABIPass (like we do for
the standard case), we allocate the memory for the lazy save buffer in
SelectionDAG, which allows the existing expansions to emit the correct
code.

Note: This means that, in these cases, we may allocate a lazy save buffer
even when the function contains no lazy saves (as the buffer has to be
allocated before the MachineSMEABIPass runs).

Change-Id: If89ab54c4de79f6fe5513a6b387e9e349f7bc7d1
---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |  1 +
 .../Target/AArch64/AArch64ISelLowering.cpp    | 34 ++++++++-
 .../AArch64/AArch64MachineFunctionInfo.h      | 10 +++
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  7 ++
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 25 +++++--
 .../CodeGen/AArch64/sme-lazy-save-windows.ll  | 71 +++++++++++++++++++
 .../AArch64/sme-za-lazy-save-buffer.ll        | 26 ++++---
 7 files changed, 160 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 57dcd68595ff1..79655e1c9529c 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1688,6 +1688,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
    }
    case AArch64::InOutZAUsePseudo:
    case AArch64::RequiresZASavePseudo:
+   case AArch64::SMEStateAllocPseudo:
    case AArch64::COALESCER_BARRIER_FPR16:
    case AArch64::COALESCER_BARRIER_FPR32:
    case AArch64::COALESCER_BARRIER_FPR64:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b8335113e4687..ca71205205b53 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8291,7 +8291,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   if (Subtarget->hasCustomCallingConv())
     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
-  if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
+  if (getTM().useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
+    if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
+      SDValue Size;
+      if (Attrs.hasZAState()) {
+        SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+                                  DAG.getConstant(1, DL, MVT::i32));
+        Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+      } else if (Attrs.hasAgnosticZAInterface()) {
+        RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
+        SDValue Callee = DAG.getExternalSymbol(
+            getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+        auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+        TargetLowering::CallLoweringInfo CLI(DAG);
+        CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+            getLibcallCallingConv(LC), RetTy, Callee, {});
+        std::tie(Size, Chain) = LowerCallTo(CLI);
+      }
+      if (Size) {
+        SDValue Buffer = DAG.getNode(
+            ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
+            {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+        Chain = Buffer.getValue(1);
+
+        Register BufferPtr =
+            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+        Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+        Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
+                            DAG.getVTList(MVT::Other), Chain);
+        FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
+        MFI.CreateVariableSizedObject(Align(16), nullptr);
+      }
+    }
+  } else {
     // Old SME ABI lowering (deprecated):
     // Create a 16 Byte TPIDR2 object. The dynamic buffer
     // will be expanded and stored in the static object later using a
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 1fde87e65a34b..31bd72bfa77a0 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -238,6 +238,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // Holds the SME function attributes (streaming mode, ZA/ZT0 state).
   SMEAttrs SMEFnAttrs;
 
+  // Virtual register holding the lazy save buffer address, if the buffer was
+  // allocated early in SelectionDAG (for Windows/stack probes support).
+  Register EarlyAllocSMESaveBuffer = AArch64::NoRegister;
+
   // Note: The following properties are only used for the old SME ABI lowering:
   /// The frame-index for the TPIDR2 object used for lazy saves.
   TPIDR2Object TPIDR2;
@@ -256,6 +260,12 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
         const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
+  void setEarlyAllocSMESaveBuffer(Register Ptr) {
+    EarlyAllocSMESaveBuffer = Ptr;
+  }
+
+  Register getEarlyAllocSMESaveBuffer() { return EarlyAllocSMESaveBuffer; }
+
   // Old SME ABI lowering state getters/setters:
   Register getSMESaveBufferAddr() const { return SMESaveBufferAddr; };
   void setSMESaveBufferAddr(Register Reg) { SMESaveBufferAddr = Reg; };
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 0d8cb3a76d0be..601dc34d74b9c 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -93,6 +93,8 @@ let hasSideEffects = 1, isMeta = 1 in {
   def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
 }
 
+def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+
 def CommitZASavePseudo
   : Pseudo<(outs),
            (ins GPR64:$tpidr2_el0, i1imm:$zero_za, i64imm:$commit_routine, variable_ops), []>,
@@ -108,6 +110,11 @@ def AArch64_requires_za_save
            [SDNPHasChain, SDNPInGlue]>;
 def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
 
+def AArch64_sme_state_alloc
+  : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
+           [SDNPHasChain]>;
+def : Pat<(AArch64_sme_state_alloc), (SMEStateAllocPseudo)>;
+
 //===----------------------------------------------------------------------===//
 // Instruction naming conventions.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 5dfaa891193cf..3501b86bc5f3a 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -249,6 +249,7 @@ struct MachineSMEABI : public MachineFunctionPass {
     SmallVector<BlockInfo> Blocks;
     SmallVector<ZAState> BundleStates;
     std::optional<TPIDR2State> TPIDR2Block;
+    std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
   } State;
 
   MachineFunction *MF = nullptr;
@@ -298,6 +299,13 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       MachineBasicBlock::iterator MBBI(MI);
       LiveUnits.stepBackward(MI);
       LiveRegs PhysLiveRegs = GetPhysLiveRegs();
+      // The SMEStateAllocPseudo marker is added to a function if the save
+      // buffer was allocated in SelectionDAG. It marks the end of the
+      // allocation -- which is a safe point for this pass to insert any TPIDR2
+      // block setup.
+      if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
+        State.AfterSMEProloguePt = MBBI;
+      }
       auto [NeededState, InsertPt] = getZAStateBeforeInst(
           *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
       assert((InsertPt == MBBI ||
@@ -529,23 +537,25 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
 void MachineSMEABI::emitAllocateLazySaveBuffer(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
   MachineFrameInfo &MFI = MF->getFrameInfo();
+  auto *AFI = MF->getInfo<AArch64FunctionInfo>();
 
   DebugLoc DL = getDebugLoc(MBB, MBBI);
   Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
   Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
-  Register Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+  Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
 
   // Calculate SVL.
   BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
 
   // 1. Allocate the lazy save buffer.
-  {
+  if (Buffer == AArch64::NoRegister) {
     // TODO This function grows the stack with a subtraction, which doesn't work
     // on Windows. Some refactoring to share the functionality in
     // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
     // supports SME
     assert(!Subtarget->isTargetWindows() &&
            "Lazy ZA save is not yet supported on Windows");
+    Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
     // Get original stack pointer.
     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
         .addReg(AArch64::SP);
@@ -686,8 +696,15 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
 
   // Allocate save buffer (if needed).
   if (State.TPIDR2Block) {
-    MachineBasicBlock &EntryBlock = MF.front();
-    emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+    if (State.AfterSMEProloguePt) {
+      // Note: With inline stack probes the AfterSMEProloguePt may not be in the
+      // entry block (due to the probing loop).
+      emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+                                 *State.AfterSMEProloguePt);
+    } else {
+      MachineBasicBlock &EntryBlock = MF.front();
+      emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+    }
   }
 
   return true;
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
new file mode 100644
index 0000000000000..873b6d9244f46
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
+
+declare void @private_za_callee()
+declare void @shared_za_callee() "aarch64_inout_za"
+
+define void @test_lazy_save() nounwind "aarch64_inout_za" {
+; CHECK-LABEL: test_lazy_save:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    lsr x15, x9, #4
+; CHECK-NEXT:    bl __chkstk
+; CHECK-NEXT:    sub x9, sp, x15, lsl #4
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    bl private_za_callee
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB0_2
+; CHECK-NEXT:  // %bb.1:
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x29, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+;
+; CHECK-NEWLOWERING-LABEL: test_lazy_save:
+; CHECK-NEWLOWERING:       // %bb.0:
+; CHECK-NEWLOWERING-NEXT:    stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x29, sp
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
+; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
+; CHECK-NEWLOWERING-NEXT:    mul x9, x8, x8
+; CHECK-NEWLOWERING-NEXT:    lsr x15, x9, #4
+; CHECK-NEWLOWERING-NEXT:    bl __chkstk
+; CHECK-NEWLOWERING-NEXT:    sub x9, sp, x15, lsl #4
+; CHECK-NEWLOWERING-NEXT:    mov sp, x9
+; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
+; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEWLOWERING-NEXT:    bl private_za_callee
+; CHECK-NEWLOWERING-NEXT:    smstart za
+; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB0_2
+; CHECK-NEWLOWERING-NEXT:  // %bb.1:
+; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEWLOWERING-NEXT:  .LBB0_2:
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEWLOWERING-NEXT:    mov sp, x29
+; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp x30, x29, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ret
+  call void @private_za_callee()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 4ab553d79405d..066ee3b040469 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -99,7 +99,6 @@ exit:
   ret float %ret
 }
 
-; FIXME: This is missing stack probes with -aarch64-new-sme-abi.
 define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
 ; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
 ; CHECK:       // %bb.0:
@@ -157,26 +156,35 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
 ; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
 ; CHECK-NEWLOWERING-NEXT:    mov x9, sp
 ; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEWLOWERING-NEXT:    cmp sp, x9
+; CHECK-NEWLOWERING-NEXT:    b.le .LBB2_3
+; CHECK-NEWLOWERING-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEWLOWERING-NEXT:    str xzr, [sp]
+; CHECK-NEWLOWERING-NEXT:    b .LBB2_1
+; CHECK-NEWLOWERING-NEXT:  .LBB2_3:
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x9
+; CHECK-NEWLOWERING-NEXT:    ldr xzr, [sp]
 ; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB2_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %use_b
+; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB2_5
+; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %use_b
 ; CHECK-NEWLOWERING-NEXT:    fmov s1, #4.00000000
 ; CHECK-NEWLOWERING-NEXT:    fadd s0, s0, s1
-; CHECK-NEWLOWERING-NEXT:    b .LBB2_3
-; CHECK-NEWLOWERING-NEXT:  .LBB2_2: // %use_c
+; CHECK-NEWLOWERING-NEXT:    b .LBB2_6
+; CHECK-NEWLOWERING-NEXT:  .LBB2_5: // %use_c
 ; CHECK-NEWLOWERING-NEXT:    fmov s0, s1
 ; CHECK-NEWLOWERING-NEXT:    bl cosf
-; CHECK-NEWLOWERING-NEXT:  .LBB2_3: // %exit
+; CHECK-NEWLOWERING-NEXT:  .LBB2_6: // %exit
 ; CHECK-NEWLOWERING-NEXT:    smstart za
 ; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB2_5
-; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %exit
+; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB2_8
+; CHECK-NEWLOWERING-NEXT:  // %bb.7: // %exit
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:  .LBB2_5: // %exit
+; CHECK-NEWLOWERING-NEXT:  .LBB2_8: // %exit
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x29
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload

>From ba0759aa8d53b81481a5e9449b564ab79efbce94 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 18 Jul 2025 13:29:39 +0000
Subject: [PATCH 2/5] Update comment

Change-Id: I5dca6eaca8613a33e89a5cec9cc7d2c0f9cc7fb5
---
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 3501b86bc5f3a..c9e31bf170417 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -549,10 +549,12 @@ void MachineSMEABI::emitAllocateLazySaveBuffer(
 
   // 1. Allocate the lazy save buffer.
   if (Buffer == AArch64::NoRegister) {
-    // TODO This function grows the stack with a subtraction, which doesn't work
-    // on Windows. Some refactoring to share the functionality in
-    // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
-    // supports SME
+    // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
+    // Buffer != AArch64::NoRegister). This is done to reuse the existing
+    // expansions (which can insert stack checks). This works, but it means we
+    // will always allocate the lazy save buffer (even if the function contains
+    // no lazy saves). If we want to handle Windows here, we'll need to
+    // implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
     assert(!Subtarget->isTargetWindows() &&
            "Lazy ZA save is not yet supported on Windows");
     Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);

>From a87dcab220736f2527b62581cddd54a727e9be61 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 27 Aug 2025 09:11:59 +0000
Subject: [PATCH 3/5] Update tests after rebase

Change-Id: I0c3703e432386814830492e46d896ac2395840fc
---
 .../CodeGen/AArch64/sme-lazy-save-windows.ll  | 40 ++-----------------
 1 file changed, 4 insertions(+), 36 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
index 873b6d9244f46..1c341e8daf491 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-windows.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s --check-prefix=CHECK-NEWLOWERING
+; RUN: llc -mtriple=aarch64-windows-msvc -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme -aarch64-new-sme-abi < %s | FileCheck %s
 
 declare void @private_za_callee()
 declare void @shared_za_callee() "aarch64_inout_za"
@@ -18,12 +18,9 @@ define void @test_lazy_save() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    bl __chkstk
 ; CHECK-NEXT:    sub x9, sp, x15, lsl #4
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stur x9, [x29, #-16]
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -37,35 +34,6 @@ define void @test_lazy_save() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x30, x29, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-LABEL: test_lazy_save:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    stp x30, x29, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    mov x29, sp
-; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT:    mul x9, x8, x8
-; CHECK-NEWLOWERING-NEXT:    lsr x15, x9, #4
-; CHECK-NEWLOWERING-NEXT:    bl __chkstk
-; CHECK-NEWLOWERING-NEXT:    sub x9, sp, x15, lsl #4
-; CHECK-NEWLOWERING-NEXT:    mov sp, x9
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    bl private_za_callee
-; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB0_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.1:
-; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:  .LBB0_2:
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT:    mov sp, x29
-; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    ldp x30, x29, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    ret
   call void @private_za_callee()
   ret void
 }

>From 58a8b727f2e435ae6c008a2bc16432ba53c976a9 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 5 Sep 2025 12:15:28 +0000
Subject: [PATCH 4/5] Remove agnostic ZA handling

Change-Id: Id48a5e1cbf3f246165b41657372ff046b6ff0c84
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ca71205205b53..90eedc9d2c922 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8298,15 +8298,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
         SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                                   DAG.getConstant(1, DL, MVT::i32));
         Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
-      } else if (Attrs.hasAgnosticZAInterface()) {
-        RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
-        SDValue Callee = DAG.getExternalSymbol(
-            getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
-        auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
-        TargetLowering::CallLoweringInfo CLI(DAG);
-        CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
-            getLibcallCallingConv(LC), RetTy, Callee, {});
-        std::tie(Size, Chain) = LowerCallTo(CLI);
       }
       if (Size) {
         SDValue Buffer = DAG.getNode(

>From 94fd328a8ec8728a4e94a297d9e097fcec790702 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 5 Sep 2025 12:20:33 +0000
Subject: [PATCH 5/5] Remove braces

Change-Id: I6755e71c47d03fba8931876b8607289df30feaae
---
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index c9e31bf170417..d95d170a813ac 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -303,9 +303,8 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       // buffer was allocated in SelectionDAG. It marks the end of the
       // allocation -- which is a safe point for this pass to insert any TPIDR2
       // block setup.
-      if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
+      if (MI.getOpcode() == AArch64::SMEStateAllocPseudo)
         State.AfterSMEProloguePt = MBBI;
-      }
       auto [NeededState, InsertPt] = getZAStateBeforeInst(
           *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
       assert((InsertPt == MBBI ||



More information about the llvm-commits mailing list