[llvm] [AArch64][SME] Remove unused ZA lazy-save (PR #81648)

Sam Tebbs via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 17 02:18:46 PDT 2024


https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/81648

>From cf3b779e640c93abdb3b9c4993697934c87e12a9 Mon Sep 17 00:00:00 2001
From: Matt Devereau <matthew.devereau at arm.com>
Date: Mon, 29 Jan 2024 09:49:38 +0000
Subject: [PATCH 01/22] [AArch64][SME] Remove unused ZA lazy-save

This patch removes the TPIDR2 lazy-save object and buffer
if no lazy save is required.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 97 +++++++++++++++++--
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 +
 .../AArch64/AArch64MachineFunctionInfo.h      | 11 ++-
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  4 +
 .../AArch64/sme-disable-gisel-fisel.ll        | 33 ++++---
 .../CodeGen/AArch64/sme-framelower-use-bp.ll  |  9 +-
 .../CodeGen/AArch64/sme-lazy-save-call.ll     | 50 +++++-----
 .../AArch64/sme-shared-za-interface.ll        | 26 ++---
 .../AArch64/sme-za-lazy-save-buffer.ll        | 45 +++++++++
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    | 77 ++++-----------
 10 files changed, 230 insertions(+), 124 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index af8b9d9576ff7..0125b1267561a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2991,6 +2991,68 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
+                                          MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+
+  std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj();
+  if (!TPIDR2)
+    llvm_unreachable("Cannot ExpandZABuffer without valid TPIDR2 object");
+
+  if (TPIDR2->Uses == 0) {
+    BB->remove_instr(&MI);
+    MFI.RemoveStackObject(TPIDR2->Addr);
+    return BB;
+  }
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  Register RDSVL = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL)
+      .addImm(1);
+
+  Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
+      .addReg(AArch64::SP);
+
+  Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
+      .addReg(RDSVL)
+      .addReg(RDSVL)
+      .addReg(SP);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
+      .addReg(MSub);
+
+  uint64_t TPIDR2Object = TPIDR2->Addr;
+
+  MFI.CreateVariableSizedObject(Align(1), nullptr);
+
+  Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+  MachineInstrBuilder Wzr =
+      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
+          .addReg(AArch64::WZR);
+
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+      .addReg(MSub)
+      .addFrameIndex(TPIDR2Object)
+      .addImm(0);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+      .addReg(Wzr.getReg(0))
+      .addFrameIndex(TPIDR2Object)
+      .addImm(5);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+      .addReg(Wzr.getReg(0))
+      .addFrameIndex(TPIDR2Object)
+      .addImm(3);
+
+  BB->remove_instr(&MI);
+  return BB;
+}
+
 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB) const {
 
@@ -3021,6 +3083,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MI.dump();
 #endif
     llvm_unreachable("Unexpected instruction for custom inserter!");
+  case AArch64::ExpandZABuffer:
+    return EmitExpandZABuffer(MI, BB);
 
   case AArch64::F128CSEL:
     return EmitF128CSEL(MI, BB);
@@ -7485,10 +7549,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   if (Subtarget->hasCustomCallingConv())
     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
-  // Conservatively assume the function requires the lazy-save mechanism.
+  // Create a 16 Byte TPIDR2 object. The dynamic buffer
+  // will be expanded and stored in the static object later using a pseudonode.
   if (SMEAttrs(MF.getFunction()).hasZAState()) {
-    unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
-    FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
+    Chain = SDValue(
+        DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0);
+    TPIDR2Object TPIDR2;
+    TPIDR2.Addr = MFI.CreateStackObject(16, Align(16), false);
+    FuncInfo->setTPIDR2Obj(TPIDR2);
   }
 
   if (CallConv == CallingConv::PreserveNone) {
@@ -8174,9 +8242,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
   if (RequiresLazySave) {
-    unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
-    MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
-    SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
+    TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
+    MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.Addr);
+    SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
+        TPIDR2.Addr,
         DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
     SDValue NumZaSaveSlicesAddr =
         DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -8719,7 +8788,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (RequiresLazySave) {
     // Conditionally restore the lazy save using a pseudo node.
-    unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
+    TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
     SDValue RegMask = DAG.getRegisterMask(
         TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
     SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
@@ -8732,7 +8801,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     // RESTORE_ZA pseudo.
     SDValue Glue;
     SDValue TPIDR2Block = DAG.getFrameIndex(
-        FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+        TPIDR2.Addr,
+        DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
     Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
     Result =
         DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
@@ -8744,6 +8814,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
         ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
         DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
         DAG.getConstant(0, DL, MVT::i64));
+    TPIDR2.Uses++;
+    FuncInfo->setTPIDR2Obj(TPIDR2);
+  }
+
+  if (std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj()) {
+    if (auto Global = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      if (Global->getGlobal()->getName() == "__arm_tpidr2_save") {
+        TPIDR2->Uses++;
+        FuncInfo->setTPIDR2Obj(*TPIDR2);
+      }
+    }
   }
 
   if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index b57ba097847cd..f144d49fbd290 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -658,6 +658,8 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB,
                                  unsigned Opcode, bool Op0IsDef) const;
   MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI,
+                                        MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 839a3a3878076..ce4143c74195f 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -36,6 +36,11 @@ struct AArch64FunctionInfo;
 class AArch64Subtarget;
 class MachineInstr;
 
+struct TPIDR2Object {
+  uint64_t Addr = 0;
+  uint32_t Uses = 0;
+};
+
 /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
 /// contains private AArch64-specific information for each MachineFunction.
 class AArch64FunctionInfo final : public MachineFunctionInfo {
@@ -196,7 +201,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   bool IsSVECC = false;
 
   /// The frame-index for the TPIDR2 object used for lazy saves.
-  Register LazySaveTPIDR2Obj = 0;
+  std::optional<TPIDR2Object> TPIDR2;
 
   /// Whether this function changes streaming mode within the function.
   bool HasStreamingModeChanges = false;
@@ -248,8 +253,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   bool isSVECC() const { return IsSVECC; };
   void setIsSVECC(bool s) { IsSVECC = s; };
 
-  unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; }
-  void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; }
+  std::optional<TPIDR2Object> getTPIDR2Obj() { return TPIDR2; }
+  void setTPIDR2Obj(TPIDR2Object Obj) { TPIDR2 = Obj; }
 
   void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index fea70b7ffb074..bf6f67a1df754 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -37,6 +37,10 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
 def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
                               [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
 
+let usesCustomInserter = 1 in {
+  def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {}
+}
+
 //===----------------------------------------------------------------------===//
 // Instruction naming conventions.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index d786ffd412c47..60bdbb8c17c2c 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -277,11 +277,12 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x8
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
 ; CHECK-COMMON-NEXT:    sub x8, x29, #16
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
@@ -319,14 +320,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x8
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl __addtf3
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
@@ -384,14 +386,15 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x8
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl fmod
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index 7db0cf7f18c58..62836dde2fc6d 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -35,12 +35,11 @@ define void @quux() #1 {
 ; CHECK-NEXT:    .cfi_offset w30, -88
 ; CHECK-NEXT:    .cfi_offset w29, -96
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mul x9, x8, x8
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    subs x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    str x8, [x19, #384]
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    str x9, [x19, #384]
 ; CHECK-NEXT:    strh w8, [x19, #394]
 ; CHECK-NEXT:    str w8, [x19, #396]
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index b0d6e046042e6..de2c7dab0eb81 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -14,14 +14,15 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -48,14 +49,15 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x20, #1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    msub x8, x20, x20, x8
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    rdsvl x20, #1
 ; CHECK-NEXT:    sub x21, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w20, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x21
 ; CHECK-NEXT:    bl private_za_callee
@@ -98,14 +100,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -139,14 +142,15 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #80
-; CHECK-NEXT:    stur wzr, [x29, #-68]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-80]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #80
 ; CHECK-NEXT:    sturh wzr, [x29, #-70]
-; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    stur wzr, [x29, #-68]
 ; CHECK-NEXT:    sturh w8, [x29, #-72]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x20, x0, #0x1
 ; CHECK-NEXT:    tbz w20, #0, .LBB3_2
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index cd7460b177c4b..46672c364b73d 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -12,14 +12,15 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -45,14 +46,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
new file mode 100644
index 0000000000000..19ba4b9cbcbf4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
+
+define i32 @no_tpidr2_save_required() "aarch64_pstate_za_shared" {
+; CHECK-LABEL: no_tpidr2_save_required:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #42 // =0x2a
+; CHECK-NEXT:    ret
+entry:
+  ret i32 42
+}
+
+define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_pstate_za_shared" {
+; CHECK-LABEL: multi_bb_stpidr2_save_required:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cbz w0, .LBB1_2
+; CHECK-NEXT:  // %bb.1: // %use_b
+; CHECK-NEXT:    fmov s1, #4.00000000
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_2: // %use_c
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    bl cosf
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i32 %a, 0
+  br i1 %cmp, label %use_b, label %use_c
+
+use_b:
+  %faddr = fadd float %b, 4.0
+  br label %exit
+
+use_c:
+  %res2 = call float @llvm.cos.f32(float %c)
+  br label %exit
+
+exit:
+  %ret = phi float [%faddr, %use_b], [%res2, %use_c]
+  ret float %ret
+}
+
+declare float @llvm.cos.f32(float)
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 7f40b5e7e1344..cbbfb4a7ca7a6 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -39,15 +39,16 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sub x19, x29, #80
-; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    str zt0, [x19]
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart za
@@ -87,24 +88,14 @@ define void @zt0_shared_caller_zt0_shared_callee() "aarch64_in_zt0" nounwind {
 define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind {
 ; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    sub x19, x29, #80
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x19, sp
 ; CHECK-NEXT:    str zt0, [x19]
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    ldr zt0, [x19]
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za";
   ret void;
@@ -114,19 +105,9 @@ define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64
 define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind {
 ; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
   ret void;
@@ -199,9 +180,9 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    cbz x8, .LBB7_2
 ; CHECK-NEXT:  // %bb.1: // %save.za
@@ -227,20 +208,10 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwind {
 ; CHECK-LABEL: new_za_shared_zt0_caller:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    zero {za}
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
   ret void;
@@ -250,20 +221,10 @@ define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwi
 define void @shared_za_new_zt0() "aarch64_inout_za" "aarch64_new_zt0" nounwind {
 ; CHECK-LABEL: shared_za_new_zt0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    zero { zt0 }
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
   ret void;

>From f8b1ac1f75dffc85f43fe473f037ed660315c5ac Mon Sep 17 00:00:00 2001
From: Matt Devereau <matthew.devereau at arm.com>
Date: Wed, 14 Feb 2024 17:26:20 +0000
Subject: [PATCH 02/22] Add implicit uses

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 59 +++----------------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  3 -
 .../AArch64/AArch64MachineFunctionInfo.h      |  4 +-
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  2 +-
 4 files changed, 12 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0125b1267561a..3e4969999907f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3004,7 +3004,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
 
   if (TPIDR2->Uses == 0) {
     BB->remove_instr(&MI);
-    MFI.RemoveStackObject(TPIDR2->Addr);
+    MFI.RemoveStackObject(TPIDR2->FrameIndex);
     return BB;
   }
 
@@ -3027,9 +3027,8 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
       .addReg(MSub);
 
-  uint64_t TPIDR2Object = TPIDR2->Addr;
-
-  MFI.CreateVariableSizedObject(Align(1), nullptr);
+  unsigned TPIDR2Object = TPIDR2->FrameIndex;
+  MFI.CreateVariableSizedObject(Align(16), nullptr);
 
   Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
   MachineInstrBuilder Wzr =
@@ -7093,47 +7092,6 @@ AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
   }
 }
 
-
-unsigned
-AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
-                                              SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo &MFI = MF.getFrameInfo();
-
-  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
-  SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
-                          DAG.getConstant(1, DL, MVT::i32));
-  SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
-  SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
-  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
-  SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
-  Chain = Buffer.getValue(1);
-  MFI.CreateVariableSizedObject(Align(1), nullptr);
-
-  // Allocate an additional TPIDR2 object on the stack (16 bytes)
-  unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
-
-  // Store the buffer pointer to the TPIDR2 stack object.
-  MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
-  SDValue Ptr = DAG.getFrameIndex(
-      TPIDR2Obj,
-      DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
-  Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
-
-  // Set the reserved bytes (10-15) to zero
-  EVT PtrTy = Ptr.getValueType();
-  SDValue ReservedPtr =
-      DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
-  Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
-                       MPI);
-  ReservedPtr =
-      DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
-  Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
-                       MPI);
-
-  return TPIDR2Obj;
-}
-
 static bool isPassedInFPR(EVT VT) {
   return VT.isFixedLengthVector() ||
          (VT.isFloatingPoint() && !VT.isScalableVector());
@@ -7555,7 +7513,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     Chain = SDValue(
         DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0);
     TPIDR2Object TPIDR2;
-    TPIDR2.Addr = MFI.CreateStackObject(16, Align(16), false);
+    TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
     FuncInfo->setTPIDR2Obj(TPIDR2);
   }
 
@@ -8242,10 +8200,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
   if (RequiresLazySave) {
-    TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
-    MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.Addr);
+    const TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
+    MachinePointerInfo MPI =
+        MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
     SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
-        TPIDR2.Addr,
+        TPIDR2.FrameIndex,
         DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
     SDValue NumZaSaveSlicesAddr =
         DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -8801,7 +8760,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     // RESTORE_ZA pseudo.
     SDValue Glue;
     SDValue TPIDR2Block = DAG.getFrameIndex(
-        TPIDR2.Addr,
+        TPIDR2.FrameIndex,
         DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
     Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
     Result =
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f144d49fbd290..2468564d09223 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1039,9 +1039,6 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override;
 
-  unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
-                                  SelectionDAG &DAG) const;
-
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index ce4143c74195f..95affb326ab93 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -37,8 +37,8 @@ class AArch64Subtarget;
 class MachineInstr;
 
 struct TPIDR2Object {
-  uint64_t Addr = 0;
-  uint32_t Uses = 0;
+  unsigned FrameIndex = 0;
+  unsigned Uses = 0;
 };
 
 /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index bf6f67a1df754..f4fdc640b9929 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -37,7 +37,7 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
 def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
                               [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
 
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
   def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {}
 }
 

>From 253b004ad49df080accc0348e9079648d7969ce6 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 7 Mar 2024 13:45:25 +0000
Subject: [PATCH 03/22] fixup: add comments from lazy save function

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3e4969999907f..7304385e16fc6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3019,6 +3019,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
       .addReg(AArch64::SP);
 
+  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
   Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
       .addReg(RDSVL)
@@ -3027,6 +3028,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
       .addReg(MSub);
 
+  // Allocate an additional TPIDR2 object on the stack (16 bytes)
   unsigned TPIDR2Object = TPIDR2->FrameIndex;
   MFI.CreateVariableSizedObject(Align(16), nullptr);
 
@@ -3035,10 +3037,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
       BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
           .addReg(AArch64::WZR);
 
+  // Store the buffer pointer to the TPIDR2 stack object.
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
       .addReg(MSub)
       .addFrameIndex(TPIDR2Object)
       .addImm(0);
+  // Set the reserved bytes (10-15) to zero
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
       .addReg(Wzr.getReg(0))
       .addFrameIndex(TPIDR2Object)

>From 804135e5f079d76ce0cbd3554c2e922a815f2994 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 7 Mar 2024 13:48:54 +0000
Subject: [PATCH 04/22] fixup: set FrameIndex to max int

---
 llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 95affb326ab93..354e234b1c363 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -37,7 +37,7 @@ class AArch64Subtarget;
 class MachineInstr;
 
 struct TPIDR2Object {
-  unsigned FrameIndex = 0;
+  unsigned FrameIndex = std::numeric_limits<int>::max();
   unsigned Uses = 0;
 };
 

>From f8f1c57328007dee93c8df9dd7230b4969b8397e Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Mar 2024 16:01:56 +0000
Subject: [PATCH 05/22] fixup: add Windows assertion

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7304385e16fc6..f993bb160c036 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2997,6 +2997,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
   MachineFunction *MF = BB->getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+  // TODO This function grows the stack with a subtraction, which doesn't work
+  // on Windows. Some refactoring to share the functionality in
+  // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
+  // supports SME
+  assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
+         "Lazy ZA save is not yet supported on Windows");
 
   std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj();
   if (!TPIDR2)

>From dd0d4d5e3a3f264b727632da867417d1f61d562e Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 26 Mar 2024 16:19:59 +0000
Subject: [PATCH 06/22] fixup: lower to STACKALLOC pseudo

---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      | 16 +++++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    | 28 ++++++++++---------
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  7 ++++-
 .../AArch64/sme-disable-gisel-fisel.ll        | 12 +++++---
 .../CodeGen/AArch64/sme-framelower-use-bp.ll  |  3 +-
 .../CodeGen/AArch64/sme-lazy-save-call.ll     | 12 +++++---
 .../AArch64/sme-shared-za-interface.ll        |  6 ++--
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    |  6 ++--
 8 files changed, 63 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 9b7fc228d5de8..d381152880843 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1166,6 +1166,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   default:
     break;
 
+  case AArch64::STACKALLOC: {
+    Register Dest = MI.getOperand(0).getReg();
+    Register Src = MI.getOperand(1).getReg();
+    Register SPCopy = MI.getOperand(2).getReg();
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest)
+        .addReg(SPCopy)
+        .add(MI.getOperand(1))
+        .addImm(0);
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(Dest)
+        .addImm(0)
+        .addImm(0);
+    MI.eraseFromParent();
+    return true;
+  }
   case AArch64::BSPv8i8:
   case AArch64::BSPv16i8: {
     Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f993bb160c036..b94c80171d99c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3021,22 +3021,24 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL)
       .addImm(1);
 
-  Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
-      .addReg(AArch64::SP);
-
-  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
-  Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
+  // Allocate the ZA buffer
+  Register BufferSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MADDXrrr), BufferSize)
       .addReg(RDSVL)
       .addReg(RDSVL)
-      .addReg(SP);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
-      .addReg(MSub);
+      .addReg(AArch64::XZR);
+  Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  Register SPCopy = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SPCopy)
+      .addReg(AArch64::SP);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STACKALLOC), BufferAddr)
+      .addReg(BufferSize)
+      .addReg(SPCopy);
+  MFI.CreateVariableSizedObject(Align(16), nullptr);
+
+  // expand pseudo in expand pass or remove pseudo and remove stack object
 
-  // Allocate an additional TPIDR2 object on the stack (16 bytes)
   unsigned TPIDR2Object = TPIDR2->FrameIndex;
-  MFI.CreateVariableSizedObject(Align(16), nullptr);
 
   Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
   MachineInstrBuilder Wzr =
@@ -3045,7 +3047,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
 
   // Store the buffer pointer to the TPIDR2 stack object.
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
-      .addReg(MSub)
+      .addReg(BufferAddr)
       .addFrameIndex(TPIDR2Object)
       .addImm(0);
   // Set the reserved bytes (10-15) to zero
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 91e5bc3caa102..1aa49b2f1941b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1021,7 +1021,11 @@ include "SMEInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 let hasSideEffects = 1, isCodeGenOnly = 1 in {
-let Defs = [SP], Uses = [SP] in {
+let Defs = [SP] in {
+
+def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>;
+
+let Uses = [SP] in {
 // We set Sched to empty list because we expect these instructions to simply get
 // removed in most cases.
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -1032,6 +1036,7 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                             Sched<[]>;
 
 }
+}
 
 let Defs = [SP, NZCV], Uses = [SP] in {
 // Probed stack allocation of a constant size, used in function prologues when
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 60bdbb8c17c2c..b4325412860fa 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -237,8 +237,9 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
@@ -276,8 +277,9 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
@@ -320,7 +322,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    mul x8, x8, x8
+; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
@@ -386,7 +389,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    mul x8, x8, x8
+; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index 62836dde2fc6d..29d15f34d680b 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -35,8 +35,9 @@ define void @quux() #1 {
 ; CHECK-NEXT:    .cfi_offset w30, -88
 ; CHECK-NEXT:    .cfi_offset w29, -96
 ; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    sub x9, x9, x8
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    str x9, [x19, #384]
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index de2c7dab0eb81..16e64f924780c 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -14,7 +14,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -51,7 +52,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    rdsvl x20, #1
 ; CHECK-NEXT:    sub x21, x29, #16
@@ -100,7 +102,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -142,7 +145,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-80]
 ; CHECK-NEXT:    rdsvl x8, #1
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 46672c364b73d..03b49c39a4539 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -12,7 +12,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -46,7 +47,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    rdsvl x8, #1
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index cbbfb4a7ca7a6..f810054eac831 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -39,7 +39,8 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -178,7 +179,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]

>From 32582c6319ec88d167130391d3337eebdb1c6082 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 28 Mar 2024 10:59:50 +0000
Subject: [PATCH 07/22] fixup: lower to STORETPIDR2 pseudo

---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      | 23 +++++++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 26 ++--------
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  1 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  3 ++
 .../AArch64/sme-disable-gisel-fisel.ll        | 44 +++++++++-------
 .../CodeGen/AArch64/sme-lazy-save-call.ll     | 52 ++++++++++---------
 .../AArch64/sme-shared-za-interface.ll        | 30 ++++++-----
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    | 15 +++---
 8 files changed, 106 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index d381152880843..70d27fd2fb2b3 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1166,9 +1166,30 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   default:
     break;
 
+  case AArch64::STORETPIDR2: {
+    Register BufferAddr = MI.getOperand(0).getReg();
+    auto TPIDR2Object = MI.getOperand(1).getReg();
+    unsigned Offset = MI.getOperand(2).getImm();
+    // Store the buffer pointer to the TPIDR2 stack object.
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+        .addReg(BufferAddr)
+        .addUse(TPIDR2Object)
+        .addImm(0 + Offset);
+    // Set the reserved bytes (10-15) to zero
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+        .addReg(AArch64::WZR)
+        .addUse(TPIDR2Object)
+        .addImm(5 + Offset);
+    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+        .addReg(AArch64::WZR)
+        .addUse(TPIDR2Object)
+        .addImm(3 + Offset);
+    MI.eraseFromParent();
+    return true;
+  }
+
   case AArch64::STACKALLOC: {
     Register Dest = MI.getOperand(0).getReg();
-    Register Src = MI.getOperand(1).getReg();
     Register SPCopy = MI.getOperand(2).getReg();
     BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest)
         .addReg(SPCopy)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b94c80171d99c..1ed68958291be 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3036,30 +3036,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
       .addReg(SPCopy);
   MFI.CreateVariableSizedObject(Align(16), nullptr);
 
-  // expand pseudo in expand pass or remove pseudo and remove stack object
-
   unsigned TPIDR2Object = TPIDR2->FrameIndex;
 
-  Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
-  MachineInstrBuilder Wzr =
-      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
-          .addReg(AArch64::WZR);
-
-  // Store the buffer pointer to the TPIDR2 stack object.
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
-      .addReg(BufferAddr)
-      .addFrameIndex(TPIDR2Object)
-      .addImm(0);
-  // Set the reserved bytes (10-15) to zero
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
-      .addReg(Wzr.getReg(0))
-      .addFrameIndex(TPIDR2Object)
-      .addImm(5);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
-      .addReg(Wzr.getReg(0))
-      .addFrameIndex(TPIDR2Object)
-      .addImm(3);
-
+  auto MI2 = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STORETPIDR2))
+                 .addReg(BufferAddr)
+                 .addFrameIndex(TPIDR2Object)
+                 .addImm(0);
   BB->remove_instr(&MI);
   return BB;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f4b5fd7a003c2..cb2d43f4d5f1f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3640,6 +3640,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
   case AArch64::LDRDui:
   case AArch64::STRXui:
   case AArch64::STRDui:
+  case AArch64::STORETPIDR2:
     Scale = TypeSize::getFixed(8);
     Width = TypeSize::getFixed(8);
     MinOffset = 0;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1aa49b2f1941b..e127ca9117cdb 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1021,6 +1021,9 @@ include "SMEInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 let hasSideEffects = 1, isCodeGenOnly = 1 in {
+
+def STORETPIDR2 : Pseudo<(outs), (ins GPR64:$addr, GPR64sp:$frameindex, i32imm:$offset), []>, Sched<[]>;
+
 let Defs = [SP] in {
 
 def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>;
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index b4325412860fa..4c3a93c19fc87 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -241,9 +241,10 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:    mov x9, sp
 ; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
+; CHECK-COMMON-NEXT:    str x8, [x9]
+; CHECK-COMMON-NEXT:    strh wzr, [x9, #10]
+; CHECK-COMMON-NEXT:    str wzr, [x9, #12]
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-COMMON-NEXT:    cbz x8, .LBB6_2
 ; CHECK-COMMON-NEXT:    b .LBB6_1
@@ -281,9 +282,10 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    mov x9, sp
 ; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
+; CHECK-COMMON-NEXT:    str x8, [x9]
+; CHECK-COMMON-NEXT:    strh wzr, [x9, #10]
+; CHECK-COMMON-NEXT:    str wzr, [x9, #12]
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
 ; CHECK-COMMON-NEXT:    sub x8, x29, #16
@@ -325,13 +327,14 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
-; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
-; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-COMMON-NEXT:    rdsvl x9, #1
+; CHECK-COMMON-NEXT:    sub x10, x29, #16
+; CHECK-COMMON-NEXT:    sub x11, x29, #16
+; CHECK-COMMON-NEXT:    str x8, [x11]
+; CHECK-COMMON-NEXT:    strh wzr, [x11, #10]
+; CHECK-COMMON-NEXT:    str wzr, [x11, #12]
+; CHECK-COMMON-NEXT:    sturh w9, [x29, #-8]
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-COMMON-NEXT:    bl __addtf3
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
@@ -392,13 +395,14 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
-; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
-; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-COMMON-NEXT:    rdsvl x9, #1
+; CHECK-COMMON-NEXT:    sub x10, x29, #16
+; CHECK-COMMON-NEXT:    sub x11, x29, #16
+; CHECK-COMMON-NEXT:    str x8, [x11]
+; CHECK-COMMON-NEXT:    strh wzr, [x11, #10]
+; CHECK-COMMON-NEXT:    str wzr, [x11, #12]
+; CHECK-COMMON-NEXT:    sturh w9, [x29, #-8]
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-COMMON-NEXT:    bl fmod
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 16e64f924780c..6bfe670582e29 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -17,13 +17,14 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    sub x11, x29, #16
+; CHECK-NEXT:    str x8, [x11]
+; CHECK-NEXT:    strh wzr, [x11, #10]
+; CHECK-NEXT:    str wzr, [x11, #12]
+; CHECK-NEXT:    sturh w9, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -57,9 +58,10 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    rdsvl x20, #1
 ; CHECK-NEXT:    sub x21, x29, #16
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    str x8, [x9]
+; CHECK-NEXT:    strh wzr, [x9, #10]
+; CHECK-NEXT:    str wzr, [x9, #12]
 ; CHECK-NEXT:    sturh w20, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x21
 ; CHECK-NEXT:    bl private_za_callee
@@ -105,13 +107,14 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    sub x11, x29, #16
+; CHECK-NEXT:    str x8, [x11]
+; CHECK-NEXT:    strh wzr, [x11, #10]
+; CHECK-NEXT:    str wzr, [x11, #12]
+; CHECK-NEXT:    sturh w9, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -148,13 +151,14 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-80]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    sub x9, x29, #80
-; CHECK-NEXT:    sturh wzr, [x29, #-70]
-; CHECK-NEXT:    stur wzr, [x29, #-68]
-; CHECK-NEXT:    sturh w8, [x29, #-72]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    sub x10, x29, #80
+; CHECK-NEXT:    sub x11, x29, #80
+; CHECK-NEXT:    str x8, [x11]
+; CHECK-NEXT:    strh wzr, [x11, #10]
+; CHECK-NEXT:    str wzr, [x11, #12]
+; CHECK-NEXT:    sturh w9, [x29, #-72]
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x20, x0, #0x1
 ; CHECK-NEXT:    tbz w20, #0, .LBB3_2
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 03b49c39a4539..dcd9dbadc7066 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -15,13 +15,14 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    sub x11, x29, #16
+; CHECK-NEXT:    str x8, [x11]
+; CHECK-NEXT:    strh wzr, [x11, #10]
+; CHECK-NEXT:    str wzr, [x11, #12]
+; CHECK-NEXT:    sturh w9, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -50,13 +51,14 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    sub x11, x29, #16
+; CHECK-NEXT:    str x8, [x11]
+; CHECK-NEXT:    strh wzr, [x11, #10]
+; CHECK-NEXT:    str wzr, [x11, #12]
+; CHECK-NEXT:    sturh w9, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index f810054eac831..dfd7714b38724 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -42,12 +42,12 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sub x19, x29, #80
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    str x8, [x9]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    strh wzr, [x9, #10]
+; CHECK-NEXT:    str wzr, [x9, #12]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    str zt0, [x19]
@@ -182,9 +182,10 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    str x8, [x9]
+; CHECK-NEXT:    strh wzr, [x9, #10]
+; CHECK-NEXT:    str wzr, [x9, #12]
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    cbz x8, .LBB7_2
 ; CHECK-NEXT:  // %bb.1: // %save.za

>From 5ef82703b2e4f8a518d431f3e1612961dd47a8a4 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Mon, 8 Apr 2024 10:41:19 +0100
Subject: [PATCH 08/22] fixup: update za attribute in test

---
 llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 19ba4b9cbcbf4..b655c6c69a007 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
 
-define i32 @no_tpidr2_save_required() "aarch64_pstate_za_shared" {
+define i32 @no_tpidr2_save_required() "aarch64_inout_za" {
 ; CHECK-LABEL: no_tpidr2_save_required:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    mov w0, #42 // =0x2a
@@ -10,7 +10,7 @@ entry:
   ret i32 42
 }
 
-define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_pstate_za_shared" {
+define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" {
 ; CHECK-LABEL: multi_bb_stpidr2_save_required:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cbz w0, .LBB1_2

>From de64455ac9b3a16b17d10ceaa05f33d670d1732c Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 23 Apr 2024 16:23:03 +0100
Subject: [PATCH 09/22] Revert "fixup: lower to STORETPIDR2 pseudo"

This reverts commit 19a7169671a964b1b5126468d34a9b5731e23e66.
---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      | 23 +---------
 .../Target/AArch64/AArch64ISelLowering.cpp    | 26 +++++++++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  1 -
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  3 --
 .../AArch64/sme-disable-gisel-fisel.ll        | 44 +++++++++----------
 .../AArch64/sme-shared-za-interface.ll        | 30 ++++++-------
 .../AArch64/sme-za-lazy-save-buffer.ll        | 35 ++++++++++++---
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    | 15 +++----
 8 files changed, 94 insertions(+), 83 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 70d27fd2fb2b3..d381152880843 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1166,30 +1166,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   default:
     break;
 
-  case AArch64::STORETPIDR2: {
-    Register BufferAddr = MI.getOperand(0).getReg();
-    auto TPIDR2Object = MI.getOperand(1).getReg();
-    unsigned Offset = MI.getOperand(2).getImm();
-    // Store the buffer pointer to the TPIDR2 stack object.
-    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
-        .addReg(BufferAddr)
-        .addUse(TPIDR2Object)
-        .addImm(0 + Offset);
-    // Set the reserved bytes (10-15) to zero
-    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
-        .addReg(AArch64::WZR)
-        .addUse(TPIDR2Object)
-        .addImm(5 + Offset);
-    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
-        .addReg(AArch64::WZR)
-        .addUse(TPIDR2Object)
-        .addImm(3 + Offset);
-    MI.eraseFromParent();
-    return true;
-  }
-
   case AArch64::STACKALLOC: {
     Register Dest = MI.getOperand(0).getReg();
+    Register Src = MI.getOperand(1).getReg();
     Register SPCopy = MI.getOperand(2).getReg();
     BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest)
         .addReg(SPCopy)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1ed68958291be..b94c80171d99c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3036,12 +3036,30 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
       .addReg(SPCopy);
   MFI.CreateVariableSizedObject(Align(16), nullptr);
 
+  // expand pseudo in expand pass or remove pseudo and remove stack object
+
   unsigned TPIDR2Object = TPIDR2->FrameIndex;
 
-  auto MI2 = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STORETPIDR2))
-                 .addReg(BufferAddr)
-                 .addFrameIndex(TPIDR2Object)
-                 .addImm(0);
+  Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+  MachineInstrBuilder Wzr =
+      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
+          .addReg(AArch64::WZR);
+
+  // Store the buffer pointer to the TPIDR2 stack object.
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+      .addReg(BufferAddr)
+      .addFrameIndex(TPIDR2Object)
+      .addImm(0);
+  // Set the reserved bytes (10-15) to zero
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+      .addReg(Wzr.getReg(0))
+      .addFrameIndex(TPIDR2Object)
+      .addImm(5);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+      .addReg(Wzr.getReg(0))
+      .addFrameIndex(TPIDR2Object)
+      .addImm(3);
+
   BB->remove_instr(&MI);
   return BB;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index cb2d43f4d5f1f..f4b5fd7a003c2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3640,7 +3640,6 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
   case AArch64::LDRDui:
   case AArch64::STRXui:
   case AArch64::STRDui:
-  case AArch64::STORETPIDR2:
     Scale = TypeSize::getFixed(8);
     Width = TypeSize::getFixed(8);
     MinOffset = 0;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index e127ca9117cdb..1aa49b2f1941b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1021,9 +1021,6 @@ include "SMEInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 let hasSideEffects = 1, isCodeGenOnly = 1 in {
-
-def STORETPIDR2 : Pseudo<(outs), (ins GPR64:$addr, GPR64sp:$frameindex, i32imm:$offset), []>, Sched<[]>;
-
 let Defs = [SP] in {
 
 def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>;
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 4c3a93c19fc87..b4325412860fa 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -241,10 +241,9 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:    mov x9, sp
 ; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
-; CHECK-COMMON-NEXT:    str x8, [x9]
-; CHECK-COMMON-NEXT:    strh wzr, [x9, #10]
-; CHECK-COMMON-NEXT:    str wzr, [x9, #12]
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-COMMON-NEXT:    cbz x8, .LBB6_2
 ; CHECK-COMMON-NEXT:    b .LBB6_1
@@ -282,10 +281,9 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    mov x9, sp
 ; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
-; CHECK-COMMON-NEXT:    str x8, [x9]
-; CHECK-COMMON-NEXT:    strh wzr, [x9, #10]
-; CHECK-COMMON-NEXT:    str wzr, [x9, #12]
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
 ; CHECK-COMMON-NEXT:    sub x8, x29, #16
@@ -327,14 +325,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    rdsvl x9, #1
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
-; CHECK-COMMON-NEXT:    sub x11, x29, #16
-; CHECK-COMMON-NEXT:    str x8, [x11]
-; CHECK-COMMON-NEXT:    strh wzr, [x11, #10]
-; CHECK-COMMON-NEXT:    str wzr, [x11, #12]
-; CHECK-COMMON-NEXT:    sturh w9, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
+; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl __addtf3
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
@@ -395,14 +392,13 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    rdsvl x9, #1
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
-; CHECK-COMMON-NEXT:    sub x11, x29, #16
-; CHECK-COMMON-NEXT:    str x8, [x11]
-; CHECK-COMMON-NEXT:    strh wzr, [x11, #10]
-; CHECK-COMMON-NEXT:    str wzr, [x11, #12]
-; CHECK-COMMON-NEXT:    sturh w9, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
+; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl fmod
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index dcd9dbadc7066..03b49c39a4539 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -15,14 +15,13 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    sub x11, x29, #16
-; CHECK-NEXT:    str x8, [x11]
-; CHECK-NEXT:    strh wzr, [x11, #10]
-; CHECK-NEXT:    str wzr, [x11, #12]
-; CHECK-NEXT:    sturh w9, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -51,14 +50,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    sub x11, x29, #16
-; CHECK-NEXT:    str x8, [x11]
-; CHECK-NEXT:    strh wzr, [x11, #10]
-; CHECK-NEXT:    str wzr, [x11, #12]
-; CHECK-NEXT:    sturh w9, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index b655c6c69a007..686fe1115c275 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -13,18 +13,43 @@ entry:
 define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" {
 ; CHECK-LABEL: multi_bb_stpidr2_save_required:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    str x8, [x9]
+; CHECK-NEXT:    strh wzr, [x9, #10]
+; CHECK-NEXT:    str wzr, [x9, #12]
 ; CHECK-NEXT:    cbz w0, .LBB1_2
 ; CHECK-NEXT:  // %bb.1: // %use_b
 ; CHECK-NEXT:    fmov s1, #4.00000000
 ; CHECK-NEXT:    fadd s0, s0, s1
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b .LBB1_5
 ; CHECK-NEXT:  .LBB1_2: // %use_c
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl cosf
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB1_4
+; CHECK-NEXT:  // %bb.3: // %use_c
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB1_4: // %use_c
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB1_5: // %exit
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %cmp = icmp ne i32 %a, 0
   br i1 %cmp, label %use_b, label %use_c
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index dfd7714b38724..f810054eac831 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -42,12 +42,12 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sub x19, x29, #80
-; CHECK-NEXT:    str x8, [x9]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    strh wzr, [x9, #10]
-; CHECK-NEXT:    str wzr, [x9, #12]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    str zt0, [x19]
@@ -182,10 +182,9 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-NEXT:    mul x8, x8, x8
 ; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    str x8, [x9]
-; CHECK-NEXT:    strh wzr, [x9, #10]
-; CHECK-NEXT:    str wzr, [x9, #12]
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    cbz x8, .LBB7_2
 ; CHECK-NEXT:  // %bb.1: // %save.za

>From d740a3a06b788a50feeab094b4833c28eba9fe85 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 23 Apr 2024 16:23:17 +0100
Subject: [PATCH 10/22] Revert "fixup: lower to STACKALLOC pseudo"

This reverts commit 34f9e9ce4b549b5919e56fd9212afb2b0ff9762f.
---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      | 16 -----------
 .../Target/AArch64/AArch64ISelLowering.cpp    | 28 +++++++++----------
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  7 +----
 .../AArch64/sme-disable-gisel-fisel.ll        | 12 +++-----
 .../CodeGen/AArch64/sme-lazy-save-call.ll     | 12 +++-----
 .../AArch64/sme-shared-za-interface.ll        |  6 ++--
 .../AArch64/sme-za-lazy-save-buffer.ll        |  3 +-
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    |  6 ++--
 8 files changed, 27 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index d381152880843..9b7fc228d5de8 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1166,22 +1166,6 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   default:
     break;
 
-  case AArch64::STACKALLOC: {
-    Register Dest = MI.getOperand(0).getReg();
-    Register Src = MI.getOperand(1).getReg();
-    Register SPCopy = MI.getOperand(2).getReg();
-    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest)
-        .addReg(SPCopy)
-        .add(MI.getOperand(1))
-        .addImm(0);
-    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
-        .addReg(AArch64::SP, RegState::Define)
-        .addReg(Dest)
-        .addImm(0)
-        .addImm(0);
-    MI.eraseFromParent();
-    return true;
-  }
   case AArch64::BSPv8i8:
   case AArch64::BSPv16i8: {
     Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b94c80171d99c..f993bb160c036 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3021,24 +3021,22 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL)
       .addImm(1);
 
-  // Allocate the ZA buffer
-  Register BufferSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MADDXrrr), BufferSize)
-      .addReg(RDSVL)
-      .addReg(RDSVL)
-      .addReg(AArch64::XZR);
-  Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  Register SPCopy = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SPCopy)
+  Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
       .addReg(AArch64::SP);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STACKALLOC), BufferAddr)
-      .addReg(BufferSize)
-      .addReg(SPCopy);
-  MFI.CreateVariableSizedObject(Align(16), nullptr);
 
-  // expand pseudo in expand pass or remove pseudo and remove stack object
+  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
+  Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
+      .addReg(RDSVL)
+      .addReg(RDSVL)
+      .addReg(SP);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
+      .addReg(MSub);
 
+  // Allocate an additional TPIDR2 object on the stack (16 bytes)
   unsigned TPIDR2Object = TPIDR2->FrameIndex;
+  MFI.CreateVariableSizedObject(Align(16), nullptr);
 
   Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
   MachineInstrBuilder Wzr =
@@ -3047,7 +3045,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
 
   // Store the buffer pointer to the TPIDR2 stack object.
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
-      .addReg(BufferAddr)
+      .addReg(MSub)
       .addFrameIndex(TPIDR2Object)
       .addImm(0);
   // Set the reserved bytes (10-15) to zero
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1aa49b2f1941b..91e5bc3caa102 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1021,11 +1021,7 @@ include "SMEInstrFormats.td"
 //===----------------------------------------------------------------------===//
 
 let hasSideEffects = 1, isCodeGenOnly = 1 in {
-let Defs = [SP] in {
-
-def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>;
-
-let Uses = [SP] in {
+let Defs = [SP], Uses = [SP] in {
 // We set Sched to empty list because we expect these instructions to simply get
 // removed in most cases.
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -1036,7 +1032,6 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                             Sched<[]>;
 
 }
-}
 
 let Defs = [SP, NZCV], Uses = [SP] in {
 // Probed stack allocation of a constant size, used in function prologues when
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index b4325412860fa..60bdbb8c17c2c 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -237,9 +237,8 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    sub x8, x9, x8
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
@@ -277,9 +276,8 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    sub x8, x9, x8
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
@@ -322,8 +320,7 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    mul x8, x8, x8
-; CHECK-COMMON-NEXT:    sub x8, x9, x8
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
@@ -389,8 +386,7 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    mul x8, x8, x8
-; CHECK-COMMON-NEXT:    sub x8, x9, x8
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 6bfe670582e29..db8799020cc14 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -14,8 +14,7 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    rdsvl x9, #1
 ; CHECK-NEXT:    sub x10, x29, #16
@@ -53,8 +52,7 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    rdsvl x20, #1
 ; CHECK-NEXT:    sub x21, x29, #16
@@ -104,8 +102,7 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    rdsvl x9, #1
 ; CHECK-NEXT:    sub x10, x29, #16
@@ -148,8 +145,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    rdsvl x9, #1
 ; CHECK-NEXT:    sub x10, x29, #80
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 03b49c39a4539..46672c364b73d 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -12,8 +12,7 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -47,8 +46,7 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    rdsvl x8, #1
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 686fe1115c275..b62a9ba080e2b 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -21,8 +21,7 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    str x8, [x9]
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index f810054eac831..cbbfb4a7ca7a6 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -39,8 +39,7 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -179,8 +178,7 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]

>From 139fe73755943a16ffce4803291aaaeb4cbaca69 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Wed, 24 Apr 2024 14:58:06 +0100
Subject: [PATCH 11/22] fixup: pass alloc size to ExpandZABuffer

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 41 ++++++++---------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  1 +
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  6 ++-
 .../AArch64/sme-disable-gisel-fisel.ll        | 46 ++++++++++---------
 .../CodeGen/AArch64/sme-lazy-save-call.ll     | 42 -----------------
 .../AArch64/sme-shared-za-interface.ll        | 28 +++++------
 .../AArch64/sme-za-lazy-save-buffer.ll        |  3 +-
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    | 17 +++----
 8 files changed, 75 insertions(+), 109 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f993bb160c036..ba42d3451b1bb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2492,6 +2492,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((AArch64ISD::NodeType)Opcode) {
   case AArch64ISD::FIRST_NUMBER:
     break;
+    MAKE_CASE(AArch64ISD::EXPAND_ZA_BUFFER)
     MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
     MAKE_CASE(AArch64ISD::VG_SAVE)
     MAKE_CASE(AArch64ISD::VG_RESTORE)
@@ -3017,44 +3018,36 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();
 
-  Register RDSVL = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL)
-      .addImm(1);
-
+  // The SUBXrs below won't always be emitted in a form that accepts SP directly
   Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
       .addReg(AArch64::SP);
 
-  // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
-  Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
-      .addReg(RDSVL)
-      .addReg(RDSVL)
-      .addReg(SP);
+  // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
+  Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), BufferAddr)
+      .addReg(SP)
+      .add(MI.getOperand(0))
+      .addImm(0);
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
-      .addReg(MSub);
+      .addReg(BufferAddr);
 
   // Allocate an additional TPIDR2 object on the stack (16 bytes)
   unsigned TPIDR2Object = TPIDR2->FrameIndex;
   MFI.CreateVariableSizedObject(Align(16), nullptr);
 
-  Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
-  MachineInstrBuilder Wzr =
-      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
-          .addReg(AArch64::WZR);
-
   // Store the buffer pointer to the TPIDR2 stack object.
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
-      .addReg(MSub)
+      .addReg(BufferAddr)
       .addFrameIndex(TPIDR2Object)
       .addImm(0);
   // Set the reserved bytes (10-15) to zero
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
-      .addReg(Wzr.getReg(0))
+      .addReg(AArch64::WZR)
       .addFrameIndex(TPIDR2Object)
       .addImm(5);
   BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
-      .addReg(Wzr.getReg(0))
+      .addReg(AArch64::WZR)
       .addFrameIndex(TPIDR2Object)
       .addImm(3);
 
@@ -7520,11 +7513,17 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   // Create a 16 Byte TPIDR2 object. The dynamic buffer
   // will be expanded and stored in the static object later using a pseudonode.
   if (SMEAttrs(MF.getFunction()).hasZAState()) {
-    Chain = SDValue(
-        DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0);
     TPIDR2Object TPIDR2;
     TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
     FuncInfo->setTPIDR2Obj(TPIDR2);
+    SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+                              DAG.getConstant(1, DL, MVT::i32));
+    SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+    SDValue FI = DAG.getFrameIndex(
+        TPIDR2.FrameIndex,
+        DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+    Chain = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL,
+                        DAG.getVTList(MVT::Other), {Chain, Size, FI});
   }
 
   if (CallConv == CallingConv::PreserveNone) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2468564d09223..6d240ee8a6ce2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -457,6 +457,7 @@ enum NodeType : unsigned {
   // SME
   RDSVL,
   REVD_MERGE_PASSTHRU,
+  EXPAND_ZA_BUFFER,
 
   // Asserts that a function argument (i32) is zero-extended to i8 by
   // the caller
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index f4fdc640b9929..0aa5380847eed 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -37,8 +37,12 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
 def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
                               [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
 
+def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<0, 2,
+                              [SDTCisInt<0>, SDTCisPtrTy<1>]>,
+                              [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+
 let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
-  def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {}
+  def ExpandZABuffer : Pseudo<(outs), (ins GPR64:$size, GPR64:$fi), [(AArch64ExpandZABuffer GPR64:$size, GPR64:$fi)]>, Sched<[WriteI]> {}
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 60bdbb8c17c2c..bf763a420e3f6 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -237,8 +237,10 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    mul x8, x8, x8
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    sub x8, x9, x8
 ; CHECK-COMMON-NEXT:    mov sp, x8
 ; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
@@ -276,16 +278,16 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    mul x9, x8, x8
+; CHECK-COMMON-NEXT:    sub x10, x29, #16
+; CHECK-COMMON-NEXT:    mov x11, sp
+; CHECK-COMMON-NEXT:    sub x9, x11, x9
+; CHECK-COMMON-NEXT:    mov sp, x9
+; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
-; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    sub x8, x29, #16
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-COMMON-NEXT:    bl normal_callee
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
@@ -319,16 +321,16 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
+; CHECK-COMMON-NEXT:    mov x10, sp
+; CHECK-COMMON-NEXT:    mul x9, x8, x8
+; CHECK-COMMON-NEXT:    sub x9, x10, x9
+; CHECK-COMMON-NEXT:    sub x10, x29, #16
+; CHECK-COMMON-NEXT:    mov sp, x9
+; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-COMMON-NEXT:    bl __addtf3
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
@@ -385,16 +387,16 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
+; CHECK-COMMON-NEXT:    mov x10, sp
+; CHECK-COMMON-NEXT:    mul x9, x8, x8
+; CHECK-COMMON-NEXT:    sub x9, x10, x9
+; CHECK-COMMON-NEXT:    sub x10, x29, #16
+; CHECK-COMMON-NEXT:    mov sp, x9
+; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-COMMON-NEXT:    bl fmod
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index db8799020cc14..f218d97faf738 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -13,16 +13,6 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    sub x11, x29, #16
-; CHECK-NEXT:    str x8, [x11]
-; CHECK-NEXT:    strh wzr, [x11, #10]
-; CHECK-NEXT:    str wzr, [x11, #12]
-; CHECK-NEXT:    sturh w9, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
@@ -50,18 +40,6 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    rdsvl x20, #1
-; CHECK-NEXT:    sub x21, x29, #16
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    str x8, [x9]
-; CHECK-NEXT:    strh wzr, [x9, #10]
-; CHECK-NEXT:    str wzr, [x9, #12]
-; CHECK-NEXT:    sturh w20, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x21
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -101,16 +79,6 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    sub x11, x29, #16
-; CHECK-NEXT:    str x8, [x11]
-; CHECK-NEXT:    strh wzr, [x11, #10]
-; CHECK-NEXT:    str wzr, [x11, #12]
-; CHECK-NEXT:    sturh w9, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    smstart za
@@ -144,16 +112,6 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    sub x10, x29, #80
-; CHECK-NEXT:    sub x11, x29, #80
-; CHECK-NEXT:    str x8, [x11]
-; CHECK-NEXT:    strh wzr, [x11, #10]
-; CHECK-NEXT:    str wzr, [x11, #12]
-; CHECK-NEXT:    sturh w9, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x20, x0, #0x1
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 46672c364b73d..94cce06cb9903 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -11,16 +11,16 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -45,16 +45,16 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index b62a9ba080e2b..686fe1115c275 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -21,7 +21,8 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    str x8, [x9]
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index cbbfb4a7ca7a6..4de18d114f3e7 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -38,17 +38,17 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    sub x10, x29, #16
+; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    sub x19, x29, #80
+; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    str zt0, [x19]
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart za
@@ -178,7 +178,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mul x8, x8, x8
+; CHECK-NEXT:    sub x8, x9, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]

>From a84d6a83aa1b344d9d3fb83af2528eb7bf0da157 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 25 Apr 2024 09:48:12 +0100
Subject: [PATCH 12/22] fixup: remove __arm_tpidr2_save checking

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ba42d3451b1bb..164588c5ad728 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8786,15 +8786,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     FuncInfo->setTPIDR2Obj(TPIDR2);
   }
 
-  if (std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj()) {
-    if (auto Global = dyn_cast<GlobalAddressSDNode>(Callee)) {
-      if (Global->getGlobal()->getName() == "__arm_tpidr2_save") {
-        TPIDR2->Uses++;
-        FuncInfo->setTPIDR2Obj(*TPIDR2);
-      }
-    }
-  }
-
   if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
     for (unsigned I = 0; I < InVals.size(); ++I) {
       // The smstart/smstop is chained as part of the call, but when the

>From 792c5e0ba53ea609e610dcc7ef57ba38c2962b57 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 25 Apr 2024 16:26:42 +0100
Subject: [PATCH 13/22] fixup: don't pass frame index to SDNode

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  5 +--
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  6 +--
 .../AArch64/sme-disable-gisel-fisel.ll        | 32 +++++---------
 .../CodeGen/AArch64/sme-lazy-save-call.ll     | 44 +++++++++++++++++--
 .../AArch64/sme-shared-za-interface.ll        |  8 ++--
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    | 23 +++-------
 6 files changed, 66 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 164588c5ad728..1d6d958e34fe4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7519,11 +7519,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                               DAG.getConstant(1, DL, MVT::i32));
     SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
-    SDValue FI = DAG.getFrameIndex(
-        TPIDR2.FrameIndex,
-        DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
     Chain = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL,
-                        DAG.getVTList(MVT::Other), {Chain, Size, FI});
+                        DAG.getVTList(MVT::Other), {Chain, Size});
   }
 
   if (CallConv == CallingConv::PreserveNone) {
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 0aa5380847eed..9cb902c5bff61 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -37,12 +37,12 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
 def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
                               [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
 
-def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<0, 2,
-                              [SDTCisInt<0>, SDTCisPtrTy<1>]>,
+def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<0, 1,
+                              [SDTCisInt<0>]>,
                               [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
 
 let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
-  def ExpandZABuffer : Pseudo<(outs), (ins GPR64:$size, GPR64:$fi), [(AArch64ExpandZABuffer GPR64:$size, GPR64:$fi)]>, Sched<[WriteI]> {}
+  def ExpandZABuffer : Pseudo<(outs), (ins GPR64:$size), [(AArch64ExpandZABuffer GPR64:$size)]>, Sched<[WriteI]> {}
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index bf763a420e3f6..d4a6113905ef1 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -232,19 +232,9 @@ declare double @za_shared_callee(double) "aarch64_inout_za"
 define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
 ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
 ; CHECK-COMMON:       // %bb.0: // %prelude
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    mov x29, sp
-; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mul x8, x8, x8
-; CHECK-COMMON-NEXT:    sub x9, x29, #16
-; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    sub x8, x9, x8
-; CHECK-COMMON-NEXT:    mov sp, x8
-; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-COMMON-NEXT:    cbz x8, .LBB6_2
 ; CHECK-COMMON-NEXT:    b .LBB6_1
@@ -260,9 +250,7 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:    fmov d1, x8
 ; CHECK-COMMON-NEXT:    fadd d0, d0, d1
 ; CHECK-COMMON-NEXT:    smstop za
-; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
 entry:
   %call = call double @za_shared_callee(double %x)
@@ -279,15 +267,15 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mul x9, x8, x8
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
-; CHECK-COMMON-NEXT:    mov x11, sp
-; CHECK-COMMON-NEXT:    sub x9, x11, x9
+; CHECK-COMMON-NEXT:    mov x10, sp
+; CHECK-COMMON-NEXT:    sub x9, x10, x9
 ; CHECK-COMMON-NEXT:    mov sp, x9
 ; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    sub x8, x29, #16
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
 ; CHECK-COMMON-NEXT:    bl normal_callee
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
@@ -324,13 +312,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mov x10, sp
 ; CHECK-COMMON-NEXT:    mul x9, x8, x8
 ; CHECK-COMMON-NEXT:    sub x9, x10, x9
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
 ; CHECK-COMMON-NEXT:    mov sp, x9
 ; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl __addtf3
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
@@ -390,13 +378,13 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mov x10, sp
 ; CHECK-COMMON-NEXT:    mul x9, x8, x8
 ; CHECK-COMMON-NEXT:    sub x9, x10, x9
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
 ; CHECK-COMMON-NEXT:    mov sp, x9
 ; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl fmod
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index f218d97faf738..2930ef9e2bdc7 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -13,7 +13,16 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -40,6 +49,17 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x20, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mul x8, x20, x20
+; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    sub x21, x29, #16
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh w20, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x21
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -79,7 +99,16 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -112,7 +141,16 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    mov x10, sp
+; CHECK-NEXT:    mul x9, x8, x8
+; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    sub x9, x29, #80
+; CHECK-NEXT:    sturh wzr, [x29, #-70]
+; CHECK-NEXT:    stur wzr, [x29, #-68]
+; CHECK-NEXT:    sturh w8, [x29, #-72]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x20, x0, #0x1
 ; CHECK-NEXT:    tbz w20, #0, .LBB3_2
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 94cce06cb9903..9eacffd6558ba 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -14,13 +14,13 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    mul x9, x8, x8
 ; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -48,13 +48,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    mul x9, x8, x8
 ; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 4de18d114f3e7..fa12e7cc04407 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -41,14 +41,14 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    mov x10, sp
 ; CHECK-NEXT:    mul x9, x8, x8
 ; CHECK-NEXT:    sub x9, x10, x9
-; CHECK-NEXT:    sub x10, x29, #16
 ; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x19, x29, #80
 ; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sub x19, x29, #80
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    str zt0, [x19]
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart za
@@ -173,21 +173,12 @@ define void @zt0_new_caller() "aarch64_new_zt0" nounwind {
 define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-LABEL: new_za_zt0_caller:
 ; CHECK:       // %bb.0: // %prelude
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
-; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    cbz x8, .LBB7_2
 ; CHECK-NEXT:  // %bb.1: // %save.za
-; CHECK-NEXT:    sub x8, x29, #80
+; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    str zt0, [x8]
 ; CHECK-NEXT:    bl __arm_tpidr2_save
 ; CHECK-NEXT:    ldr zt0, [x8]
@@ -198,8 +189,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-NEXT:    zero { zt0 }
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstop za
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
   ret void;

>From 573c320a808b5a5fbaf24112d5948a1bb158bf03 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 14 May 2024 16:11:53 +0100
Subject: [PATCH 14/22] Go back to two pseudos and use reference for TPIDR2
 object

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 127 +++++++++++-------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   3 +
 .../AArch64/AArch64MachineFunctionInfo.h      |   5 +-
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  17 ++-
 .../AArch64/sme-za-lazy-save-buffer.ll        |  65 +++++++++
 5 files changed, 158 insertions(+), 59 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1d6d958e34fe4..a177758817390 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2493,6 +2493,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::FIRST_NUMBER:
     break;
     MAKE_CASE(AArch64ISD::EXPAND_ZA_BUFFER)
+    MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ)
     MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
     MAKE_CASE(AArch64ISD::VG_SAVE)
     MAKE_CASE(AArch64ISD::VG_RESTORE)
@@ -2992,6 +2993,36 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
+                                            MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+  TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
+  if (TPIDR2.Uses > 0) {
+    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+    // Store the buffer pointer to the TPIDR2 stack object.
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+        .addReg(MI.getOperand(0).getReg())
+        .addFrameIndex(TPIDR2.FrameIndex)
+        .addImm(0);
+    // Set the reserved bytes (10-15) to zero
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+        .addReg(AArch64::WZR)
+        .addFrameIndex(TPIDR2.FrameIndex)
+        .addImm(5);
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+        .addReg(AArch64::WZR)
+        .addFrameIndex(TPIDR2.FrameIndex)
+        .addImm(3);
+  } else
+    MFI.RemoveStackObject(TPIDR2.FrameIndex);
+
+  BB->remove_instr(&MI);
+  return BB;
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
                                           MachineBasicBlock *BB) const {
@@ -3005,52 +3036,35 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
   assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
          "Lazy ZA save is not yet supported on Windows");
 
-  std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj();
-  if (!TPIDR2)
-    llvm_unreachable("Cannot ExpandZABuffer without valid TPIDR2 object");
+  TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
 
-  if (TPIDR2->Uses == 0) {
-    BB->remove_instr(&MI);
-    MFI.RemoveStackObject(TPIDR2->FrameIndex);
-    return BB;
+  if (TPIDR2.Uses > 0) {
+    const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+
+    // The SUBXrs below won't always be emitted in a form that accepts SP
+    // directly
+    Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
+        .addReg(AArch64::SP);
+
+    // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
+    Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), BufferAddr)
+        .addReg(SP)
+        .add(MI.getOperand(1))
+        .addImm(0);
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+            AArch64::SP)
+        .addReg(BufferAddr);
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+            MI.getOperand(0).getReg())
+        .addReg(BufferAddr);
+
+    // We have just allocated a variable sized object, tell this to PEI.
+    MFI.CreateVariableSizedObject(Align(16), nullptr);
   }
 
-  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-
-  // The SUBXrs below won't always be emitted in a form that accepts SP directly
-  Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
-      .addReg(AArch64::SP);
-
-  // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
-  Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), BufferAddr)
-      .addReg(SP)
-      .add(MI.getOperand(0))
-      .addImm(0);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
-      .addReg(BufferAddr);
-
-  // Allocate an additional TPIDR2 object on the stack (16 bytes)
-  unsigned TPIDR2Object = TPIDR2->FrameIndex;
-  MFI.CreateVariableSizedObject(Align(16), nullptr);
-
-  // Store the buffer pointer to the TPIDR2 stack object.
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
-      .addReg(BufferAddr)
-      .addFrameIndex(TPIDR2Object)
-      .addImm(0);
-  // Set the reserved bytes (10-15) to zero
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
-      .addReg(AArch64::WZR)
-      .addFrameIndex(TPIDR2Object)
-      .addImm(5);
-  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
-      .addReg(AArch64::WZR)
-      .addFrameIndex(TPIDR2Object)
-      .addImm(3);
-
   BB->remove_instr(&MI);
   return BB;
 }
@@ -3085,9 +3099,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MI.dump();
 #endif
     llvm_unreachable("Unexpected instruction for custom inserter!");
+  case AArch64::InitTPIDR2Obj:
+    return EmitInitTPIDR2Object(MI, BB);
   case AArch64::ExpandZABuffer:
     return EmitExpandZABuffer(MI, BB);
-
   case AArch64::F128CSEL:
     return EmitF128CSEL(MI, BB);
   case TargetOpcode::STATEPOINT:
@@ -7513,14 +7528,25 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   // Create a 16 Byte TPIDR2 object. The dynamic buffer
   // will be expanded and stored in the static object later using a pseudonode.
   if (SMEAttrs(MF.getFunction()).hasZAState()) {
-    TPIDR2Object TPIDR2;
+    TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
     TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
-    FuncInfo->setTPIDR2Obj(TPIDR2);
     SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                               DAG.getConstant(1, DL, MVT::i32));
     SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
-    Chain = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL,
-                        DAG.getVTList(MVT::Other), {Chain, Size});
+
+    SDValue Buffer;
+    if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
+      Buffer = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL,
+                           DAG.getVTList(MVT::i64, MVT::Other), {Chain, Size});
+    } else {
+      Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
+                           DAG.getVTList(MVT::i64, MVT::Other),
+                           {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+      MFI.CreateVariableSizedObject(Align(1), nullptr);
+    }
+    Chain = DAG.getNode(
+        AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
+        {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
   }
 
   if (CallConv == CallingConv::PreserveNone) {
@@ -8206,7 +8232,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
   if (RequiresLazySave) {
-    const TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
+    const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
     MachinePointerInfo MPI =
         MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
     SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
@@ -8753,7 +8779,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (RequiresLazySave) {
     // Conditionally restore the lazy save using a pseudo node.
-    TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
+    TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
     SDValue RegMask = DAG.getRegisterMask(
         TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
     SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
@@ -8780,7 +8806,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
         DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
         DAG.getConstant(0, DL, MVT::i64));
     TPIDR2.Uses++;
-    FuncInfo->setTPIDR2Obj(TPIDR2);
   }
 
   if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6d240ee8a6ce2..d301ea7bf877b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -458,6 +458,7 @@ enum NodeType : unsigned {
   RDSVL,
   REVD_MERGE_PASSTHRU,
   EXPAND_ZA_BUFFER,
+  INIT_TPIDR2OBJ,
 
   // Asserts that a function argument (i32) is zero-extended to i8 by
   // the caller
@@ -659,6 +660,8 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB,
                                  unsigned Opcode, bool Op0IsDef) const;
   MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI,
+                                          MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;
 
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 354e234b1c363..3da2c9d12c62f 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -201,7 +201,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   bool IsSVECC = false;
 
   /// The frame-index for the TPIDR2 object used for lazy saves.
-  std::optional<TPIDR2Object> TPIDR2;
+  TPIDR2Object TPIDR2;
 
   /// Whether this function changes streaming mode within the function.
   bool HasStreamingModeChanges = false;
@@ -253,8 +253,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   bool isSVECC() const { return IsSVECC; };
   void setIsSVECC(bool s) { IsSVECC = s; };
 
-  std::optional<TPIDR2Object> getTPIDR2Obj() { return TPIDR2; }
-  void setTPIDR2Obj(TPIDR2Object Obj) { TPIDR2 = Obj; }
+  TPIDR2Object &getTPIDR2Obj() { return TPIDR2; }
 
   void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 9cb902c5bff61..4f89c599eff55 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -37,12 +37,19 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
 def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
                               [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
 
-def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<0, 1,
-                              [SDTCisInt<0>]>,
-                              [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
-
+def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<1, 1,
+                              [SDTCisInt<0>, SDTCisInt<1>]>,
+                              [SDNPHasChain, SDNPSideEffect]>;
 let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
-  def ExpandZABuffer : Pseudo<(outs), (ins GPR64:$size), [(AArch64ExpandZABuffer GPR64:$size)]>, Sched<[WriteI]> {}
+  def ExpandZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {}
+}
+def : Pat<(i64 (AArch64ExpandZABuffer GPR64:$size)),
+          (ExpandZABuffer $size)>;
+
+def AArch64InitTPIDR2Obj  : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1,
+                              [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>;
+let usesCustomInserter = 1 in {
+  def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {}
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 686fe1115c275..2a35ea8e88c1f 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -67,4 +67,69 @@ exit:
   ret float %ret
 }
 
+define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" {
+; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    str xzr, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB2_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    ldr xzr, [sp]
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    cbz w0, .LBB2_5
+; CHECK-NEXT:  // %bb.4: // %use_b
+; CHECK-NEXT:    fmov s1, #4.00000000
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    b .LBB2_8
+; CHECK-NEXT:  .LBB2_5: // %use_c
+; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    bl cosf
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB2_7
+; CHECK-NEXT:  // %bb.6: // %use_c
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB2_7: // %use_c
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB2_8: // %exit
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i32 %a, 0
+  br i1 %cmp, label %use_b, label %use_c
+
+use_b:
+  %faddr = fadd float %b, 4.0
+  br label %exit
+
+use_c:
+  %res2 = call float @llvm.cos.f32(float %c)
+  br label %exit
+
+exit:
+  %ret = phi float [%faddr, %use_b], [%res2, %use_c]
+  ret float %ret
+}
+
 declare float @llvm.cos.f32(float)

>From 2841aee11b801c175bbcc74d9f7541fb61b4944c Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 7 Jun 2024 09:44:45 +0100
Subject: [PATCH 15/22] Pass SVL to pseudo and remove unnecessary copy

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a177758817390..f96317ad21a66 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3049,17 +3049,15 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
         .addReg(AArch64::SP);
 
     // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
-    Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
-    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), BufferAddr)
-        .addReg(SP)
-        .add(MI.getOperand(1))
-        .addImm(0);
+    auto Size = MI.getOperand(1).getReg();
+    auto Dest = MI.getOperand(0).getReg();
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
+        .addReg(Size)
+        .addReg(Size)
+        .addReg(SP);
     BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
             AArch64::SP)
-        .addReg(BufferAddr);
-    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
-            MI.getOperand(0).getReg())
-        .addReg(BufferAddr);
+        .addReg(Dest);
 
     // We have just allocated a variable sized object, tell this to PEI.
     MFI.CreateVariableSizedObject(Align(16), nullptr);
@@ -7532,13 +7530,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
     SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
                               DAG.getConstant(1, DL, MVT::i32));
-    SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
 
     SDValue Buffer;
     if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
       Buffer = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL,
-                           DAG.getVTList(MVT::i64, MVT::Other), {Chain, Size});
+                           DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
     } else {
+      SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
       Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
                            DAG.getVTList(MVT::i64, MVT::Other),
                            {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});

>From 2bd66b5783f93f0d1fd151e26d359f4a8f566738 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 7 Jun 2024 09:45:02 +0100
Subject: [PATCH 16/22] Fix stack object alignment

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f96317ad21a66..15e037a1e0c73 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7540,7 +7540,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
                            DAG.getVTList(MVT::i64, MVT::Other),
                            {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
-      MFI.CreateVariableSizedObject(Align(1), nullptr);
+      MFI.CreateVariableSizedObject(Align(16), nullptr);
     }
     Chain = DAG.getNode(
         AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),

>From 2a20f1f9bc99a5d558e45fdc70d0c19b19dec085 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 7 Jun 2024 09:45:09 +0100
Subject: [PATCH 17/22] Make FrameIndex an int

---
 llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 3da2c9d12c62f..001521d1101eb 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -37,7 +37,7 @@ class AArch64Subtarget;
 class MachineInstr;
 
 struct TPIDR2Object {
-  unsigned FrameIndex = std::numeric_limits<int>::max();
+  int FrameIndex = std::numeric_limits<int>::max();
   unsigned Uses = 0;
 };
 

>From 500b48fe949ed92b2ec924546cb0a321f57919b0 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 7 Jun 2024 11:26:11 +0100
Subject: [PATCH 18/22] Rename SDNode

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++--
 llvm/lib/Target/AArch64/AArch64ISelLowering.h   | 2 +-
 llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 15e037a1e0c73..f5de26da73bfd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2492,7 +2492,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((AArch64ISD::NodeType)Opcode) {
   case AArch64ISD::FIRST_NUMBER:
     break;
-    MAKE_CASE(AArch64ISD::EXPAND_ZA_BUFFER)
+    MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER)
     MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ)
     MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
     MAKE_CASE(AArch64ISD::VG_SAVE)
@@ -7533,7 +7533,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
 
     SDValue Buffer;
     if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
-      Buffer = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL,
+      Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
                            DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
     } else {
       SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d301ea7bf877b..20a5ecb2090a8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -457,7 +457,7 @@ enum NodeType : unsigned {
   // SME
   RDSVL,
   REVD_MERGE_PASSTHRU,
-  EXPAND_ZA_BUFFER,
+  ALLOCATE_ZA_BUFFER,
   INIT_TPIDR2OBJ,
 
   // Asserts that a function argument (i32) is zero-extended to i8 by
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 4f89c599eff55..5a393a53f4ccd 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -37,7 +37,7 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
 def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
                               [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
 
-def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<1, 1,
+def AArch64ExpandZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1,
                               [SDTCisInt<0>, SDTCisInt<1>]>,
                               [SDNPHasChain, SDNPSideEffect]>;
 let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {

>From 5c1ac94222d95930280b7ad26f1849eb18a0cc9e Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 7 Jun 2024 11:37:18 +0100
Subject: [PATCH 19/22] Update tests

---
 .../AArch64/sme-disable-gisel-fisel.ll        | 16 ++++++---------
 .../CodeGen/AArch64/sme-lazy-save-call.ll     | 20 ++++++++-----------
 .../AArch64/sme-shared-za-interface.ll        | 10 ++++------
 .../AArch64/sme-za-lazy-save-buffer.ll        |  3 +--
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    |  5 ++---
 5 files changed, 21 insertions(+), 33 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index d4a6113905ef1..42dba22d25708 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -234,7 +234,6 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON:       // %bb.0: // %prelude
 ; CHECK-COMMON-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mul x8, x8, x8
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-COMMON-NEXT:    cbz x8, .LBB6_2
 ; CHECK-COMMON-NEXT:    b .LBB6_1
@@ -266,9 +265,8 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mul x9, x8, x8
-; CHECK-COMMON-NEXT:    mov x10, sp
-; CHECK-COMMON-NEXT:    sub x9, x10, x9
+; CHECK-COMMON-NEXT:    mov x9, sp
+; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x9
 ; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
@@ -309,9 +307,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mov x10, sp
-; CHECK-COMMON-NEXT:    mul x9, x8, x8
-; CHECK-COMMON-NEXT:    sub x9, x10, x9
+; CHECK-COMMON-NEXT:    mov x9, sp
+; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x9
 ; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sub x9, x29, #16
@@ -375,9 +372,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
-; CHECK-COMMON-NEXT:    mov x10, sp
-; CHECK-COMMON-NEXT:    mul x9, x8, x8
-; CHECK-COMMON-NEXT:    sub x9, x10, x9
+; CHECK-COMMON-NEXT:    mov x9, sp
+; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
 ; CHECK-COMMON-NEXT:    mov sp, x9
 ; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sub x9, x29, #16
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 2930ef9e2bdc7..4ade335c254dc 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -13,9 +13,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    mul x9, x8, x8
-; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sub x9, x29, #16
@@ -50,9 +49,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x20, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x20, x20
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    msub x8, x20, x20, x8
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    sub x21, x29, #16
 ; CHECK-NEXT:    stur x8, [x29, #-16]
@@ -99,9 +97,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    mul x9, x8, x8
-; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sub x9, x29, #16
@@ -141,9 +138,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    mul x9, x8, x8
-; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-80]
 ; CHECK-NEXT:    sub x9, x29, #80
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 9eacffd6558ba..393ff3b79aedf 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -11,9 +11,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    mul x9, x8, x8
-; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sub x9, x29, #16
@@ -45,9 +44,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    mul x9, x8, x8
-; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sub x9, x29, #16
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 2a35ea8e88c1f..992cf8722d2b2 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -21,8 +21,7 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    sub x8, x9, x8
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
 ; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    str x8, [x9]
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index fa12e7cc04407..312537630e77a 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -38,9 +38,8 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x10, sp
-; CHECK-NEXT:    mul x9, x8, x8
-; CHECK-NEXT:    sub x9, x10, x9
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sub x9, x29, #16

>From 41d5097bd81277042084d5c190eac6f6e3d26c07 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Wed, 12 Jun 2024 10:53:10 +0100
Subject: [PATCH 20/22] Fix remaining renamings

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 +++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.h   | 2 +-
 llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td  | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f5de26da73bfd..d048a14b6719e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3024,7 +3024,7 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
 }
 
 MachineBasicBlock *
-AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
+AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
                                           MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -3099,8 +3099,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     llvm_unreachable("Unexpected instruction for custom inserter!");
   case AArch64::InitTPIDR2Obj:
     return EmitInitTPIDR2Object(MI, BB);
-  case AArch64::ExpandZABuffer:
-    return EmitExpandZABuffer(MI, BB);
+  case AArch64::AllocateZABuffer:
+    return EmitAllocateZABuffer(MI, BB);
   case AArch64::F128CSEL:
     return EmitF128CSEL(MI, BB);
   case TargetOpcode::STATEPOINT:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 20a5ecb2090a8..3ca8ce9eadf8b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -662,7 +662,7 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI,
                                           MachineBasicBlock *BB) const;
-  MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI,
+  MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 5a393a53f4ccd..adc8e6d3ff877 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -37,14 +37,14 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
 def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
                               [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
 
-def AArch64ExpandZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1,
+def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1,
                               [SDTCisInt<0>, SDTCisInt<1>]>,
                               [SDNPHasChain, SDNPSideEffect]>;
 let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
-  def ExpandZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {}
+  def AllocateZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {}
 }
-def : Pat<(i64 (AArch64ExpandZABuffer GPR64:$size)),
-          (ExpandZABuffer $size)>;
+def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)),
+          (AllocateZABuffer $size)>;
 
 def AArch64InitTPIDR2Obj  : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1,
                               [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>;

>From 95464729f388263f4ccb63d4f24c9fd8d8de2dd4 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 13 Jun 2024 16:04:49 +0100
Subject: [PATCH 21/22] Format

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d048a14b6719e..3fdf9528e27bf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3025,7 +3025,7 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
 
 MachineBasicBlock *
 AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
-                                          MachineBasicBlock *BB) const {
+                                            MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3ca8ce9eadf8b..986f1b67ee513 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -663,7 +663,7 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI,
                                           MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI,
-                                        MachineBasicBlock *BB) const;
+                                          MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,

>From 9a7a6b767ed692a9b0e11fea5049f6beb3f1c342 Mon Sep 17 00:00:00 2001
From: Sam Tebbs <samuel.tebbs at arm.com>
Date: Fri, 14 Jun 2024 12:52:54 +0100
Subject: [PATCH 22/22] rebase

---
 llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll   | 10 +---------
 llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll |  8 ++++----
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
index 29d15f34d680b..8d028c11b4a6b 100644
--- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
+++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -18,7 +18,7 @@ define void @quux() #1 {
 ; CHECK-NEXT:    stp x22, x21, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #400
+; CHECK-NEXT:    sub sp, sp, #384
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    mov x19, sp
 ; CHECK-NEXT:    .cfi_def_cfa w29, 96
@@ -35,14 +35,6 @@ define void @quux() #1 {
 ; CHECK-NEXT:    .cfi_offset w30, -88
 ; CHECK-NEXT:    .cfi_offset w29, -96
 ; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    sub x9, x9, x8
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    str x9, [x19, #384]
-; CHECK-NEXT:    strh w8, [x19, #394]
-; CHECK-NEXT:    str w8, [x19, #396]
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    cbz x8, .LBB0_2
 ; CHECK-NEXT:    b .LBB0_1
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 992cf8722d2b2..ad3f7f5514d0e 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -23,10 +23,9 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    sub x9, x29, #16
-; CHECK-NEXT:    str x8, [x9]
-; CHECK-NEXT:    strh wzr, [x9, #10]
-; CHECK-NEXT:    str wzr, [x9, #12]
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    cbz w0, .LBB1_2
 ; CHECK-NEXT:  // %bb.1: // %use_b
 ; CHECK-NEXT:    fmov s1, #4.00000000
@@ -35,6 +34,7 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
 ; CHECK-NEXT:  .LBB1_2: // %use_c
 ; CHECK-NEXT:    fmov s0, s1
 ; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl cosf



More information about the llvm-commits mailing list