[llvm] [AArch64][SME] Remove unused ZA lazy-save (PR #81648)

via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 13 10:56:01 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Matthew Devereau (MDevereau)

<details>
<summary>Changes</summary>

This patch removes the 16-byte TPIDR2 lazy-save object and its ZA save buffer from functions with ZA state when no lazy save is required.

---

Patch is 25.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81648.diff


9 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+89-8) 
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+2) 
- (modified) llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h (+8-3) 
- (modified) llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td (+4) 
- (modified) llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll (+18-15) 
- (modified) llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll (+27-23) 
- (modified) llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll (+14-12) 
- (added) llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll (+69) 
- (modified) llvm/test/CodeGen/AArch64/sme-zt0-state.ll (+19-58) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 332fb37655288c..d09fd6a5eb7bf1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2867,6 +2867,68 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
+                                          MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+
+  std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj();
+  if (!TPIDR2)
+    llvm_unreachable("Cannot ExpandZABuffer without valid TPIDR2 object");
+
+  if (TPIDR2->Uses == 0) {
+    BB->remove_instr(&MI);
+    MFI.RemoveStackObject(TPIDR2->Addr);
+    return BB;
+  }
+
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  Register RDSVL = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL)
+      .addImm(1);
+
+  Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
+      .addReg(AArch64::SP);
+
+  Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
+      .addReg(RDSVL)
+      .addReg(RDSVL)
+      .addReg(SP);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
+      .addReg(MSub);
+
+  uint64_t TPIDR2Object = TPIDR2->Addr;
+
+  MFI.CreateVariableSizedObject(Align(1), nullptr);
+
+  Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+  MachineInstrBuilder Wzr =
+      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
+          .addReg(AArch64::WZR);
+
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+      .addReg(MSub)
+      .addFrameIndex(TPIDR2Object)
+      .addImm(0);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+      .addReg(Wzr.getReg(0))
+      .addFrameIndex(TPIDR2Object)
+      .addImm(5);
+  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+      .addReg(Wzr.getReg(0))
+      .addFrameIndex(TPIDR2Object)
+      .addImm(3);
+
+  BB->remove_instr(&MI);
+  return BB;
+}
+
 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB) const {
 
@@ -2897,6 +2959,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MI.dump();
 #endif
     llvm_unreachable("Unexpected instruction for custom inserter!");
+  case AArch64::ExpandZABuffer:
+    return EmitExpandZABuffer(MI, BB);
 
   case AArch64::F128CSEL:
     return EmitF128CSEL(MI, BB);
@@ -7051,10 +7115,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   if (Subtarget->hasCustomCallingConv())
     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
-  // Conservatively assume the function requires the lazy-save mechanism.
+  // Create a 16 Byte TPIDR2 object. The dynamic buffer
+  // will be expanded and stored in the static object later using a pseudonode.
   if (SMEAttrs(MF.getFunction()).hasZAState()) {
-    unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
-    FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
+    Chain = SDValue(
+        DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0);
+    TPIDR2Object TPIDR2;
+    TPIDR2.Addr = MFI.CreateStackObject(16, Align(16), false);
+    FuncInfo->setTPIDR2Obj(TPIDR2);
   }
 
   return Chain;
@@ -7677,9 +7745,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
   if (RequiresLazySave) {
-    unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
-    MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
-    SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
+    TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
+    MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.Addr);
+    SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
+        TPIDR2.Addr,
         DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
     SDValue NumZaSaveSlicesAddr =
         DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -8178,7 +8247,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (RequiresLazySave) {
     // Conditionally restore the lazy save using a pseudo node.
-    unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
+    TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
     SDValue RegMask = DAG.getRegisterMask(
         TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
     SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
@@ -8191,7 +8260,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     // RESTORE_ZA pseudo.
     SDValue Glue;
     SDValue TPIDR2Block = DAG.getFrameIndex(
-        FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+        TPIDR2.Addr,
+        DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
     Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
     Result =
         DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
@@ -8203,6 +8273,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
         ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
         DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
         DAG.getConstant(0, DL, MVT::i64));
+    TPIDR2.Uses++;
+    FuncInfo->setTPIDR2Obj(TPIDR2);
+  }
+
+  if (std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj()) {
+    if (auto Global = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      if (Global->getGlobal()->getName() == "__arm_tpidr2_save") {
+        TPIDR2->Uses++;
+        FuncInfo->setTPIDR2Obj(*TPIDR2);
+      }
+    }
   }
 
   if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6505931e17e18d..66048409f81ab5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -639,6 +639,8 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB,
                                  unsigned Opcode, bool Op0IsDef) const;
   MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI,
+                                        MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d5941e6284111a..b43db1883b92e3 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -35,6 +35,11 @@ struct AArch64FunctionInfo;
 class AArch64Subtarget;
 class MachineInstr;
 
+struct TPIDR2Object {
+  uint64_t Addr = 0;
+  uint32_t Uses = 0;
+};
+
 /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
 /// contains private AArch64-specific information for each MachineFunction.
 class AArch64FunctionInfo final : public MachineFunctionInfo {
@@ -195,7 +200,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   bool IsSVECC = false;
 
   /// The frame-index for the TPIDR2 object used for lazy saves.
-  Register LazySaveTPIDR2Obj = 0;
+  std::optional<TPIDR2Object> TPIDR2;
 
   /// Whether this function changes streaming mode within the function.
   bool HasStreamingModeChanges = false;
@@ -226,8 +231,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   bool isSVECC() const { return IsSVECC; };
   void setIsSVECC(bool s) { IsSVECC = s; };
 
-  unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; }
-  void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; }
+  std::optional<TPIDR2Object> getTPIDR2Obj() { return TPIDR2; }
+  void setTPIDR2Obj(TPIDR2Object Obj) { TPIDR2 = Obj; }
 
   void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index eeae5303a3f898..0ffd709a48b5e9 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -29,6 +29,10 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
                              [SDTCisInt<0>, SDTCisPtrTy<1>]>,
                              [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
 
+let usesCustomInserter = 1 in {
+  def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {}
+}
+
 //===----------------------------------------------------------------------===//
 // Instruction naming conventions.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index e18e18a1cfad18..c72d3ef0258362 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -254,11 +254,12 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x8
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
 ; CHECK-COMMON-NEXT:    sub x8, x29, #16
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
@@ -294,14 +295,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x8
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl __addtf3
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
@@ -356,14 +358,15 @@ define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nou
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
 ; CHECK-COMMON-NEXT:    mov x9, sp
-; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT:    mov sp, x9
-; CHECK-COMMON-NEXT:    sub x10, x29, #16
-; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT:    msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x8
+; CHECK-COMMON-NEXT:    stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    sub x9, x29, #16
 ; CHECK-COMMON-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT:    stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-COMMON-NEXT:    sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-COMMON-NEXT:    bl fmod
 ; CHECK-COMMON-NEXT:    smstart za
 ; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9625e139bd0bc5..ec2e6b44e8af0f 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -13,14 +13,15 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -45,14 +46,15 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" {
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x19, #1
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    msub x8, x19, x19, x8
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    rdsvl x19, #1
 ; CHECK-NEXT:    sub x20, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
-; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w19, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x20
 ; CHECK-NEXT:    bl private_za_callee
@@ -93,14 +95,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_psta
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl cosf
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -131,14 +134,15 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #80
-; CHECK-NEXT:    stur wzr, [x29, #-68]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-80]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #80
 ; CHECK-NEXT:    sturh wzr, [x29, #-70]
-; CHECK-NEXT:    stur x9, [x29, #-80]
+; CHECK-NEXT:    stur wzr, [x29, #-68]
 ; CHECK-NEXT:    sturh w8, [x29, #-72]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB3_2
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index a2e20013d94ff1..99e2edf41e6f4a 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -12,14 +12,15 @@ define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind {
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -45,14 +46,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
-; CHECK-NEXT:    stur x9, [x29, #-16]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x10
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
new file mode 100644
index 00000000000000..aaf11bf2ba64a6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
+
+define i32 @no_tpidr2_save_required() "aarch64_pstate_za_shared" {
+; CHECK-LABEL: no_tpidr2_save_required:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w0, #42 // =0x2a
+; CHECK-NEXT:    ret
+entry:
+  ret i32 42
+}
+
+define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_pstate_za_shared" {
+; CHECK-LABEL: multi_bb_stpidr2_save_required:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    cbz w0, .LBB1_2
+; CHECK-NEXT:  // %bb.1: // %use_b
+; CHECK-NEXT:    fmov s1, #4.00000000
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    b .LBB1_5
+; CHECK-NEXT:  .LBB1_2: // %use_c
+; CHECK-NEXT:    fmov s0, s1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    sub x9, x29, #16
+; CHECK-NEXT:    sturh w8, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x9
+; CHECK-NEXT:    bl cosf
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    sub x0, x29, #16
+; CHECK-NEXT:    cbnz x8, .LBB1_4
+; CHECK-NEXT:  // %bb.3: // %use_c
+; CHECK-NEXT:    bl __arm_tpidr2_restore
+; CHECK-NEXT:  .LBB1_4: // %use_c
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB1_5: // %exit
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  %cmp = icmp ne i32 %a, 0
+  br i1 %cmp, label %use_b, label %use_c
+
+use_b:
+  %faddr = fadd float %b, 4.0
+  br label %exit
+
+use_c:
+  %res2 = call float @llvm.cos.f32(float %c)
+  br label %exit
+
+exit:
+  %ret = phi float [%faddr, %use_b], [%res2, %use_c]
+  ret float %ret
+}
+
+declare float @llvm.cos.f32(float)
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/81648


More information about the llvm-commits mailing list