[llvm] [AArch64][SME] Remove unused ZA lazy-save (PR #81648)
Sam Tebbs via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 2 09:10:52 PDT 2024
https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/81648
>From b7f6b9c4cc53bdca6153ed8fed4ec98e5fac768b Mon Sep 17 00:00:00 2001
From: Matt Devereau <matthew.devereau at arm.com>
Date: Mon, 29 Jan 2024 09:49:38 +0000
Subject: [PATCH 1/7] [AArch64][SME] Remove unused ZA lazy-save
This patch removes the TPIDR2 lazy-save object and its dynamically
allocated ZA save buffer when the function never requires a lazy save.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 97 +++++++++++++++++--
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
.../AArch64/AArch64MachineFunctionInfo.h | 11 ++-
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 4 +
.../AArch64/sme-disable-gisel-fisel.ll | 33 ++++---
.../CodeGen/AArch64/sme-lazy-save-call.ll | 50 +++++-----
.../AArch64/sme-shared-za-interface.ll | 26 ++---
.../AArch64/sme-za-lazy-save-buffer.ll | 69 +++++++++++++
llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 77 ++++-----------
9 files changed, 250 insertions(+), 119 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f552f91929201c..f9f702a24916be 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2929,6 +2929,68 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+
+ std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj();
+ if (!TPIDR2)
+ llvm_unreachable("Cannot ExpandZABuffer without valid TPIDR2 object");
+
+ if (TPIDR2->Uses == 0) {
+ BB->remove_instr(&MI);
+ MFI.RemoveStackObject(TPIDR2->Addr);
+ return BB;
+ }
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ Register RDSVL = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL)
+ .addImm(1);
+
+ Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
+ .addReg(AArch64::SP);
+
+ Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
+ .addReg(RDSVL)
+ .addReg(RDSVL)
+ .addReg(SP);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
+ .addReg(MSub);
+
+ uint64_t TPIDR2Object = TPIDR2->Addr;
+
+ MFI.CreateVariableSizedObject(Align(1), nullptr);
+
+ Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ MachineInstrBuilder Wzr =
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
+ .addReg(AArch64::WZR);
+
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+ .addReg(MSub)
+ .addFrameIndex(TPIDR2Object)
+ .addImm(0);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+ .addReg(Wzr.getReg(0))
+ .addFrameIndex(TPIDR2Object)
+ .addImm(5);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+ .addReg(Wzr.getReg(0))
+ .addFrameIndex(TPIDR2Object)
+ .addImm(3);
+
+ BB->remove_instr(&MI);
+ return BB;
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -2959,6 +3021,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
+ case AArch64::ExpandZABuffer:
+ return EmitExpandZABuffer(MI, BB);
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
@@ -7311,10 +7375,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- // Conservatively assume the function requires the lazy-save mechanism.
+ // Create a 16 Byte TPIDR2 object. The dynamic buffer
+ // will be expanded and stored in the static object later using a pseudonode.
if (SMEAttrs(MF.getFunction()).hasZAState()) {
- unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
- FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
+ Chain = SDValue(
+ DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0);
+ TPIDR2Object TPIDR2;
+ TPIDR2.Addr = MFI.CreateStackObject(16, Align(16), false);
+ FuncInfo->setTPIDR2Obj(TPIDR2);
}
return Chain;
@@ -7985,9 +8053,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
if (RequiresLazySave) {
- unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
- SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
+ TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
+ MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.Addr);
+ SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
+ TPIDR2.Addr,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
SDValue NumZaSaveSlicesAddr =
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -8502,7 +8571,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresLazySave) {
// Conditionally restore the lazy save using a pseudo node.
- unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
+ TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
SDValue RegMask = DAG.getRegisterMask(
TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
@@ -8515,7 +8584,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// RESTORE_ZA pseudo.
SDValue Glue;
SDValue TPIDR2Block = DAG.getFrameIndex(
- FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+ TPIDR2.Addr,
+ DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
Result =
DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
@@ -8527,6 +8597,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
DAG.getConstant(0, DL, MVT::i64));
+ TPIDR2.Uses++;
+ FuncInfo->setTPIDR2Obj(TPIDR2);
+ }
+
+ if (std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj()) {
+ if (auto Global = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ if (Global->getGlobal()->getName() == "__arm_tpidr2_save") {
+ TPIDR2->Uses++;
+ FuncInfo->setTPIDR2Obj(*TPIDR2);
+ }
+ }
}
if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3465f3be887543..13bf1a5ee44927 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -642,6 +642,8 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode, bool Op0IsDef) const;
MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d5941e6284111a..b43db1883b92e3 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -35,6 +35,11 @@ struct AArch64FunctionInfo;
class AArch64Subtarget;
class MachineInstr;
+struct TPIDR2Object {
+ uint64_t Addr = 0;
+ uint32_t Uses = 0;
+};
+
/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
/// contains private AArch64-specific information for each MachineFunction.
class AArch64FunctionInfo final : public MachineFunctionInfo {
@@ -195,7 +200,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
bool IsSVECC = false;
/// The frame-index for the TPIDR2 object used for lazy saves.
- Register LazySaveTPIDR2Obj = 0;
+ std::optional<TPIDR2Object> TPIDR2;
/// Whether this function changes streaming mode within the function.
bool HasStreamingModeChanges = false;
@@ -226,8 +231,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
- unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; }
- void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; }
+ std::optional<TPIDR2Object> getTPIDR2Obj() { return TPIDR2; }
+ void setTPIDR2Obj(TPIDR2Object Obj) { TPIDR2 = Obj; }
void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2db0fa25343450..2ad98c3313caee 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -31,6 +31,10 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
+let usesCustomInserter = 1 in {
+ def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {}
+}
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index cd348be5d771d1..694bc6d0bd9377 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -256,11 +256,12 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT: mov sp, x9
-; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x8
+; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
; CHECK-COMMON-NEXT: sub x8, x29, #16
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
@@ -296,14 +297,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT: mov sp, x9
-; CHECK-COMMON-NEXT: sub x10, x29, #16
-; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x8
+; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: sub x9, x29, #16
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
; CHECK-COMMON-NEXT: bl __addtf3
; CHECK-COMMON-NEXT: smstart za
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
@@ -358,14 +360,15 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT: mov sp, x9
-; CHECK-COMMON-NEXT: sub x10, x29, #16
-; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x8
+; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: sub x9, x29, #16
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
; CHECK-COMMON-NEXT: bl fmod
; CHECK-COMMON-NEXT: smstart za
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0b88f191..4eb21ed70dd0fb 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -13,14 +13,15 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -45,14 +46,15 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x19, #1
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: msub x8, x19, x19, x8
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: rdsvl x19, #1
; CHECK-NEXT: sub x20, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w19, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_callee
@@ -93,14 +95,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl cosf
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -131,14 +134,15 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #80
-; CHECK-NEXT: stur wzr, [x29, #-68]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-80]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #80
; CHECK-NEXT: sturh wzr, [x29, #-70]
-; CHECK-NEXT: stur x9, [x29, #-80]
+; CHECK-NEXT: stur wzr, [x29, #-68]
; CHECK-NEXT: sturh w8, [x29, #-72]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB3_2
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index cd7460b177c4bc..46672c364b73d8 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -12,14 +12,15 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -45,14 +46,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl __addtf3
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
new file mode 100644
index 00000000000000..aaf11bf2ba64a6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
+
+define i32 @no_tpidr2_save_required() "aarch64_pstate_za_shared" {
+; CHECK-LABEL: no_tpidr2_save_required:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w0, #42 // =0x2a
+; CHECK-NEXT: ret
+entry:
+ ret i32 42
+}
+
+define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_pstate_za_shared" {
+; CHECK-LABEL: multi_bb_stpidr2_save_required:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: cbz w0, .LBB1_2
+; CHECK-NEXT: // %bb.1: // %use_b
+; CHECK-NEXT: fmov s1, #4.00000000
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: b .LBB1_5
+; CHECK-NEXT: .LBB1_2: // %use_c
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: sturh w8, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: bl cosf
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB1_4
+; CHECK-NEXT: // %bb.3: // %use_c
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB1_4: // %use_c
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB1_5: // %exit
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %cmp = icmp ne i32 %a, 0
+ br i1 %cmp, label %use_b, label %use_c
+
+use_b:
+ %faddr = fadd float %b, 4.0
+ br label %exit
+
+use_c:
+ %res2 = call float @llvm.cos.f32(float %c)
+ br label %exit
+
+exit:
+ %ret = phi float [%faddr, %use_b], [%res2, %use_c]
+ ret float %ret
+}
+
+declare float @llvm.cos.f32(float)
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 7f40b5e7e13446..cbbfb4a7ca7a68 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -39,15 +39,16 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
; CHECK-NEXT: sub sp, sp, #80
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sub x19, x29, #80
-; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: str zt0, [x19]
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart za
@@ -87,24 +88,14 @@ define void @zt0_shared_caller_zt0_shared_callee() "aarch64_in_zt0" nounwind {
define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind {
; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: sub x19, x29, #80
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, sp
; CHECK-NEXT: str zt0, [x19]
; CHECK-NEXT: bl callee
; CHECK-NEXT: ldr zt0, [x19]
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za";
ret void;
@@ -114,19 +105,9 @@ define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64
define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind {
; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: bl callee
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
ret void;
@@ -199,9 +180,9 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x8, x8, x8, x9
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB7_2
; CHECK-NEXT: // %bb.1: // %save.za
@@ -227,20 +208,10 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwind {
; CHECK-LABEL: new_za_shared_zt0_caller:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: zero {za}
; CHECK-NEXT: bl callee
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
ret void;
@@ -250,20 +221,10 @@ define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwi
define void @shared_za_new_zt0() "aarch64_inout_za" "aarch64_new_zt0" nounwind {
; CHECK-LABEL: shared_za_new_zt0:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
-; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: zero { zt0 }
; CHECK-NEXT: bl callee
-; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
ret void;
>From 9afe21e9b499da83b64059afcbbbc3e67a41353c Mon Sep 17 00:00:00 2001
From: Matt Devereau <matthew.devereau at arm.com>
Date: Wed, 14 Feb 2024 17:26:20 +0000
Subject: [PATCH 2/7] Add implicit uses
---
.../Target/AArch64/AArch64ISelLowering.cpp | 59 +++----------------
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 -
.../AArch64/AArch64MachineFunctionInfo.h | 4 +-
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 2 +-
4 files changed, 12 insertions(+), 56 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f9f702a24916be..6f5de406ba668a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2942,7 +2942,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
if (TPIDR2->Uses == 0) {
BB->remove_instr(&MI);
- MFI.RemoveStackObject(TPIDR2->Addr);
+ MFI.RemoveStackObject(TPIDR2->FrameIndex);
return BB;
}
@@ -2965,9 +2965,8 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
.addReg(MSub);
- uint64_t TPIDR2Object = TPIDR2->Addr;
-
- MFI.CreateVariableSizedObject(Align(1), nullptr);
+ unsigned TPIDR2Object = TPIDR2->FrameIndex;
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
MachineInstrBuilder Wzr =
@@ -6918,47 +6917,6 @@ AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
}
}
-
-unsigned
-AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
- SelectionDAG &DAG) const {
- MachineFunction &MF = DAG.getMachineFunction();
- MachineFrameInfo &MFI = MF.getFrameInfo();
-
- // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
- SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
- DAG.getConstant(1, DL, MVT::i32));
- SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
- SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
- SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
- Chain = Buffer.getValue(1);
- MFI.CreateVariableSizedObject(Align(1), nullptr);
-
- // Allocate an additional TPIDR2 object on the stack (16 bytes)
- unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
-
- // Store the buffer pointer to the TPIDR2 stack object.
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
- SDValue Ptr = DAG.getFrameIndex(
- TPIDR2Obj,
- DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
- Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
-
- // Set the reserved bytes (10-15) to zero
- EVT PtrTy = Ptr.getValueType();
- SDValue ReservedPtr =
- DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
- Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
- MPI);
- ReservedPtr =
- DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
- Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
- MPI);
-
- return TPIDR2Obj;
-}
-
static bool isPassedInFPR(EVT VT) {
return VT.isFixedLengthVector() ||
(VT.isFloatingPoint() && !VT.isScalableVector());
@@ -7381,7 +7339,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
Chain = SDValue(
DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0);
TPIDR2Object TPIDR2;
- TPIDR2.Addr = MFI.CreateStackObject(16, Align(16), false);
+ TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
FuncInfo->setTPIDR2Obj(TPIDR2);
}
@@ -8053,10 +8011,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
if (RequiresLazySave) {
- TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.Addr);
+ const TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
- TPIDR2.Addr,
+ TPIDR2.FrameIndex,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
SDValue NumZaSaveSlicesAddr =
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -8584,7 +8543,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// RESTORE_ZA pseudo.
SDValue Glue;
SDValue TPIDR2Block = DAG.getFrameIndex(
- TPIDR2.Addr,
+ TPIDR2.FrameIndex,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
Result =
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 13bf1a5ee44927..0723faf8b4bab3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1012,9 +1012,6 @@ class AArch64TargetLowering : public TargetLowering {
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
- unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
- SelectionDAG &DAG) const;
-
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index b43db1883b92e3..12704a3b5512e7 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -36,8 +36,8 @@ class AArch64Subtarget;
class MachineInstr;
struct TPIDR2Object {
- uint64_t Addr = 0;
- uint32_t Uses = 0;
+ unsigned FrameIndex = 0;
+ unsigned Uses = 0;
};
/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2ad98c3313caee..b635695bdb71a0 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -31,7 +31,7 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in {
def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {}
}
>From 2665899b5f8de238a52ae208899e7a3fab02007e Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 7 Mar 2024 13:45:25 +0000
Subject: [PATCH 3/7] fixup: add comments from lazy save function
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6f5de406ba668a..47772f9dd428cd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2957,6 +2957,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
.addReg(AArch64::SP);
+ // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
.addReg(RDSVL)
@@ -2965,6 +2966,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
.addReg(MSub);
+ // Allocate an additional TPIDR2 object on the stack (16 bytes)
unsigned TPIDR2Object = TPIDR2->FrameIndex;
MFI.CreateVariableSizedObject(Align(16), nullptr);
@@ -2973,10 +2975,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
.addReg(AArch64::WZR);
+ // Store the buffer pointer to the TPIDR2 stack object.
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
.addReg(MSub)
.addFrameIndex(TPIDR2Object)
.addImm(0);
+ // Set the reserved bytes (10-15) to zero
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
.addReg(Wzr.getReg(0))
.addFrameIndex(TPIDR2Object)
>From 1e8a402b13239d2a86251b8fed1c5d5c506a371d Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 7 Mar 2024 13:48:54 +0000
Subject: [PATCH 4/7] fixup: set FrameIndex to max int
---
llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 12704a3b5512e7..8933ab23e42e91 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -36,7 +36,7 @@ class AArch64Subtarget;
class MachineInstr;
struct TPIDR2Object {
- unsigned FrameIndex = 0;
+ unsigned FrameIndex = std::numeric_limits<int>::max();
unsigned Uses = 0;
};
>From a9911291da21de55baa68862d46875712af7842a Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 14 Mar 2024 16:01:56 +0000
Subject: [PATCH 5/7] fixup: add Windows assertion
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 47772f9dd428cd..82f9190983c37e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2935,6 +2935,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
MachineFunction *MF = BB->getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+ // TODO This function grows the stack with a subtraction, which doesn't work
+ // on Windows. Some refactoring to share the functionality in
+ // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
+ // supports SME
+ assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
+ "Lazy ZA save is not yet supported on Windows");
std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj();
if (!TPIDR2)
>From 34f9e9ce4b549b5919e56fd9212afb2b0ff9762f Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Tue, 26 Mar 2024 16:19:59 +0000
Subject: [PATCH 6/7] fixup: lower to STACKALLOC pseudo
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 16 +++++++++++
.../Target/AArch64/AArch64ISelLowering.cpp | 28 ++++++++++---------
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 7 ++++-
.../AArch64/sme-disable-gisel-fisel.ll | 12 +++++---
.../CodeGen/AArch64/sme-lazy-save-call.ll | 12 +++++---
.../AArch64/sme-shared-za-interface.ll | 6 ++--
.../AArch64/sme-za-lazy-save-buffer.ll | 3 +-
llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 6 ++--
8 files changed, 63 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 03f0778bae59d5..69199e81c1c6de 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1121,6 +1121,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
default:
break;
+ case AArch64::STACKALLOC: {
+ Register Dest = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register SPCopy = MI.getOperand(2).getReg();
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest)
+ .addReg(SPCopy)
+ .add(MI.getOperand(1))
+ .addImm(0);
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(Dest)
+ .addImm(0)
+ .addImm(0);
+ MI.eraseFromParent();
+ return true;
+ }
case AArch64::BSPv8i8:
case AArch64::BSPv16i8: {
Register DstReg = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 82f9190983c37e..39d5a26ae485e0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2959,22 +2959,24 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL)
.addImm(1);
- Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
- .addReg(AArch64::SP);
-
- // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
- Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
+ // Allocate the ZA buffer
+ Register BufferSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MADDXrrr), BufferSize)
.addReg(RDSVL)
.addReg(RDSVL)
- .addReg(SP);
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
- .addReg(MSub);
+ .addReg(AArch64::XZR);
+ Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ Register SPCopy = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SPCopy)
+ .addReg(AArch64::SP);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STACKALLOC), BufferAddr)
+ .addReg(BufferSize)
+ .addReg(SPCopy);
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+
+ // expand pseudo in expand pass or remove pseudo and remove stack object
- // Allocate an additional TPIDR2 object on the stack (16 bytes)
unsigned TPIDR2Object = TPIDR2->FrameIndex;
- MFI.CreateVariableSizedObject(Align(16), nullptr);
Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
MachineInstrBuilder Wzr =
@@ -2983,7 +2985,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
// Store the buffer pointer to the TPIDR2 stack object.
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
- .addReg(MSub)
+ .addReg(BufferAddr)
.addFrameIndex(TPIDR2Object)
.addImm(0);
// Set the reserved bytes (10-15) to zero
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b1f514f75207f0..2ad25e041edb16 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -980,7 +980,11 @@ include "SMEInstrFormats.td"
//===----------------------------------------------------------------------===//
let hasSideEffects = 1, isCodeGenOnly = 1 in {
-let Defs = [SP], Uses = [SP] in {
+let Defs = [SP] in {
+
+def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>;
+
+let Uses = [SP] in {
// We set Sched to empty list because we expect these instructions to simply get
// removed in most cases.
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -991,6 +995,7 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
Sched<[]>;
}
+}
let Defs = [SP, NZCV], Uses = [SP] in {
// Probed stack allocation of a constant size, used in function prologues when
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 694bc6d0bd9377..eb2e346873b94a 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -218,8 +218,9 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: mul x8, x8, x8
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: sub x8, x9, x8
; CHECK-COMMON-NEXT: mov sp, x8
; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
@@ -255,8 +256,9 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: mul x8, x8, x8
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: sub x8, x9, x8
; CHECK-COMMON-NEXT: mov sp, x8
; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
@@ -297,7 +299,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: mul x8, x8, x8
+; CHECK-COMMON-NEXT: sub x8, x9, x8
; CHECK-COMMON-NEXT: mov sp, x8
; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -360,7 +363,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: mul x8, x8, x8
+; CHECK-COMMON-NEXT: sub x8, x9, x8
; CHECK-COMMON-NEXT: mov sp, x8
; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
; CHECK-COMMON-NEXT: rdsvl x8, #1
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 4eb21ed70dd0fb..9d24708577c134 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -13,7 +13,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: rdsvl x8, #1
@@ -48,7 +49,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: rdsvl x19, #1
; CHECK-NEXT: sub x20, x29, #16
@@ -95,7 +97,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: rdsvl x8, #1
@@ -134,7 +137,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-80]
; CHECK-NEXT: rdsvl x8, #1
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 46672c364b73d8..03b49c39a4539e 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -12,7 +12,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: rdsvl x8, #1
@@ -46,7 +47,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: rdsvl x8, #1
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index aaf11bf2ba64a6..26bacd72ffa47e 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -21,7 +21,8 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: sturh wzr, [x29, #-6]
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index cbbfb4a7ca7a68..f810054eac8315 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -39,7 +39,8 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
; CHECK-NEXT: sub sp, sp, #80
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: rdsvl x8, #1
@@ -178,7 +179,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
; CHECK-NEXT: sub sp, sp, #80
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mul x8, x8, x8
+; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: stur x8, [x29, #-16]
; CHECK-NEXT: sturh wzr, [x29, #-6]
>From 19a7169671a964b1b5126468d34a9b5731e23e66 Mon Sep 17 00:00:00 2001
From: Samuel Tebbs <samuel.tebbs at arm.com>
Date: Thu, 28 Mar 2024 10:59:50 +0000
Subject: [PATCH 7/7] fixup: lower to STORETPIDR2 pseudo
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 23 +++++++-
.../Target/AArch64/AArch64ISelLowering.cpp | 26 ++--------
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 1 +
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 3 ++
.../AArch64/sme-disable-gisel-fisel.ll | 44 +++++++++-------
.../CodeGen/AArch64/sme-lazy-save-call.ll | 52 ++++++++++---------
.../AArch64/sme-shared-za-interface.ll | 30 ++++++-----
.../AArch64/sme-za-lazy-save-buffer.ll | 8 +--
llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 15 +++---
9 files changed, 110 insertions(+), 92 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 69199e81c1c6de..efa0cb53ca5584 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1121,9 +1121,30 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
default:
break;
+ case AArch64::STORETPIDR2: {
+ Register BufferAddr = MI.getOperand(0).getReg();
+ auto TPIDR2Object = MI.getOperand(1).getReg();
+ unsigned Offset = MI.getOperand(2).getImm();
+ // Store the buffer pointer to the TPIDR2 stack object.
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+ .addReg(BufferAddr)
+ .addUse(TPIDR2Object)
+ .addImm(0 + Offset);
+ // Set the reserved bytes (10-15) to zero
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+ .addReg(AArch64::WZR)
+ .addUse(TPIDR2Object)
+ .addImm(5 + Offset);
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+ .addReg(AArch64::WZR)
+ .addUse(TPIDR2Object)
+ .addImm(3 + Offset);
+ MI.eraseFromParent();
+ return true;
+ }
+
case AArch64::STACKALLOC: {
Register Dest = MI.getOperand(0).getReg();
- Register Src = MI.getOperand(1).getReg();
Register SPCopy = MI.getOperand(2).getReg();
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest)
.addReg(SPCopy)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 39d5a26ae485e0..6569ed90e2551d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2974,30 +2974,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
.addReg(SPCopy);
MFI.CreateVariableSizedObject(Align(16), nullptr);
- // expand pseudo in expand pass or remove pseudo and remove stack object
-
unsigned TPIDR2Object = TPIDR2->FrameIndex;
- Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
- MachineInstrBuilder Wzr =
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
- .addReg(AArch64::WZR);
-
- // Store the buffer pointer to the TPIDR2 stack object.
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
- .addReg(BufferAddr)
- .addFrameIndex(TPIDR2Object)
- .addImm(0);
- // Set the reserved bytes (10-15) to zero
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
- .addReg(Wzr.getReg(0))
- .addFrameIndex(TPIDR2Object)
- .addImm(5);
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
- .addReg(Wzr.getReg(0))
- .addFrameIndex(TPIDR2Object)
- .addImm(3);
-
+ auto MI2 = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STORETPIDR2))
+ .addReg(BufferAddr)
+ .addFrameIndex(TPIDR2Object)
+ .addImm(0);
BB->remove_instr(&MI);
return BB;
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d0c5e6b99e9eec..f3a0c4a7abacb2 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3633,6 +3633,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
+ case AArch64::STORETPIDR2:
Scale = TypeSize::getFixed(8);
Width = TypeSize::getFixed(8);
MinOffset = 0;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2ad25e041edb16..83915697bfe8b3 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -980,6 +980,9 @@ include "SMEInstrFormats.td"
//===----------------------------------------------------------------------===//
let hasSideEffects = 1, isCodeGenOnly = 1 in {
+
+def STORETPIDR2 : Pseudo<(outs), (ins GPR64:$addr, GPR64sp:$frameindex, i32imm:$offset), []>, Sched<[]>;
+
let Defs = [SP] in {
def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>;
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index eb2e346873b94a..6db0c4f9d970cf 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -222,9 +222,10 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
; CHECK-COMMON-NEXT: mov x9, sp
; CHECK-COMMON-NEXT: sub x8, x9, x8
; CHECK-COMMON-NEXT: mov sp, x8
-; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: sub x9, x29, #16
+; CHECK-COMMON-NEXT: str x8, [x9]
+; CHECK-COMMON-NEXT: strh wzr, [x9, #10]
+; CHECK-COMMON-NEXT: str wzr, [x9, #12]
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
; CHECK-COMMON-NEXT: cbz x8, .LBB6_2
; CHECK-COMMON-NEXT: b .LBB6_1
@@ -260,9 +261,10 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: mov x9, sp
; CHECK-COMMON-NEXT: sub x8, x9, x8
; CHECK-COMMON-NEXT: mov sp, x8
-; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: sub x9, x29, #16
+; CHECK-COMMON-NEXT: str x8, [x9]
+; CHECK-COMMON-NEXT: strh wzr, [x9, #10]
+; CHECK-COMMON-NEXT: str wzr, [x9, #12]
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
; CHECK-COMMON-NEXT: sub x8, x29, #16
@@ -302,13 +304,14 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: mul x8, x8, x8
; CHECK-COMMON-NEXT: sub x8, x9, x8
; CHECK-COMMON-NEXT: mov sp, x8
-; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT: rdsvl x8, #1
-; CHECK-COMMON-NEXT: sub x9, x29, #16
-; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
-; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: sub x10, x29, #16
+; CHECK-COMMON-NEXT: sub x11, x29, #16
+; CHECK-COMMON-NEXT: str x8, [x11]
+; CHECK-COMMON-NEXT: strh wzr, [x11, #10]
+; CHECK-COMMON-NEXT: str wzr, [x11, #12]
+; CHECK-COMMON-NEXT: sturh w9, [x29, #-8]
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
; CHECK-COMMON-NEXT: bl __addtf3
; CHECK-COMMON-NEXT: smstart za
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
@@ -366,13 +369,14 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: mul x8, x8, x8
; CHECK-COMMON-NEXT: sub x8, x9, x8
; CHECK-COMMON-NEXT: mov sp, x8
-; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
-; CHECK-COMMON-NEXT: rdsvl x8, #1
-; CHECK-COMMON-NEXT: sub x9, x29, #16
-; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
-; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: sub x10, x29, #16
+; CHECK-COMMON-NEXT: sub x11, x29, #16
+; CHECK-COMMON-NEXT: str x8, [x11]
+; CHECK-COMMON-NEXT: strh wzr, [x11, #10]
+; CHECK-COMMON-NEXT: str wzr, [x11, #12]
+; CHECK-COMMON-NEXT: sturh w9, [x29, #-8]
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
; CHECK-COMMON-NEXT: bl fmod
; CHECK-COMMON-NEXT: smstart za
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d24708577c134..76e3a15d4def9d 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -16,13 +16,14 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-NEXT: mul x8, x8, x8
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: sub x9, x29, #16
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: sub x10, x29, #16
+; CHECK-NEXT: sub x11, x29, #16
+; CHECK-NEXT: str x8, [x11]
+; CHECK-NEXT: strh wzr, [x11, #10]
+; CHECK-NEXT: str wzr, [x11, #12]
+; CHECK-NEXT: sturh w9, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -54,9 +55,10 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: rdsvl x19, #1
; CHECK-NEXT: sub x20, x29, #16
-; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: str x8, [x9]
+; CHECK-NEXT: strh wzr, [x9, #10]
+; CHECK-NEXT: str wzr, [x9, #12]
; CHECK-NEXT: sturh w19, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_callee
@@ -100,13 +102,14 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
; CHECK-NEXT: mul x8, x8, x8
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: sub x9, x29, #16
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: sub x10, x29, #16
+; CHECK-NEXT: sub x11, x29, #16
+; CHECK-NEXT: str x8, [x11]
+; CHECK-NEXT: strh wzr, [x11, #10]
+; CHECK-NEXT: str wzr, [x11, #12]
+; CHECK-NEXT: sturh w9, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl cosf
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -140,13 +143,14 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: mul x8, x8, x8
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-80]
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: sub x9, x29, #80
-; CHECK-NEXT: sturh wzr, [x29, #-70]
-; CHECK-NEXT: stur wzr, [x29, #-68]
-; CHECK-NEXT: sturh w8, [x29, #-72]
-; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: sub x10, x29, #80
+; CHECK-NEXT: sub x11, x29, #80
+; CHECK-NEXT: str x8, [x11]
+; CHECK-NEXT: strh wzr, [x11, #10]
+; CHECK-NEXT: str wzr, [x11, #12]
+; CHECK-NEXT: sturh w9, [x29, #-72]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB3_2
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index 03b49c39a4539e..dcd9dbadc70669 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -15,13 +15,14 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
; CHECK-NEXT: mul x8, x8, x8
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: sub x9, x29, #16
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: sub x10, x29, #16
+; CHECK-NEXT: sub x11, x29, #16
+; CHECK-NEXT: str x8, [x11]
+; CHECK-NEXT: strh wzr, [x11, #10]
+; CHECK-NEXT: str wzr, [x11, #12]
+; CHECK-NEXT: sturh w9, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -50,13 +51,14 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-NEXT: mul x8, x8, x8
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: sub x9, x29, #16
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: sub x10, x29, #16
+; CHECK-NEXT: sub x11, x29, #16
+; CHECK-NEXT: str x8, [x11]
+; CHECK-NEXT: strh wzr, [x11, #10]
+; CHECK-NEXT: str wzr, [x11, #12]
+; CHECK-NEXT: sturh w9, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl __addtf3
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 26bacd72ffa47e..1c672e9fe2bb94 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -24,9 +24,10 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
; CHECK-NEXT: mul x8, x8, x8
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: str x8, [x9]
+; CHECK-NEXT: strh wzr, [x9, #10]
+; CHECK-NEXT: str wzr, [x9, #12]
; CHECK-NEXT: cbz w0, .LBB1_2
; CHECK-NEXT: // %bb.1: // %use_b
; CHECK-NEXT: fmov s1, #4.00000000
@@ -35,7 +36,6 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6
; CHECK-NEXT: .LBB1_2: // %use_c
; CHECK-NEXT: fmov s0, s1
; CHECK-NEXT: rdsvl x8, #1
-; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh w8, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl cosf
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index f810054eac8315..dfd7714b387241 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -42,12 +42,12 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
; CHECK-NEXT: mul x8, x8, x8
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sub x19, x29, #80
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: str x8, [x9]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: strh wzr, [x9, #10]
+; CHECK-NEXT: str wzr, [x9, #12]
; CHECK-NEXT: sturh w8, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: str zt0, [x19]
@@ -182,9 +182,10 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
; CHECK-NEXT: mul x8, x8, x8
; CHECK-NEXT: sub x8, x9, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: str x8, [x9]
+; CHECK-NEXT: strh wzr, [x9, #10]
+; CHECK-NEXT: str wzr, [x9, #12]
; CHECK-NEXT: mrs x8, TPIDR2_EL0
; CHECK-NEXT: cbz x8, .LBB7_2
; CHECK-NEXT: // %bb.1: // %save.za
More information about the llvm-commits mailing list