[llvm] [AArch64][SME] Spill p-regs as z-regs when streaming hazards are possible (PR #123752)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 24 09:54:00 PST 2025


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/123752

>From 14d2b4c88e35d965bbb185a70631ea39f1c27c6e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 13 Jan 2025 16:15:53 +0000
Subject: [PATCH 1/5] [AArch64][SME] Spill p-regs as z-regs when streaming
 hazards are possible

This patch adds a new option `-aarch64-enable-zpr-predicate-spills`
(which is disabled by default). This option replaces predicate spills
with vector spills in streaming[-compatible] functions.

For example:

```
str	p8, [sp, #7, mul vl]            // 2-byte Folded Spill
// ...
ldr	p8, [sp, #7, mul vl]            // 2-byte Folded Reload
```

Becomes:

```
mov	z0.b, p8/z, #1
str	z0, [sp]                        // 16-byte Folded Spill
// ...
ldr	z0, [sp]                        // 16-byte Folded Reload
ptrue	p4.b
cmpne	p8.b, p4/z, z0.b, #0
```

This is done to avoid streaming memory hazards between FPR/vector and
predicate spills, which currently occupy the same stack area even when
the `-aarch64-stack-hazard-size` flag is set.

This is implemented with two new pseudos, SPILL_PPR_TO_ZPR_SLOT_PSEUDO
and FILL_PPR_FROM_ZPR_SLOT_PSEUDO. The expansion of these pseudos
handles scavenging the required registers (z0 in the above example) and,
in the worst case, spilling a register to an emergency stack slot during
the expansion. The condition flags are also preserved around the `cmpne`
in case they are live at the expansion point.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  335 +++++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |   16 +-
 .../Target/AArch64/AArch64RegisterInfo.cpp    |    4 +-
 llvm/lib/Target/AArch64/AArch64RegisterInfo.h |    2 +-
 .../lib/Target/AArch64/AArch64RegisterInfo.td |   11 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |   22 +
 llvm/lib/Target/AArch64/AArch64Subtarget.h    |    2 +
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |   14 +
 .../AArch64/spill-fill-zpr-predicates.mir     | 1035 +++++++++++++++++
 .../AArch64/ssve-stack-hazard-remarks.ll      |   13 +-
 10 files changed, 1444 insertions(+), 10 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index eabe64361938b4..64c3ecaf21ea31 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1630,6 +1630,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
   case AArch64::STR_PXI:
   case AArch64::LDR_ZXI:
   case AArch64::LDR_PXI:
+  case AArch64::PTRUE_B:
+  case AArch64::CPY_ZPzI_B:
+  case AArch64::CMPNE_PPzZI_B:
     return I->getFlag(MachineInstr::FrameSetup) ||
            I->getFlag(MachineInstr::FrameDestroy);
   }
@@ -3261,7 +3264,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
       break;
     case RegPairInfo::PPR:
-      StrOpc = AArch64::STR_PXI;
+      StrOpc =
+          Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
       break;
     case RegPairInfo::VG:
       StrOpc = AArch64::STRXui;
@@ -3490,7 +3494,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
       break;
     case RegPairInfo::PPR:
-      LdrOpc = AArch64::LDR_PXI;
+      LdrOpc = Size == 16 ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO
+                          : AArch64::LDR_PXI;
       break;
     case RegPairInfo::VG:
       continue;
@@ -3716,6 +3721,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       continue;
     }
 
+    // Always save P4 when PPR spills are ZPR-sized and a predicate above p7 is
+    // spilled. If all of p0-p3 are used as return values, p4 must be free
+    // to reload p8-p15.
+    if (RegInfo->getSpillSize(AArch64::PPRRegClass) == 16 &&
+        AArch64::PPR_p8to15RegClass.contains(Reg)) {
+      SavedRegs.set(AArch64::P4);
+    }
+
     // MachO's compact unwind format relies on all registers being stored in
     // pairs.
     // FIXME: the usual format is actually better if unwinding isn't needed.
@@ -4155,8 +4168,318 @@ int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
                                         true);
 }
 
+/// Attempts to scavenge a register from \p ScavengeableRegs given the used
+/// registers in \p UsedRegs.
+static Register tryScavengeRegister(LiveRegUnits const &UsedRegs,
+                                    BitVector const &ScavengeableRegs) {
+  for (auto Reg : ScavengeableRegs.set_bits()) {
+    if (UsedRegs.available(Reg))
+      return Reg;
+  }
+  return AArch64::NoRegister;
+}
+
+/// Propagates frame-setup/destroy flags from \p SourceMI to all instructions in
+/// \p MachineInstrs.
+static void propagateFrameFlags(MachineInstr &SourceMI,
+                                ArrayRef<MachineInstr *> MachineInstrs) {
+  for (MachineInstr *MI : MachineInstrs) {
+    if (SourceMI.getFlag(MachineInstr::FrameSetup))
+      MI->setFlag(MachineInstr::FrameSetup);
+    if (SourceMI.getFlag(MachineInstr::FrameDestroy))
+      MI->setFlag(MachineInstr::FrameDestroy);
+  }
+}
+
+/// RAII helper class for scavenging or spilling a register. On construction
+/// attempts to find a free register of class \p RC (given \p UsedRegs and \p
+/// AllocatableRegs). If no register can be found, spills \p SpillCandidate to
+/// \p MaybeSpillFI to free a register. The freed register is returned via the
+/// \p FreeReg output parameter. On destruction, if there is a spill, its
+/// previous value is reloaded. The spilling and scavenging is only valid at the
+/// insertion point \p MBBI; this class should _not_ be used in places that
+/// create or manipulate basic blocks, moving the expected insertion point.
+struct ScopedScavengeOrSpill {
+  ScopedScavengeOrSpill(const ScopedScavengeOrSpill &) = delete;
+  ScopedScavengeOrSpill(ScopedScavengeOrSpill &&) = delete;
+
+  ScopedScavengeOrSpill(MachineFunction &MF, MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI, Register &FreeReg,
+                        Register SpillCandidate, const TargetRegisterClass &RC,
+                        LiveRegUnits const &UsedRegs,
+                        BitVector const &AllocatableRegs,
+                        std::optional<int> &MaybeSpillFI)
+      : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>(
+                                          *MF.getSubtarget().getInstrInfo())),
+        TRI(*MF.getSubtarget().getRegisterInfo()) {
+    FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs);
+    if (FreeReg != AArch64::NoRegister)
+      return;
+    if (!MaybeSpillFI) {
+      MachineFrameInfo &MFI = MF.getFrameInfo();
+      MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC),
+                                                TRI.getSpillAlign(RC));
+    }
+    FreeReg = SpilledReg = SpillCandidate;
+    SpillFI = *MaybeSpillFI;
+    TII.storeRegToStackSlot(MBB, MBBI, SpilledReg, false, SpillFI, &RC, &TRI,
+                            Register());
+  }
+
+  bool hasSpilled() const { return SpilledReg != AArch64::NoRegister; }
+
+  ~ScopedScavengeOrSpill() {
+    if (hasSpilled())
+      TII.loadRegFromStackSlot(MBB, MBBI, SpilledReg, SpillFI, &RC, &TRI,
+                               Register());
+  }
+
+private:
+  MachineBasicBlock &MBB;
+  MachineBasicBlock::iterator MBBI;
+  const TargetRegisterClass &RC;
+  const AArch64InstrInfo &TII;
+  const TargetRegisterInfo &TRI;
+  Register SpilledReg = AArch64::NoRegister;
+  int SpillFI = -1;
+};
+
+/// Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and
+/// FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+struct EmergencyStackSlots {
+  std::optional<int> ZPRSpillFI;
+  std::optional<int> PPRSpillFI;
+  std::optional<int> GPRSpillFI;
+};
+
+/// Expands:
+/// ```
+/// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
+/// ```
+/// To:
+/// ```
+/// $z0 = CPY_ZPzI_B $p0, 1, 0
+/// STR_ZXI $z0, $stack.0, 0
+/// ```
+/// While ensuring a ZPR ($z0 in this example) is free for the predicate (
+/// spilling if necessary).
+static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
+                                          MachineInstr &MI,
+                                          const TargetRegisterInfo &TRI,
+                                          LiveRegUnits const &UsedRegs,
+                                          BitVector const &ZPRRegs,
+                                          EmergencyStackSlots &SpillSlots) {
+  MachineFunction &MF = *MBB.getParent();
+  auto *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  Register ZPredReg = AArch64::NoRegister;
+  ScopedScavengeOrSpill FindZPRReg(MF, MBB, MachineBasicBlock::iterator(MI),
+                                   ZPredReg, AArch64::Z0, AArch64::ZPRRegClass,
+                                   UsedRegs, ZPRRegs, SpillSlots.ZPRSpillFI);
+
+#ifndef NDEBUG
+  bool InPrologueOrEpilogue = MI.getFlag(MachineInstr::FrameSetup) ||
+                              MI.getFlag(MachineInstr::FrameDestroy);
+  assert((!FindZPRReg.hasSpilled() || !InPrologueOrEpilogue) &&
+         "SPILL_PPR_TO_ZPR_SLOT_PSEUDO expansion should not spill in prologue "
+         "or epilogue");
+#endif
+
+  SmallVector<MachineInstr *, 2> MachineInstrs;
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::CPY_ZPzI_B))
+                              .addReg(ZPredReg, RegState::Define)
+                              .add(MI.getOperand(0))
+                              .addImm(1)
+                              .addImm(0)
+                              .getInstr());
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::STR_ZXI))
+                              .addReg(ZPredReg)
+                              .add(MI.getOperand(1))
+                              .addImm(MI.getOperand(2).getImm())
+                              .setMemRefs(MI.memoperands())
+                              .getInstr());
+  propagateFrameFlags(MI, MachineInstrs);
+}
+
+/// Expands:
+/// ```
+/// $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0
+/// ```
+/// To:
+/// ```
+/// $z0 = LDR_ZXI %stack.0, 0
+/// $p0 = PTRUE_B 31, implicit $vg
+/// $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+/// ```
+/// While ensuring a ZPR ($z0 in this example) is free for the predicate (
+/// spilling if necessary). If the status flags are in use at the point of
+/// expansion they are preserved (by moving them to/from a GPR). This may cause
+/// an additional spill if no GPR is free at the expansion point.
+static bool expandFillPPRFromZPRSlotPseudo(
+    MachineBasicBlock &MBB, MachineInstr &MI, const TargetRegisterInfo &TRI,
+    LiveRegUnits const &UsedRegs, BitVector const &ZPRRegs,
+    BitVector const &PPR3bRegs, BitVector const &GPRRegs,
+    EmergencyStackSlots &SpillSlots) {
+  MachineFunction &MF = *MBB.getParent();
+  auto *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  Register ZPredReg = AArch64::NoRegister;
+  ScopedScavengeOrSpill FindZPRReg(MF, MBB, MachineBasicBlock::iterator(MI),
+                                   ZPredReg, AArch64::Z0, AArch64::ZPRRegClass,
+                                   UsedRegs, ZPRRegs, SpillSlots.ZPRSpillFI);
+
+  Register PredReg = AArch64::NoRegister;
+  std::optional<ScopedScavengeOrSpill> FindPPR3bReg;
+  if (AArch64::PPR_3bRegClass.contains(MI.getOperand(0).getReg()))
+    PredReg = MI.getOperand(0).getReg();
+  else
+    FindPPR3bReg.emplace(MF, MBB, MachineBasicBlock::iterator(MI), PredReg,
+                         AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs,
+                         PPR3bRegs, SpillSlots.PPRSpillFI);
+
+  // Elide NZCV spills if we know it is not used.
+  Register NZCVSaveReg = AArch64::NoRegister;
+  bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV);
+  std::optional<ScopedScavengeOrSpill> FindGPRReg;
+  if (IsNZCVUsed)
+    FindGPRReg.emplace(MF, MBB, MachineBasicBlock::iterator(MI), NZCVSaveReg,
+                       AArch64::X0, AArch64::GPR64RegClass, UsedRegs, GPRRegs,
+                       SpillSlots.GPRSpillFI);
+
+#ifndef NDEBUG
+  bool Spilled = FindZPRReg.hasSpilled() ||
+                 (FindPPR3bReg && FindPPR3bReg->hasSpilled()) ||
+                 (FindGPRReg && FindGPRReg->hasSpilled());
+  bool InPrologueOrEpilogue = MI.getFlag(MachineInstr::FrameSetup) ||
+                              MI.getFlag(MachineInstr::FrameDestroy);
+  assert((!Spilled || !InPrologueOrEpilogue) &&
+         "FILL_PPR_FROM_ZPR_SLOT_PSEUDO expansion should not spill in prologue "
+         "or epilogue");
+#endif
+
+  SmallVector<MachineInstr *, 4> MachineInstrs;
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::LDR_ZXI))
+                              .addReg(ZPredReg, RegState::Define)
+                              .add(MI.getOperand(1))
+                              .addImm(MI.getOperand(2).getImm())
+                              .setMemRefs(MI.memoperands())
+                              .getInstr());
+  if (IsNZCVUsed)
+    MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::MRS))
+                                .addReg(NZCVSaveReg, RegState::Define)
+                                .addImm(AArch64SysReg::NZCV)
+                                .addReg(AArch64::NZCV, RegState::Implicit)
+                                .getInstr());
+  MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::PTRUE_B))
+                              .addReg(PredReg, RegState::Define)
+                              .addImm(31));
+  MachineInstrs.push_back(
+      BuildMI(MBB, MI, DL, TII->get(AArch64::CMPNE_PPzZI_B))
+          .addReg(MI.getOperand(0).getReg(), RegState::Define)
+          .addReg(PredReg)
+          .addReg(ZPredReg)
+          .addImm(0)
+          .addReg(AArch64::NZCV, RegState::ImplicitDefine)
+          .getInstr());
+  if (IsNZCVUsed)
+    MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::MSR))
+                                .addImm(AArch64SysReg::NZCV)
+                                .addReg(NZCVSaveReg)
+                                .addReg(AArch64::NZCV, RegState::ImplicitDefine)
+                                .getInstr());
+
+  propagateFrameFlags(MI, MachineInstrs);
+  return FindPPR3bReg && FindPPR3bReg->hasSpilled();
+}
+
+/// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
+/// operations within the MachineBasicBlock \p MBB.
+static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
+                                          const TargetRegisterInfo &TRI,
+                                          BitVector const &ZPRRegs,
+                                          BitVector const &PPR3bRegs,
+                                          BitVector const &GPRRegs,
+                                          EmergencyStackSlots &SpillSlots) {
+  LiveRegUnits UsedRegs(TRI);
+  UsedRegs.addLiveOuts(MBB);
+  bool HasPPRSpills = false;
+  for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
+    UsedRegs.stepBackward(MI);
+    switch (MI.getOpcode()) {
+    case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
+      HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(
+          MBB, MI, TRI, UsedRegs, ZPRRegs, PPR3bRegs, GPRRegs, SpillSlots);
+      MI.eraseFromParent();
+      break;
+    case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+      expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, ZPRRegs,
+                                    SpillSlots);
+      MI.eraseFromParent();
+      break;
+    default:
+      break;
+    }
+  }
+
+  return HasPPRSpills;
+}
+
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
+
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  const TargetSubtargetInfo &TSI = MF.getSubtarget();
+  const TargetRegisterInfo &TRI = *TSI.getRegisterInfo();
+  if (AFI->hasStackFrame() && TRI.getSpillSize(AArch64::PPRRegClass) == 16) {
+    const uint32_t *CSRMask =
+        TRI.getCallPreservedMask(MF, MF.getFunction().getCallingConv());
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+    assert(MFI.isCalleeSavedInfoValid());
+
+    auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
+      BitVector ScavengeableRegs =
+          TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
+      if (CSRMask)
+        ScavengeableRegs.clearBitsInMask(CSRMask);
+      // TODO: Allow reusing callee-saved registers that have been saved.
+      return ScavengeableRegs;
+    };
+
+    // If predicate spills are 16 bytes we may need to expand
+    // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+    // These are handled separately as we need to compute register liveness to
+    // scavenge a ZPR and PPR during the expansion.
+    BitVector ZPRRegs = ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
+    // Only p0-7 are possible as the second operand of cmpne (needed for fills).
+    BitVector PPR3bRegs =
+        ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID);
+    BitVector GPRRegs = ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
+
+    bool SpillsAboveP7 =
+        any_of(MFI.getCalleeSavedInfo(), [](const CalleeSavedInfo &CSI) {
+          return AArch64::PPR_p8to15RegClass.contains(CSI.getReg());
+        });
+    // We spill p4 in determineCalleeSaves() if a predicate above p7 is spilled,
+    // as it may be needed to reload callee saves (if p0-p3 are used as
+    // returns).
+    if (SpillsAboveP7)
+      PPR3bRegs.set(AArch64::P4);
+
+    EmergencyStackSlots SpillSlots;
+    for (MachineBasicBlock &MBB : MF) {
+      for (int Pass = 0; Pass < 2; Pass++) {
+        bool HasPPRSpills = expandSMEPPRToZPRSpillPseudos(
+            MBB, TRI, ZPRRegs, PPR3bRegs, GPRRegs, SpillSlots);
+        if (!HasPPRSpills)
+          break;
+      }
+    }
+  }
+
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
   assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
@@ -4166,7 +4489,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
   int64_t SVEStackSize =
       assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
 
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
   AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
 
@@ -5200,7 +5522,12 @@ void AArch64FrameLowering::emitRemarks(
 
           unsigned RegTy = StackAccess::AccessType::GPR;
           if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
-            if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
+            // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
+            // spill/fill the predicate as a data vector (so are an FPR access).
+            if (!is_contained({AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO,
+                               AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO},
+                              MI.getOpcode()) &&
+                AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
               RegTy = StackAccess::PPR;
             else
               RegTy = StackAccess::FPR;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 17dd8a073eff0f..0f2b969fba35c7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -81,7 +81,7 @@ static cl::opt<unsigned>
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                           AArch64::CATCHRET),
-      RI(STI.getTargetTriple()), Subtarget(STI) {}
+      RI(STI.getTargetTriple(), STI.getHwMode()), Subtarget(STI) {}
 
 /// GetInstSize - Return the number of bytes of code the specified
 /// instruction may be.  This returns the maximum number of bytes.
@@ -2438,6 +2438,8 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
   case AArch64::STZ2Gi:
   case AArch64::STZGi:
   case AArch64::TAGPstack:
+  case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+  case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
     return 2;
   case AArch64::LD1B_D_IMM:
   case AArch64::LD1B_H_IMM:
@@ -4223,6 +4225,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
     MinOffset = -256;
     MaxOffset = 254;
     break;
+  case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
+  case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
   case AArch64::LDR_ZXI:
   case AArch64::STR_ZXI:
     Scale = TypeSize::getScalable(16);
@@ -5355,6 +5359,11 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
              "Unexpected register store without SVE store instructions");
       Opc = AArch64::STR_ZXI;
       StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected predicate store without SVE store instructions");
+      Opc = AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO;
+      StackID = TargetStackID::ScalableVector;
     }
     break;
   case 24:
@@ -5527,6 +5536,11 @@ void AArch64InstrInfo::loadRegFromStackSlot(
              "Unexpected register load without SVE load instructions");
       Opc = AArch64::LDR_ZXI;
       StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected predicate load without SVE load instructions");
+      Opc = AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO;
+      StackID = TargetStackID::ScalableVector;
     }
     break;
   case 24:
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 5973b63b5a8024..e9730348ba58e5 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -38,8 +38,8 @@ using namespace llvm;
 #define GET_REGINFO_TARGET_DESC
 #include "AArch64GenRegisterInfo.inc"
 
-AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
-    : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {
+AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT, unsigned HwMode)
+    : AArch64GenRegisterInfo(AArch64::LR, 0, 0, 0, HwMode), TT(TT) {
   AArch64_MC::initLLVMToCVRegMapping(this);
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 11da624af4881b..898a509f75908f 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -27,7 +27,7 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
   const Triple &TT;
 
 public:
-  AArch64RegisterInfo(const Triple &TT);
+  AArch64RegisterInfo(const Triple &TT, unsigned HwMode);
 
   // FIXME: This should be tablegen'd like getDwarfRegNum is
   int getSEHRegNum(unsigned i) const {
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index dd4f2549929f84..6b6884c5457589 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -979,10 +979,19 @@ class ZPRRegOp <string Suffix, AsmOperandClass C, ElementSizeEnum Size,
 //******************************************************************************
 
 // SVE predicate register classes.
+
+// Note: This hardware mode is enabled in AArch64Subtarget::getHwModeSet()
+// (without the use of the table-gen'd predicates).
+def SMEWithStreamingMemoryHazards : HwMode<"", [Predicate<"false">]>;
+
+def PPRSpillFillRI : RegInfoByHwMode<
+      [DefaultMode,              SMEWithStreamingMemoryHazards],
+      [RegInfo<16,16,16>,        RegInfo<16,128,128>]>;
+
 class PPRClass<int firstreg, int lastreg, int step = 1> : RegisterClass<"AArch64",
                                   [ nxv16i1, nxv8i1, nxv4i1, nxv2i1, nxv1i1 ], 16,
                                   (sequence "P%u", firstreg, lastreg, step)> {
-  let Size = 16;
+  let RegInfos = PPRSpillFillRI;
 }
 
 def PPR    : PPRClass<0, 15> {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index bc921f07e1dbf8..5864f57582e21c 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -86,6 +86,11 @@ static cl::alias AArch64StreamingStackHazardSize(
     cl::desc("alias for -aarch64-streaming-hazard-size"),
     cl::aliasopt(AArch64StreamingHazardSize));
 
+static cl::opt<bool> EnableZPRPredicateSpills(
+    "aarch64-enable-zpr-predicate-spills", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Enables spilling/reloading SVE predicates as data vectors (ZPRs)"));
+
 // Subreg liveness tracking is disabled by default for now until all issues
 // are ironed out. This option allows the feature to be used in tests.
 static cl::opt<bool>
@@ -400,6 +405,23 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
   EnableSubregLiveness = EnableSubregLivenessTracking.getValue();
 }
 
+unsigned AArch64Subtarget::getHwModeSet() const {
+  unsigned Modes = 0;
+
+  // Use a special hardware mode in streaming functions with stack hazards.
+  // This changes the spill size (and alignment) for the predicate register
+  // class.
+  //
+  // FIXME: This overrides the table-gen'd `getHwModeSet()` which only looks at
+  // CPU features.
+  if (EnableZPRPredicateSpills.getValue() &&
+      (isStreaming() || isStreamingCompatible())) {
+    Modes |= (1 << 0);
+  }
+
+  return Modes;
+}
+
 const CallLowering *AArch64Subtarget::getCallLowering() const {
   return CallLoweringInfo.get();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index d22991224d496d..e7757907a66434 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -130,6 +130,8 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
                    bool IsStreaming = false, bool IsStreamingCompatible = false,
                    bool HasMinSize = false);
 
+  virtual unsigned getHwModeSet() const override;
+
 // Getters for SubtargetFeatures defined in tablegen
 #define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER)                    \
   bool GETTER() const { return ATTRIBUTE; }
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 81004e70dc179b..e1b34dfc895262 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -59,6 +59,20 @@ def FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO :
   let hasPostISelHook = 1;
 }
 
+def SPILL_PPR_TO_ZPR_SLOT_PSEUDO :
+  Pseudo<(outs), (ins PPRorPNRAny:$Pt, GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]>
+{
+  let mayStore = 1;
+  let hasSideEffects = 0;
+}
+
+def FILL_PPR_FROM_ZPR_SLOT_PSEUDO :
+  Pseudo<(outs PPRorPNRAny:$Pt), (ins GPR64sp:$Rn, simm9:$imm9), []>, Sched<[]>
+{
+  let mayLoad = 1;
+  let hasSideEffects = 0;
+}
+
 def SDTZALoadStore : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>;
 def AArch64SMELdr : SDNode<"AArch64ISD::SME_ZA_LDR", SDTZALoadStore,
                              [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>;
diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
new file mode 100644
index 00000000000000..a432a61384e42a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
@@ -0,0 +1,1035 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-zpr-predicate-spills -run-pass=greedy %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-zpr-predicate-spills -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND
+--- |
+  source_filename = "<stdin>"
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--linux-gnu"
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill__save_restore_nzcv() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill__save_restore_nzcv__spill_gpr() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill__spill_zpr() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill_above_p7() #0 { entry: unreachable }
+
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill_p4_saved() #0 { entry: unreachable }
+
+  attributes #0 = {nounwind "target-features"="+sme,+sve" "aarch64_pstate_sm_compatible"}
+...
+---
+name: zpr_predicate_spill
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+body:             |
+  bb.0.entry:
+    liveins: $p0
+
+    ; CHECK-LABEL: name: zpr_predicate_spill
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill
+    ; EXPAND: liveins: $p0, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.13)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
+    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.13)
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0
+    %1:ppr = COPY $p0
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p0 = COPY %1
+
+    RET_ReallyLR implicit $p0
+...
+---
+name: zpr_predicate_spill__save_restore_nzcv
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+body:             |
+  bb.0.entry:
+    liveins: $p0
+
+    ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: FAKE_USE implicit $nzcv
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill__save_restore_nzcv
+    ; EXPAND: liveins: $p0, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.13)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+    ;
+    ; EXPAND-NEXT: $nzcv = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $x0 = MRS 55824, implicit-def $nzcv, implicit $nzcv
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: MSR 55824, $x0, implicit-def $nzcv
+    ;
+    ; EXPAND-NEXT: FAKE_USE implicit $nzcv
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
+    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.13)
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0
+    $nzcv = IMPLICIT_DEF
+
+    %1:ppr = COPY $p0
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p0 = COPY %1
+
+    FAKE_USE implicit $nzcv
+
+    RET_ReallyLR implicit $p0
+...
+---
+name: zpr_predicate_spill__save_restore_nzcv__spill_gpr
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+  - { reg: '$x0' }
+  - { reg: '$x1' }
+  - { reg: '$x2' }
+  - { reg: '$x3' }
+  - { reg: '$x4' }
+  - { reg: '$x5' }
+  - { reg: '$x6' }
+  - { reg: '$x7' }
+body:             |
+  bb.0.entry:
+    liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
+
+    ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv__spill_gpr
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+    ; CHECK-NEXT: $x8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x15 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x16 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x17 = IMPLICIT_DEF
+    ; CHECK-NEXT: $x18 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill__save_restore_nzcv__spill_gpr
+    ; EXPAND: liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.13)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+    ;
+    ; EXPAND-NEXT: $nzcv = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x15 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x16 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x17 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $x18 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $fp = ADDVL_XXI $sp, 13, implicit $vg
+    ; EXPAND-NEXT: STRXui $x0, killed $fp, 1 :: (store (s64) into %stack.14)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $x0 = MRS 55824, implicit-def $nzcv, implicit $nzcv
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: MSR 55824, $x0, implicit-def $nzcv
+    ; EXPAND-NEXT: $fp = ADDVL_XXI $sp, 13, implicit $vg
+    ; EXPAND-NEXT: $x0 = LDRXui killed $fp, 1 :: (load (s64) from %stack.14)
+    ;
+    ; EXPAND-NEXT: FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
+    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.13)
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+    $nzcv = IMPLICIT_DEF
+    $x8 = IMPLICIT_DEF
+    $x9 = IMPLICIT_DEF
+    $x10 = IMPLICIT_DEF
+    $x11 = IMPLICIT_DEF
+    $x12 = IMPLICIT_DEF
+    $x13 = IMPLICIT_DEF
+    $x14 = IMPLICIT_DEF
+    $x15 = IMPLICIT_DEF
+    $x16 = IMPLICIT_DEF
+    $x17 = IMPLICIT_DEF
+    $x18 = IMPLICIT_DEF
+
+    %1:ppr = COPY $p0
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p0 = COPY %1
+
+    FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+
+    RET_ReallyLR implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+...
+---
+name: zpr_predicate_spill__spill_zpr
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+  - { reg: '$z0' }
+  - { reg: '$z1' }
+  - { reg: '$z2' }
+  - { reg: '$z3' }
+  - { reg: '$z4' }
+  - { reg: '$z5' }
+  - { reg: '$z6' }
+  - { reg: '$z7' }
+body:             |
+  bb.0.entry:
+    liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+
+    ; CHECK-LABEL: name: zpr_predicate_spill__spill_zpr
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $z16 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z17 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z18 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z19 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z20 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z21 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z22 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z23 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z24 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z25 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z26 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z27 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z28 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z29 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z30 = IMPLICIT_DEF
+    ; CHECK-NEXT: $z31 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: FAKE_USE implicit $z16, implicit $z17, implicit $z18, implicit $z19, implicit $z20, implicit $z21, implicit $z22, implicit $z23, implicit $z24, implicit $z25, implicit $z26, implicit $z27, implicit $z28, implicit $z29, implicit $z30, implicit $z31
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill__spill_zpr
+    ; EXPAND: liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4, $z23, $z22, $z21, $z20, $z19, $z18, $z17, $z16
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.21)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -20, implicit $vg
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 0 :: (store (s128) into %stack.20)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 1 :: (store (s128) into %stack.19)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 2 :: (store (s128) into %stack.18)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 3 :: (store (s128) into %stack.17)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 4 :: (store (s128) into %stack.16)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 5 :: (store (s128) into %stack.15)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 6 :: (store (s128) into %stack.14)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 7 :: (store (s128) into %stack.13)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 8 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 9 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 10 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 11 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z23, $sp, 12 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z22, $sp, 13 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z21, $sp, 14 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z20, $sp, 15 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z19, $sp, 16 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z18, $sp, 17 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z17, $sp, 18 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z16, $sp, 19 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+    ;
+    ; EXPAND-NEXT: $z16 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z17 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z18 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z19 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z20 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z21 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z22 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z23 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z24 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z25 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z26 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z27 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z28 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z29 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z30 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $z31 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.22)
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.22)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.22)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.22)
+    ;
+    ; EXPAND-NEXT: FAKE_USE implicit $z16, implicit $z17, implicit $z18, implicit $z19, implicit $z20, implicit $z21, implicit $z22, implicit $z23, implicit $z24, implicit $z25, implicit $z26, implicit $z27, implicit $z28, implicit $z29, implicit $z30, implicit $z31
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+    ; EXPAND-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 12 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 13 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $z21 = frame-destroy LDR_ZXI $sp, 14 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $z20 = frame-destroy LDR_ZXI $sp, 15 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $z19 = frame-destroy LDR_ZXI $sp, 16 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $z18 = frame-destroy LDR_ZXI $sp, 17 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $z17 = frame-destroy LDR_ZXI $sp, 18 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $z16 = frame-destroy LDR_ZXI $sp, 19 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.20)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.19)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.18)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.17)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.16)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.15)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.14)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.13)
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 20, implicit $vg
+    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.21)
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7
+    $z16 = IMPLICIT_DEF
+    $z17 = IMPLICIT_DEF
+    $z18 = IMPLICIT_DEF
+    $z19 = IMPLICIT_DEF
+    $z20 = IMPLICIT_DEF
+    $z21 = IMPLICIT_DEF
+    $z22 = IMPLICIT_DEF
+    $z23 = IMPLICIT_DEF
+    $z24 = IMPLICIT_DEF
+    $z25 = IMPLICIT_DEF
+    $z26 = IMPLICIT_DEF
+    $z27 = IMPLICIT_DEF
+    $z28 = IMPLICIT_DEF
+    $z29 = IMPLICIT_DEF
+    $z30 = IMPLICIT_DEF
+    $z31 = IMPLICIT_DEF
+
+    %1:ppr = COPY $p0
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p0 = COPY %1
+
+    FAKE_USE implicit $z16, implicit $z17, implicit $z18, implicit $z19, implicit $z20, implicit $z21, implicit $z22, implicit $z23, implicit $z24, implicit $z25, implicit $z26, implicit $z27, implicit $z28, implicit $z29, implicit $z30, implicit $z31
+
+    RET_ReallyLR implicit $p0, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7
+...
+---
+name: zpr_predicate_spill_above_p7
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+  - { reg: '$p1' }
+  - { reg: '$p2' }
+  - { reg: '$p3' }
+body:             |
+  bb.0.entry:
+    liveins: $p0, $p1, $p2, $p3
+
+    ; CHECK-LABEL: name: zpr_predicate_spill_above_p7
+    ; CHECK: stack:
+    ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
+    ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
+    ; CHECK: liveins: $p0, $p1, $p2, $p3
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p15, %stack.0, 0 :: (store (s128) into %stack.0)
+    ;
+    ; CHECK-NEXT: $p0 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p1 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p2 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p3 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p4 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p5 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p6 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p7 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p9 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p10 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p11 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p12 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p13 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p14 = IMPLICIT_DEF
+    ; CHECK-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: $p15 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
+    ; CHECK-NEXT: FAKE_USE implicit $p4, implicit $p5, implicit $p6, implicit $p7
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill_above_p7
+    ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ;
+    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.13)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+    ;
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p15, 1, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p2 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p3 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p4 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p5 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p6 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p7 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p9 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p10 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p11 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p12 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p13 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
+    ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.14)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.14)
+    ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ;
+    ; EXPAND-NEXT: FAKE_USE implicit $p4, implicit $p5, implicit $p6, implicit $p7
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
+    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.13)
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
+    $p15 = IMPLICIT_DEF
+    %1:ppr = COPY $p15
+
+    $p0 = IMPLICIT_DEF
+    $p1 = IMPLICIT_DEF
+    $p2 = IMPLICIT_DEF
+    $p3 = IMPLICIT_DEF
+    $p4 = IMPLICIT_DEF
+    $p5 = IMPLICIT_DEF
+    $p6 = IMPLICIT_DEF
+    $p7 = IMPLICIT_DEF
+    $p8 = IMPLICIT_DEF
+    $p9 = IMPLICIT_DEF
+    $p10 = IMPLICIT_DEF
+    $p11 = IMPLICIT_DEF
+    $p12 = IMPLICIT_DEF
+    $p13 = IMPLICIT_DEF
+    $p14 = IMPLICIT_DEF
+    $p15 = IMPLICIT_DEF
+
+    $p15 = COPY %1
+
+    FAKE_USE implicit $p4, implicit $p5, implicit $p6, implicit $p7
+
+    RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+...
+---
+name: zpr_predicate_spill_p4_saved
+tracksRegLiveness: true
+stack:
+liveins:
+  - { reg: '$p0' }
+  - { reg: '$p1' }
+  - { reg: '$p2' }
+  - { reg: '$p3' }
+body:             |
+  bb.0.entry:
+    liveins: $p0, $p1, $p2, $p3
+
+    ; CHECK-LABEL: name: zpr_predicate_spill_p4_saved
+    ; CHECK: liveins: $p0, $p1, $p2, $p3
+    ; CHECK-NEXT: {{  $}}
+    ;
+    ; CHECK-NEXT: $p8 = IMPLICIT_DEF
+    ;
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+
+    ; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved
+    ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4
+    ; EXPAND-NEXT: {{  $}}
+    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
+    ;
+    ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
+    ;
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
+
+    ; If we spill a predicate register at or above p8, p4 must also be saved, so
+    ; there is guaranteed to be a register in the range p0-p7 for the cmpne reload.
+    $p8 = IMPLICIT_DEF
+
+    RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+...
diff --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
index 0b6bf3892a0c2b..c67d91952c6188 100644
--- a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-remark-size=64 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-size=1024 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-PADDING
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-enable-zpr-predicate-spills -aarch64-stack-hazard-remark-size=64 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-ZPR-PRED-SPILLS
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-enable-zpr-predicate-spills -aarch64-stack-hazard-size=1024 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-ZPR-PRED-SPILLS-WITH-PADDING
 
 ; Don't emit remarks for non-streaming functions.
 define float @csr_x20_stackargs_notsc(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) {
@@ -66,13 +68,18 @@ entry:
 }
 
 ; SVE calling conventions
-; Predicate register spills end up in FP region, currently.
+; Predicate register spills end up in FP region, currently. This can be
+; mitigated with the -aarch64-enable-zpr-predicate-spills option.
 
 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 {
 ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale]
 ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
 ; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale]
 ; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_call':
+; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
+; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
+; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
+; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at {{.*}} is too close to GPR stack object
 entry:
   tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
   %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
@@ -84,6 +91,10 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
 ; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale]
 ; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call':
+; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at {{.*}} is too close to FPR stack object
+; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
+; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at {{.*}} is too close to FPR stack object
+; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at {{.*}} is too close to GPR stack object
 entry:
   tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
   %0 = alloca [37 x i8], align 16

>From 2a74bc6aac8d413b2d305a634a314da2aefd0ee1 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 23 Jan 2025 16:45:52 +0000
Subject: [PATCH 2/5] Fixups

---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  27 +-
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |   2 +-
 .../AArch64/spill-fill-zpr-predicates.mir     | 388 ++++++++++--------
 3 files changed, 223 insertions(+), 194 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 64c3ecaf21ea31..a2eacc69aee71b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4332,13 +4332,9 @@ static bool expandFillPPRFromZPRSlotPseudo(
                                    UsedRegs, ZPRRegs, SpillSlots.ZPRSpillFI);
 
   Register PredReg = AArch64::NoRegister;
-  std::optional<ScopedScavengeOrSpill> FindPPR3bReg;
-  if (AArch64::PPR_3bRegClass.contains(MI.getOperand(0).getReg()))
-    PredReg = MI.getOperand(0).getReg();
-  else
-    FindPPR3bReg.emplace(MF, MBB, MachineBasicBlock::iterator(MI), PredReg,
-                         AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs,
-                         PPR3bRegs, SpillSlots.PPRSpillFI);
+  ScopedScavengeOrSpill FindPPR3bReg(
+      MF, MBB, MachineBasicBlock::iterator(MI), PredReg, AArch64::P0,
+      AArch64::PPR_3bRegClass, UsedRegs, PPR3bRegs, SpillSlots.PPRSpillFI);
 
   // Elide NZCV spills if we know it is not used.
   Register NZCVSaveReg = AArch64::NoRegister;
@@ -4350,8 +4346,7 @@ static bool expandFillPPRFromZPRSlotPseudo(
                        SpillSlots.GPRSpillFI);
 
 #ifndef NDEBUG
-  bool Spilled = FindZPRReg.hasSpilled() ||
-                 (FindPPR3bReg && FindPPR3bReg->hasSpilled()) ||
+  bool Spilled = FindZPRReg.hasSpilled() || FindPPR3bReg.hasSpilled() ||
                  (FindGPRReg && FindGPRReg->hasSpilled());
   bool InPrologueOrEpilogue = MI.getFlag(MachineInstr::FrameSetup) ||
                               MI.getFlag(MachineInstr::FrameDestroy);
@@ -4393,7 +4388,7 @@ static bool expandFillPPRFromZPRSlotPseudo(
                                 .getInstr());
 
   propagateFrameFlags(MI, MachineInstrs);
-  return FindPPR3bReg && FindPPR3bReg->hasSpilled();
+  return FindPPR3bReg.hasSpilled();
 }
 
 /// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
@@ -4446,6 +4441,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
       if (CSRMask)
         ScavengeableRegs.clearBitsInMask(CSRMask);
       // TODO: Allow reusing callee-saved registers that have been saved.
+      assert(ScavengeableRegs.count() > 0 && "Expected scavengeable registers");
       return ScavengeableRegs;
     };
 
@@ -4471,9 +4467,15 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
 
     EmergencyStackSlots SpillSlots;
     for (MachineBasicBlock &MBB : MF) {
+      // In the case we had to spill a predicate (in the range p0-p7) to reload
+      // a predicate (>= p8), additional spill/fill pseudos will be created.
+      // These need an additional expansion pass. Note: There will only be at
+      // most two expansion passes, as spilling/filling a predicate in the range
+      // p0-p7 never requires spilling another predicate.
       for (int Pass = 0; Pass < 2; Pass++) {
         bool HasPPRSpills = expandSMEPPRToZPRSpillPseudos(
             MBB, TRI, ZPRRegs, PPR3bRegs, GPRRegs, SpillSlots);
+        assert((Pass == 0 || !HasPPRSpills) && "Did not expect PPR spills");
         if (!HasPPRSpills)
           break;
       }
@@ -5524,9 +5526,8 @@ void AArch64FrameLowering::emitRemarks(
           if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
             // SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
             // spill/fill the predicate as a data vector (so are an FPR access).
-            if (!is_contained({AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO,
-                               AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO},
-                              MI.getOpcode()) &&
+            if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
+                MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
                 AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
               RegTy = StackAccess::PPR;
             else
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 5864f57582e21c..34d05c6457e057 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -414,7 +414,7 @@ unsigned AArch64Subtarget::getHwModeSet() const {
   //
   // FIXME: This overrides the table-gen'd `getHwModeSet()` which only looks at
   // CPU features.
-  if (EnableZPRPredicateSpills.getValue() &&
+  if (EnableZPRPredicateSpills.getValue() && getStreamingHazardSize() > 0 &&
       (isStreaming() || isStreamingCompatible())) {
     Modes |= (1 << 0);
   }
diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
index a432a61384e42a..8aa957f04efc07 100644
--- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
+++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
@@ -31,7 +31,6 @@ body:             |
     liveins: $p0
 
     ; CHECK-LABEL: name: zpr_predicate_spill
-    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0
@@ -57,42 +56,46 @@ body:             |
     ; CHECK-NEXT: $p15 = IMPLICIT_DEF
     ;
     ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
+    ;
     ; CHECK-NEXT: RET_ReallyLR implicit $p0
 
     ; EXPAND-LABEL: name: zpr_predicate_spill
     ; EXPAND: liveins: $p0, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
     ; EXPAND-NEXT: {{  $}}
     ;
-    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.13)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.14)
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.13)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.12)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.11)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.10)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.9)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.8)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.7)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.6)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.5)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.4)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
     ;
     ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
-    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.0)
+    ; EXPAND-NEXT: $x8 = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.0)
     ;
     ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
@@ -111,49 +114,51 @@ body:             |
     ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
-    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.0)
     ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
     ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
     ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
     ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
-    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.13)
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
     ; EXPAND-NEXT: RET undef $lr, implicit $p0
     %1:ppr = COPY $p0
 
@@ -189,7 +194,6 @@ body:             |
     liveins: $p0
 
     ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv
-    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0
@@ -219,44 +223,48 @@ body:             |
     ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
     ;
     ; CHECK-NEXT: FAKE_USE implicit $nzcv
+    ;
     ; CHECK-NEXT: RET_ReallyLR implicit $p0
 
     ; EXPAND-LABEL: name: zpr_predicate_spill__save_restore_nzcv
     ; EXPAND: liveins: $p0, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
     ; EXPAND-NEXT: {{  $}}
     ;
-    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.13)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.14)
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.13)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.12)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.11)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.10)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.9)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.8)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.7)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.6)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.5)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.4)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
     ;
     ; EXPAND-NEXT: $nzcv = IMPLICIT_DEF
     ;
     ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
-    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.0)
+    ; EXPAND-NEXT: $x8 = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.0)
     ;
     ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
@@ -275,7 +283,7 @@ body:             |
     ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
-    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.0)
     ; EXPAND-NEXT: $x0 = MRS 55824, implicit-def $nzcv, implicit $nzcv
     ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
@@ -283,45 +291,47 @@ body:             |
     ;
     ; EXPAND-NEXT: FAKE_USE implicit $nzcv
     ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
     ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
     ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
     ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
-    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.13)
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
     ; EXPAND-NEXT: RET undef $lr, implicit $p0
     $nzcv = IMPLICIT_DEF
 
@@ -369,7 +379,6 @@ body:             |
     liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
 
     ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv__spill_gpr
-    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
@@ -410,38 +419,41 @@ body:             |
     ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
     ;
     ; CHECK-NEXT: FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+    ;
     ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
 
     ; EXPAND-LABEL: name: zpr_predicate_spill__save_restore_nzcv__spill_gpr
     ; EXPAND: liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
     ; EXPAND-NEXT: {{  $}}
     ;
-    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.13)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.14)
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.13)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.12)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.11)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.10)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.9)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.8)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.7)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.6)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.5)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.4)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
     ;
     ; EXPAND-NEXT: $nzcv = IMPLICIT_DEF
@@ -458,7 +470,8 @@ body:             |
     ; EXPAND-NEXT: $x18 = IMPLICIT_DEF
     ;
     ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
-    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.0)
+    ; EXPAND-NEXT: $fp = ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $fp, 0 :: (store (s128) into %stack.0)
     ;
     ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
@@ -477,56 +490,57 @@ body:             |
     ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
-    ; EXPAND-NEXT: $fp = ADDVL_XXI $sp, 13, implicit $vg
-    ; EXPAND-NEXT: STRXui $x0, killed $fp, 1 :: (store (s64) into %stack.14)
-    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: STRXui $x0, $sp, 1 :: (store (s64) into %stack.16)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $fp, 0 :: (load (s128) from %stack.0)
     ; EXPAND-NEXT: $x0 = MRS 55824, implicit-def $nzcv, implicit $nzcv
     ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: MSR 55824, $x0, implicit-def $nzcv
-    ; EXPAND-NEXT: $fp = ADDVL_XXI $sp, 13, implicit $vg
-    ; EXPAND-NEXT: $x0 = LDRXui killed $fp, 1 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $x0 = LDRXui $sp, 1 :: (load (s64) from %stack.16)
     ;
     ; EXPAND-NEXT: FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
     ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
     ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
     ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
-    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.13)
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
     ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
     $nzcv = IMPLICIT_DEF
     $x8 = IMPLICIT_DEF
@@ -585,7 +599,6 @@ body:             |
     liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
 
     ; CHECK-LABEL: name: zpr_predicate_spill__spill_zpr
-    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
@@ -630,46 +643,49 @@ body:             |
     ; CHECK-NEXT: $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
     ;
     ; CHECK-NEXT: FAKE_USE implicit $z16, implicit $z17, implicit $z18, implicit $z19, implicit $z20, implicit $z21, implicit $z22, implicit $z23, implicit $z24, implicit $z25, implicit $z26, implicit $z27, implicit $z28, implicit $z29, implicit $z30, implicit $z31
+    ;
     ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7
 
     ; EXPAND-LABEL: name: zpr_predicate_spill__spill_zpr
     ; EXPAND: liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4, $z23, $z22, $z21, $z20, $z19, $z18, $z17, $z16
     ; EXPAND-NEXT: {{  $}}
     ;
-    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.21)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.22)
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -20, implicit $vg
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 0 :: (store (s128) into %stack.20)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 0 :: (store (s128) into %stack.21)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 1 :: (store (s128) into %stack.19)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 1 :: (store (s128) into %stack.20)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 2 :: (store (s128) into %stack.18)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 2 :: (store (s128) into %stack.19)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 3 :: (store (s128) into %stack.17)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 3 :: (store (s128) into %stack.18)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 4 :: (store (s128) into %stack.16)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 4 :: (store (s128) into %stack.17)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 5 :: (store (s128) into %stack.15)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 5 :: (store (s128) into %stack.16)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 6 :: (store (s128) into %stack.14)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 6 :: (store (s128) into %stack.15)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 7 :: (store (s128) into %stack.13)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 7 :: (store (s128) into %stack.14)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 8 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 8 :: (store (s128) into %stack.13)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 9 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 9 :: (store (s128) into %stack.12)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 10 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 10 :: (store (s128) into %stack.11)
     ; EXPAND-NEXT: $z24 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 11 :: (store (s128) into %stack.9)
-    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z23, $sp, 12 :: (store (s128) into %stack.8)
-    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z22, $sp, 13 :: (store (s128) into %stack.7)
-    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z21, $sp, 14 :: (store (s128) into %stack.6)
-    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z20, $sp, 15 :: (store (s128) into %stack.5)
-    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z19, $sp, 16 :: (store (s128) into %stack.4)
-    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z18, $sp, 17 :: (store (s128) into %stack.3)
-    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z17, $sp, 18 :: (store (s128) into %stack.2)
-    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z16, $sp, 19 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z24, $sp, 11 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z23, $sp, 12 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z22, $sp, 13 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z21, $sp, 14 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z20, $sp, 15 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z19, $sp, 16 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z18, $sp, 17 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z17, $sp, 18 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: frame-setup STR_ZXI killed $z16, $sp, 19 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
     ;
     ; EXPAND-NEXT: $z16 = IMPLICIT_DEF
@@ -689,10 +705,11 @@ body:             |
     ; EXPAND-NEXT: $z30 = IMPLICIT_DEF
     ; EXPAND-NEXT: $z31 = IMPLICIT_DEF
     ;
-    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.22)
+    ; EXPAND-NEXT: $x8 = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.24)
     ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
-    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
-    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.22)
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 1 :: (store (s128) into %stack.0)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $x8, 0 :: (load (s128) from %stack.24)
     ;
     ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
@@ -711,60 +728,63 @@ body:             |
     ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
-    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.22)
-    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.24)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $x8, 1 :: (load (s128) from %stack.0)
     ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.22)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.24)
     ;
     ; EXPAND-NEXT: FAKE_USE implicit $z16, implicit $z17, implicit $z18, implicit $z19, implicit $z20, implicit $z21, implicit $z22, implicit $z23, implicit $z24, implicit $z25, implicit $z26, implicit $z27, implicit $z28, implicit $z29, implicit $z30, implicit $z31
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
-    ; EXPAND-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 12 :: (load (s128) from %stack.8)
-    ; EXPAND-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 13 :: (load (s128) from %stack.7)
-    ; EXPAND-NEXT: $z21 = frame-destroy LDR_ZXI $sp, 14 :: (load (s128) from %stack.6)
-    ; EXPAND-NEXT: $z20 = frame-destroy LDR_ZXI $sp, 15 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $z19 = frame-destroy LDR_ZXI $sp, 16 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $z18 = frame-destroy LDR_ZXI $sp, 17 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $z17 = frame-destroy LDR_ZXI $sp, 18 :: (load (s128) from %stack.2)
-    ; EXPAND-NEXT: $z16 = frame-destroy LDR_ZXI $sp, 19 :: (load (s128) from %stack.1)
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.20)
+    ; EXPAND-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 12 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 13 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $z21 = frame-destroy LDR_ZXI $sp, 14 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $z20 = frame-destroy LDR_ZXI $sp, 15 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $z19 = frame-destroy LDR_ZXI $sp, 16 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $z18 = frame-destroy LDR_ZXI $sp, 17 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $z17 = frame-destroy LDR_ZXI $sp, 18 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $z16 = frame-destroy LDR_ZXI $sp, 19 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.21)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.19)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.20)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.18)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.17)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.16)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.17)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.15)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.16)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.14)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.15)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.13)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.14)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.13)
     ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.12)
     ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.11)
     ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.10)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 20, implicit $vg
-    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.21)
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.22)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
     ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $z0, implicit $z1, implicit $z2, implicit $z3, implicit $z4, implicit $z5, implicit $z6, implicit $z7
     $z16 = IMPLICIT_DEF
     $z17 = IMPLICIT_DEF
@@ -822,7 +842,6 @@ body:             |
     liveins: $p0, $p1, $p2, $p3
 
     ; CHECK-LABEL: name: zpr_predicate_spill_above_p7
-    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0, $p1, $p2, $p3
@@ -852,44 +871,48 @@ body:             |
     ; CHECK-NEXT: $p15 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0 :: (load (s128) from %stack.0)
     ;
     ; CHECK-NEXT: FAKE_USE implicit $p4, implicit $p5, implicit $p6, implicit $p7
+    ;
     ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
 
     ; EXPAND-LABEL: name: zpr_predicate_spill_above_p7
     ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
     ; EXPAND-NEXT: {{  $}}
     ;
-    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.13)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.14)
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -12, implicit $vg
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p15, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.12)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.13)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p14, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.11)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.12)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p13, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.10)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 2 :: (store (s128) into %stack.11)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p12, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.9)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 3 :: (store (s128) into %stack.10)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p11, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.8)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 4 :: (store (s128) into %stack.9)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p10, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.7)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 5 :: (store (s128) into %stack.8)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p9, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.6)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 6 :: (store (s128) into %stack.7)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.5)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 7 :: (store (s128) into %stack.6)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p7, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.4)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 8 :: (store (s128) into %stack.5)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p6, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.3)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 9 :: (store (s128) into %stack.4)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p5, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
     ;
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
     ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p15, 1, 0
-    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
+    ; EXPAND-NEXT: $x8 = ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 1 :: (store (s128) into %stack.0)
     ;
     ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p1 = IMPLICIT_DEF
@@ -909,54 +932,57 @@ body:             |
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
     ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
-    ; EXPAND-NEXT: STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.14)
-    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: STR_ZXI $z0, $x8, 0 :: (store (s128) into %stack.16)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI $x8, 1 :: (load (s128) from %stack.0)
     ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = LDR_ZXI $sp, 0 :: (load (s128) from %stack.14)
+    ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.16)
     ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ;
     ; EXPAND-NEXT: FAKE_USE implicit $p4, implicit $p5, implicit $p6, implicit $p7
+    ;
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.12)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p15 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.11)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.12)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p14 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.10)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.11)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p13 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.9)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.10)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p12 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.8)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 4 :: (load (s128) from %stack.9)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p11 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.7)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 5 :: (load (s128) from %stack.8)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p10 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.6)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 6 :: (load (s128) from %stack.7)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p9 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.5)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 7 :: (load (s128) from %stack.6)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.4)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
     ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.3)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
     ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.2)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
     ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
-    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.13)
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
     ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
     $p15 = IMPLICIT_DEF
     %1:ppr = COPY $p15
@@ -1000,31 +1026,33 @@ body:             |
     ; CHECK-LABEL: name: zpr_predicate_spill_p4_saved
     ; CHECK: liveins: $p0, $p1, $p2, $p3
     ; CHECK-NEXT: {{  $}}
-    ;
     ; CHECK-NEXT: $p8 = IMPLICIT_DEF
-    ;
     ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
-
+    ;
     ; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved
     ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4
     ; EXPAND-NEXT: {{  $}}
-    ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.3)
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.2)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
-    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
+    ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.1)
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
     ;
     ; EXPAND-NEXT: $p8 = IMPLICIT_DEF
     ;
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.1)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.2)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
+    ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
-    ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
+    ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3)
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
     ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
 
     ; If we spill a register above p8, p4 must also be saved, so we can guarantee

>From e522bb68cc0a96ff491e50cfcc92f402efa6b2bf Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 24 Jan 2025 17:16:14 +0000
Subject: [PATCH 3/5] Fixups

---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 168 ++++++++++--------
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |   2 +-
 .../AArch64/spill-fill-zpr-predicates.mir     | 111 ++++++------
 3 files changed, 149 insertions(+), 132 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index a2eacc69aee71b..5bbf07b607bc30 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4208,20 +4208,22 @@ struct ScopedScavengeOrSpill {
                         Register SpillCandidate, const TargetRegisterClass &RC,
                         LiveRegUnits const &UsedRegs,
                         BitVector const &AllocatableRegs,
-                        std::optional<int> &MaybeSpillFI)
+                        std::optional<int> *MaybeSpillFI)
       : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast<const AArch64InstrInfo &>(
                                           *MF.getSubtarget().getInstrInfo())),
         TRI(*MF.getSubtarget().getRegisterInfo()) {
     FreeReg = tryScavengeRegister(UsedRegs, AllocatableRegs);
     if (FreeReg != AArch64::NoRegister)
       return;
-    if (!MaybeSpillFI) {
+    assert(MaybeSpillFI && "Expected emergency spill slot FI information "
+                           "(attempted to spill in prologue/epilogue?)");
+    if (!MaybeSpillFI->has_value()) {
       MachineFrameInfo &MFI = MF.getFrameInfo();
-      MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC),
-                                                TRI.getSpillAlign(RC));
+      *MaybeSpillFI = MFI.CreateSpillStackObject(TRI.getSpillSize(RC),
+                                                 TRI.getSpillAlign(RC));
     }
     FreeReg = SpilledReg = SpillCandidate;
-    SpillFI = *MaybeSpillFI;
+    SpillFI = MaybeSpillFI->value();
     TII.storeRegToStackSlot(MBB, MBBI, SpilledReg, false, SpillFI, &RC, &TRI,
                             Register());
   }
@@ -4252,6 +4254,18 @@ struct EmergencyStackSlots {
   std::optional<int> GPRSpillFI;
 };
 
+/// Registers available for scavenging (ZPR, PPR3b, GPR).
+struct ScavengeableRegs {
+  BitVector ZPRRegs;
+  BitVector PPR3bRegs;
+  BitVector GPRRegs;
+};
+
+static bool isInPrologueOrEpilogue(const MachineInstr &MI) {
+  return MI.getFlag(MachineInstr::FrameSetup) ||
+         MI.getFlag(MachineInstr::FrameDestroy);
+}
+
 /// Expands:
 /// ```
 /// SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
@@ -4267,24 +4281,17 @@ static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
                                           MachineInstr &MI,
                                           const TargetRegisterInfo &TRI,
                                           LiveRegUnits const &UsedRegs,
-                                          BitVector const &ZPRRegs,
+                                          ScavengeableRegs const &Regs,
                                           EmergencyStackSlots &SpillSlots) {
   MachineFunction &MF = *MBB.getParent();
   auto *TII =
       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   Register ZPredReg = AArch64::NoRegister;
-  ScopedScavengeOrSpill FindZPRReg(MF, MBB, MachineBasicBlock::iterator(MI),
-                                   ZPredReg, AArch64::Z0, AArch64::ZPRRegClass,
-                                   UsedRegs, ZPRRegs, SpillSlots.ZPRSpillFI);
-
-#ifndef NDEBUG
-  bool InPrologueOrEpilogue = MI.getFlag(MachineInstr::FrameSetup) ||
-                              MI.getFlag(MachineInstr::FrameDestroy);
-  assert((!FindZPRReg.hasSpilled() || !InPrologueOrEpilogue) &&
-         "SPILL_PPR_TO_ZPR_SLOT_PSEUDO expansion should not spill in prologue "
-         "or epilogue");
-#endif
+  ScopedScavengeOrSpill FindZPRReg(
+      MF, MBB, MachineBasicBlock::iterator(MI), ZPredReg, AArch64::Z0,
+      AArch64::ZPRRegClass, UsedRegs, Regs.ZPRRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
 
   SmallVector<MachineInstr *, 2> MachineInstrs;
   const DebugLoc &DL = MI.getDebugLoc();
@@ -4317,44 +4324,37 @@ static void expandSpillPPRToZPRSlotPseudo(MachineBasicBlock &MBB,
 /// spilling if necessary). If the status flags are in use at the point of
 /// expansion they are preserved (by moving them to/from a GPR). This may cause
 /// an additional spill if no GPR is free at the expansion point.
-static bool expandFillPPRFromZPRSlotPseudo(
-    MachineBasicBlock &MBB, MachineInstr &MI, const TargetRegisterInfo &TRI,
-    LiveRegUnits const &UsedRegs, BitVector const &ZPRRegs,
-    BitVector const &PPR3bRegs, BitVector const &GPRRegs,
-    EmergencyStackSlots &SpillSlots) {
+static bool expandFillPPRFromZPRSlotPseudo(MachineBasicBlock &MBB,
+                                           MachineInstr &MI,
+                                           const TargetRegisterInfo &TRI,
+                                           LiveRegUnits const &UsedRegs,
+                                           ScavengeableRegs const &Regs,
+                                           EmergencyStackSlots &SpillSlots) {
   MachineFunction &MF = *MBB.getParent();
   auto *TII =
       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
 
   Register ZPredReg = AArch64::NoRegister;
-  ScopedScavengeOrSpill FindZPRReg(MF, MBB, MachineBasicBlock::iterator(MI),
-                                   ZPredReg, AArch64::Z0, AArch64::ZPRRegClass,
-                                   UsedRegs, ZPRRegs, SpillSlots.ZPRSpillFI);
+  ScopedScavengeOrSpill FindZPRReg(
+      MF, MBB, MachineBasicBlock::iterator(MI), ZPredReg, AArch64::Z0,
+      AArch64::ZPRRegClass, UsedRegs, Regs.ZPRRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.ZPRSpillFI);
 
   Register PredReg = AArch64::NoRegister;
   ScopedScavengeOrSpill FindPPR3bReg(
       MF, MBB, MachineBasicBlock::iterator(MI), PredReg, AArch64::P0,
-      AArch64::PPR_3bRegClass, UsedRegs, PPR3bRegs, SpillSlots.PPRSpillFI);
+      AArch64::PPR_3bRegClass, UsedRegs, Regs.PPR3bRegs,
+      isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.PPRSpillFI);
 
   // Elide NZCV spills if we know it is not used.
   Register NZCVSaveReg = AArch64::NoRegister;
   bool IsNZCVUsed = !UsedRegs.available(AArch64::NZCV);
   std::optional<ScopedScavengeOrSpill> FindGPRReg;
   if (IsNZCVUsed)
-    FindGPRReg.emplace(MF, MBB, MachineBasicBlock::iterator(MI), NZCVSaveReg,
-                       AArch64::X0, AArch64::GPR64RegClass, UsedRegs, GPRRegs,
-                       SpillSlots.GPRSpillFI);
-
-#ifndef NDEBUG
-  bool Spilled = FindZPRReg.hasSpilled() || FindPPR3bReg.hasSpilled() ||
-                 (FindGPRReg && FindGPRReg->hasSpilled());
-  bool InPrologueOrEpilogue = MI.getFlag(MachineInstr::FrameSetup) ||
-                              MI.getFlag(MachineInstr::FrameDestroy);
-  assert((!Spilled || !InPrologueOrEpilogue) &&
-         "FILL_PPR_FROM_ZPR_SLOT_PSEUDO expansion should not spill in prologue "
-         "or epilogue");
-#endif
-
+    FindGPRReg.emplace(
+        MF, MBB, MachineBasicBlock::iterator(MI), NZCVSaveReg, AArch64::X0,
+        AArch64::GPR64RegClass, UsedRegs, Regs.GPRRegs,
+        isInPrologueOrEpilogue(MI) ? nullptr : &SpillSlots.GPRSpillFI);
   SmallVector<MachineInstr *, 4> MachineInstrs;
   const DebugLoc &DL = MI.getDebugLoc();
   MachineInstrs.push_back(BuildMI(MBB, MI, DL, TII->get(AArch64::LDR_ZXI))
@@ -4393,26 +4393,27 @@ static bool expandFillPPRFromZPRSlotPseudo(
 
 /// Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
 /// operations within the MachineBasicBlock \p MBB.
-static bool expandSMEPPRToZPRSpillPseudos(MachineBasicBlock &MBB,
-                                          const TargetRegisterInfo &TRI,
-                                          BitVector const &ZPRRegs,
-                                          BitVector const &PPR3bRegs,
-                                          BitVector const &GPRRegs,
-                                          EmergencyStackSlots &SpillSlots) {
+static bool expandSMEPPRToZPRSpillPseudos(
+    MachineBasicBlock &MBB, const TargetRegisterInfo &TRI,
+    ScavengeableRegs const &ScavengeableRegsBody,
+    ScavengeableRegs const &ScavengeableRegsFrameSetup,
+    EmergencyStackSlots &SpillSlots) {
   LiveRegUnits UsedRegs(TRI);
   UsedRegs.addLiveOuts(MBB);
   bool HasPPRSpills = false;
   for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
     UsedRegs.stepBackward(MI);
+    ScavengeableRegs const &Regs = isInPrologueOrEpilogue(MI)
+                                       ? ScavengeableRegsFrameSetup
+                                       : ScavengeableRegsBody;
     switch (MI.getOpcode()) {
     case AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
-      HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(
-          MBB, MI, TRI, UsedRegs, ZPRRegs, PPR3bRegs, GPRRegs, SpillSlots);
+      HasPPRSpills |= expandFillPPRFromZPRSlotPseudo(MBB, MI, TRI, UsedRegs,
+                                                     Regs, SpillSlots);
       MI.eraseFromParent();
       break;
     case AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
-      expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, ZPRRegs,
-                                    SpillSlots);
+      expandSpillPPRToZPRSlotPseudo(MBB, MI, TRI, UsedRegs, Regs, SpillSlots);
       MI.eraseFromParent();
       break;
     default:
@@ -4430,40 +4431,47 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
   const TargetSubtargetInfo &TSI = MF.getSubtarget();
   const TargetRegisterInfo &TRI = *TSI.getRegisterInfo();
   if (AFI->hasStackFrame() && TRI.getSpillSize(AArch64::PPRRegClass) == 16) {
-    const uint32_t *CSRMask =
-        TRI.getCallPreservedMask(MF, MF.getFunction().getCallingConv());
+    // If predicates spills are 16-bytes we may need to expand
+    // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
+
     const MachineFrameInfo &MFI = MF.getFrameInfo();
     assert(MFI.isCalleeSavedInfoValid());
+    const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
 
     auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
-      BitVector ScavengeableRegs =
-          TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
-      if (CSRMask)
-        ScavengeableRegs.clearBitsInMask(CSRMask);
-      // TODO: Allow reusing callee-saved registers that have been saved.
-      assert(ScavengeableRegs.count() > 0 && "Expected scavengeable registers");
-      return ScavengeableRegs;
+      BitVector Regs = TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
+
+      for (const CalleeSavedInfo &I : CSI)
+        if (TRI.getRegClass(RegClassID)->contains(I.getReg()))
+          Regs.set(I.getReg());
+
+      assert(Regs.count() > 0 && "Expected scavengeable registers");
+      return Regs;
     };
 
-    // If predicates spills are 16-bytes we may need to expand
-    // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
-    // These are handled separately as we need to compute register liveness to
-    // scavenge a ZPR and PPR during the expansion.
-    BitVector ZPRRegs = ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
+    const uint32_t *CSRMask =
+        TRI.getCallPreservedMask(MF, MF.getFunction().getCallingConv());
+
+    // Registers free to scavenge in the function body.
+    ScavengeableRegs ScavengeableRegsBody;
+    ScavengeableRegsBody.ZPRRegs =
+        ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
     // Only p0-7 are possible as the second operand of cmpne (needed for fills).
-    BitVector PPR3bRegs =
+    ScavengeableRegsBody.PPR3bRegs =
         ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID);
-    BitVector GPRRegs = ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
-
-    bool SpillsAboveP7 =
-        any_of(MFI.getCalleeSavedInfo(), [](const CalleeSavedInfo &CSI) {
-          return AArch64::PPR_p8to15RegClass.contains(CSI.getReg());
-        });
-    // We spill p4 in determineCalleeSaves() if a predicate above p8 is spilled,
-    // as it may be needed to reload callee saves (if p0-p3 are used as
-    // returns).
-    if (SpillsAboveP7)
-      PPR3bRegs.set(AArch64::P4);
+    ScavengeableRegsBody.GPRRegs =
+        ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
+
+    // Registers free to scavenge in the prologue/epilogue.
+    ScavengeableRegs ScavengeableRegsFrameSetup = ScavengeableRegsBody;
+    ScavengeableRegsFrameSetup.ZPRRegs.clearBitsInMask(CSRMask);
+    ScavengeableRegsFrameSetup.GPRRegs.clearBitsInMask(CSRMask);
+    // Note: If p4 was available allow it to be scavenged (even though it is a
+    // CSR). P4 is reloaded last in the epilogue and is needed to reload
+    // predicates >= p8 if p0-p3 are used as return values.
+    ScavengeableRegsFrameSetup.PPR3bRegs.clearBitsInMask(CSRMask);
+    if (ScavengeableRegsBody.PPR3bRegs[AArch64::P4])
+      ScavengeableRegsFrameSetup.PPR3bRegs.set(AArch64::P4);
 
     EmergencyStackSlots SpillSlots;
     for (MachineBasicBlock &MBB : MF) {
@@ -4474,7 +4482,8 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
       // p0-p7 never requires spilling another predicate.
       for (int Pass = 0; Pass < 2; Pass++) {
         bool HasPPRSpills = expandSMEPPRToZPRSpillPseudos(
-            MBB, TRI, ZPRRegs, PPR3bRegs, GPRRegs, SpillSlots);
+            MBB, TRI, ScavengeableRegsBody, ScavengeableRegsFrameSetup,
+            SpillSlots);
         assert((Pass == 0 || !HasPPRSpills) && "Did not expect PPR spills");
         if (!HasPPRSpills)
           break;
@@ -5528,9 +5537,10 @@ void AArch64FrameLowering::emitRemarks(
             // spill/fill the predicate as a data vector (so are an FPR acess).
             if (MI.getOpcode() != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
                 MI.getOpcode() != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
-                AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
+                AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) {
+              MI.dump();
               RegTy = StackAccess::PPR;
-            else
+            } else
               RegTy = StackAccess::FPR;
           } else if (AArch64InstrInfo::isFpOrNEON(MI)) {
             RegTy = StackAccess::FPR;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 34d05c6457e057..5864f57582e21c 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -414,7 +414,7 @@ unsigned AArch64Subtarget::getHwModeSet() const {
   //
   // FIXME: This overrides the table-gen'd `getHwModeSet()` which only looks at
   // CPU features.
-  if (EnableZPRPredicateSpills.getValue() && getStreamingHazardSize() > 0 &&
+  if (EnableZPRPredicateSpills.getValue() &&
       (isStreaming() || isStreamingCompatible())) {
     Modes |= (1 << 0);
   }
diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
index 8aa957f04efc07..b58f91ac68a932 100644
--- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
+++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
@@ -10,7 +10,7 @@
 
   define aarch64_sve_vector_pcs void @zpr_predicate_spill__save_restore_nzcv() #0 { entry: unreachable }
 
-  define aarch64_sve_vector_pcs void @zpr_predicate_spill__save_restore_nzcv__spill_gpr() #0 { entry: unreachable }
+  define aarch64_sve_vector_pcs void @zpr_predicate_spill__save_restore_nzcv__scavenge_csr_gpr() #0 { entry: unreachable }
 
   define aarch64_sve_vector_pcs void @zpr_predicate_spill__spill_zpr() #0 { entry: unreachable }
 
@@ -31,6 +31,7 @@ body:             |
     liveins: $p0
 
     ; CHECK-LABEL: name: zpr_predicate_spill
+    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0
@@ -145,17 +146,17 @@ body:             |
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
     ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
@@ -194,6 +195,7 @@ body:             |
     liveins: $p0
 
     ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv
+    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0
@@ -284,10 +286,10 @@ body:             |
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
     ; EXPAND-NEXT: $z0 = LDR_ZXI killed $x8, 0 :: (load (s128) from %stack.0)
-    ; EXPAND-NEXT: $x0 = MRS 55824, implicit-def $nzcv, implicit $nzcv
+    ; EXPAND-NEXT: $fp = MRS 55824, implicit-def $nzcv, implicit $nzcv
     ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: MSR 55824, $x0, implicit-def $nzcv
+    ; EXPAND-NEXT: MSR 55824, $fp, implicit-def $nzcv
     ;
     ; EXPAND-NEXT: FAKE_USE implicit $nzcv
     ;
@@ -318,17 +320,17 @@ body:             |
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
     ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
@@ -361,7 +363,7 @@ body:             |
     RET_ReallyLR implicit $p0
 ...
 ---
-name: zpr_predicate_spill__save_restore_nzcv__spill_gpr
+name: zpr_predicate_spill__save_restore_nzcv__scavenge_csr_gpr
 tracksRegLiveness: true
 stack:
 liveins:
@@ -378,13 +380,15 @@ body:             |
   bb.0.entry:
     liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
 
-    ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv__spill_gpr
+    ; CHECK-LABEL: name: zpr_predicate_spill__save_restore_nzcv__scavenge_csr_gpr
+    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
     ; CHECK-NEXT: {{  $}}
     ;
     ; CHECK-NEXT: $nzcv = IMPLICIT_DEF
+    ;
     ; CHECK-NEXT: $x8 = IMPLICIT_DEF
     ; CHECK-NEXT: $x9 = IMPLICIT_DEF
     ; CHECK-NEXT: $x10 = IMPLICIT_DEF
@@ -422,7 +426,7 @@ body:             |
     ;
     ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $x0, implicit $x1, implicit $x2, implicit $x3, implicit $x4, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
 
-    ; EXPAND-LABEL: name: zpr_predicate_spill__save_restore_nzcv__spill_gpr
+    ; EXPAND-LABEL: name: zpr_predicate_spill__save_restore_nzcv__scavenge_csr_gpr
     ; EXPAND: liveins: $p0, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $fp, $p15, $p14, $p13, $p12, $p11, $p10, $p9, $p8, $p7, $p6, $p5, $p4
     ; EXPAND-NEXT: {{  $}}
     ;
@@ -453,10 +457,11 @@ body:             |
     ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 10 :: (store (s128) into %stack.3)
     ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
     ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 11 :: (store (s128) into %stack.2)
-    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+    ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
     ;
     ; EXPAND-NEXT: $nzcv = IMPLICIT_DEF
+    ;
     ; EXPAND-NEXT: $x8 = IMPLICIT_DEF
     ; EXPAND-NEXT: $x9 = IMPLICIT_DEF
     ; EXPAND-NEXT: $x10 = IMPLICIT_DEF
@@ -470,7 +475,7 @@ body:             |
     ; EXPAND-NEXT: $x18 = IMPLICIT_DEF
     ;
     ; EXPAND-NEXT: $z0 = CPY_ZPzI_B $p0, 1, 0
-    ; EXPAND-NEXT: $fp = ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: $fp = ADDXri $sp, 1024, 0
     ; EXPAND-NEXT: STR_ZXI $z0, $fp, 0 :: (store (s128) into %stack.0)
     ;
     ; EXPAND-NEXT: $p0 = IMPLICIT_DEF
@@ -490,17 +495,15 @@ body:             |
     ; EXPAND-NEXT: $p14 = IMPLICIT_DEF
     ; EXPAND-NEXT: $p15 = IMPLICIT_DEF
     ;
-    ; EXPAND-NEXT: STRXui $x0, $sp, 1 :: (store (s64) into %stack.16)
     ; EXPAND-NEXT: $z0 = LDR_ZXI killed $fp, 0 :: (load (s128) from %stack.0)
-    ; EXPAND-NEXT: $x0 = MRS 55824, implicit-def $nzcv, implicit $nzcv
+    ; EXPAND-NEXT: $fp = MRS 55824, implicit-def $nzcv, implicit $nzcv
     ; EXPAND-NEXT: $p0 = PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
-    ; EXPAND-NEXT: MSR 55824, $x0, implicit-def $nzcv
-    ; EXPAND-NEXT: $x0 = LDRXui $sp, 1 :: (load (s64) from %stack.16)
+    ; EXPAND-NEXT: MSR 55824, $fp, implicit-def $nzcv
     ;
     ; EXPAND-NEXT: FAKE_USE implicit $nzcv, implicit $x8, implicit $x9, implicit $x10, implicit $x11, implicit $x12, implicit $x13, implicit $x14, implicit $x15, implicit $x16, implicit $x17, implicit $x18
     ;
-    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+    ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.13)
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
@@ -527,17 +530,17 @@ body:             |
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 12, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.14)
     ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
@@ -599,6 +602,7 @@ body:             |
     liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
 
     ; CHECK-LABEL: name: zpr_predicate_spill__spill_zpr
+    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0, $z0, $z1, $z2, $z3, $z4, $z5, $z6, $z7
@@ -771,17 +775,17 @@ body:             |
     ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.13)
-    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.12)
-    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.11)
-    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z24 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.10)
-    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p1 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p1, $z24, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 20, implicit $vg
     ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.22)
     ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
@@ -842,6 +846,7 @@ body:             |
     liveins: $p0, $p1, $p2, $p3
 
     ; CHECK-LABEL: name: zpr_predicate_spill_above_p7
+    ; CHECK: stack:
     ; CHECK:      - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16,
     ; CHECK-NEXT:     stack-id: scalable-vector, callee-saved-register:
     ; CHECK: liveins: $p0, $p1, $p2, $p3
@@ -969,14 +974,14 @@ body:             |
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 8 :: (load (s128) from %stack.5)
-    ; EXPAND-NEXT: $p7 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p7, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p7 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 9 :: (load (s128) from %stack.4)
-    ; EXPAND-NEXT: $p6 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p6, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p6 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 10 :: (load (s128) from %stack.3)
-    ; EXPAND-NEXT: $p5 = frame-destroy PTRUE_B 31, implicit $vg
-    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p5, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
+    ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
+    ; EXPAND-NEXT: $p5 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
     ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 11 :: (load (s128) from %stack.2)
     ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
     ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
@@ -1026,9 +1031,11 @@ body:             |
     ; CHECK-LABEL: name: zpr_predicate_spill_p4_saved
     ; CHECK: liveins: $p0, $p1, $p2, $p3
     ; CHECK-NEXT: {{  $}}
+    ;
     ; CHECK-NEXT: $p8 = IMPLICIT_DEF
-    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
     ;
+    ; CHECK-NEXT: RET_ReallyLR implicit $p0, implicit $p1, implicit $p2, implicit $p3
+
     ; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved
     ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4
     ; EXPAND-NEXT: {{  $}}

>From d28c70e4fe6a37a07860291a9f4d708cf09bb69d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 24 Jan 2025 17:48:52 +0000
Subject: [PATCH 4/5] Fixups

---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 55 +++++++++++--------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 5bbf07b607bc30..d1e2d5ab67c4cd 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4434,42 +4434,49 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     // If predicates spills are 16-bytes we may need to expand
     // SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
 
-    const MachineFrameInfo &MFI = MF.getFrameInfo();
-    assert(MFI.isCalleeSavedInfoValid());
-    const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+    const uint32_t *CSRMask =
+        TRI.getCallPreservedMask(MF, MF.getFunction().getCallingConv());
 
     auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
       BitVector Regs = TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
-
-      for (const CalleeSavedInfo &I : CSI)
-        if (TRI.getRegClass(RegClassID)->contains(I.getReg()))
-          Regs.set(I.getReg());
-
+      Regs.clearBitsInMask(CSRMask);
       assert(Regs.count() > 0 && "Expected scavengeable registers");
       return Regs;
     };
 
-    const uint32_t *CSRMask =
-        TRI.getCallPreservedMask(MF, MF.getFunction().getCallingConv());
-
-    // Registers free to scavenge in the function body.
-    ScavengeableRegs ScavengeableRegsBody;
-    ScavengeableRegsBody.ZPRRegs =
+    // Registers free to scavenge in the prologue/epilogue.
+    ScavengeableRegs ScavengeableRegsFrameSetup;
+    ScavengeableRegsFrameSetup.ZPRRegs =
         ComputeScavengeableRegisters(AArch64::ZPRRegClassID);
     // Only p0-7 are possible as the second operand of cmpne (needed for fills).
-    ScavengeableRegsBody.PPR3bRegs =
+    ScavengeableRegsFrameSetup.PPR3bRegs =
         ComputeScavengeableRegisters(AArch64::PPR_3bRegClassID);
-    ScavengeableRegsBody.GPRRegs =
+    ScavengeableRegsFrameSetup.GPRRegs =
         ComputeScavengeableRegisters(AArch64::GPR64RegClassID);
 
-    // Registers free to scavenge in the prologue/epilogue.
-    ScavengeableRegs ScavengeableRegsFrameSetup = ScavengeableRegsBody;
-    ScavengeableRegsFrameSetup.ZPRRegs.clearBitsInMask(CSRMask);
-    ScavengeableRegsFrameSetup.GPRRegs.clearBitsInMask(CSRMask);
-    // Note: If p4 was available allow it to be scavenged (even though it is a
-    // CSR). P4 is reloaded last in the epilogue and is needed to reload
-    // predicates >= p8 if p0-p3 are used as return values.
-    ScavengeableRegsFrameSetup.PPR3bRegs.clearBitsInMask(CSRMask);
+    const MachineFrameInfo &MFI = MF.getFrameInfo();
+    assert(MFI.isCalleeSavedInfoValid());
+    const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+    auto MarkSavedRegistersAsAvailable =
+        [&, &Reserved = MF.getRegInfo().getReservedRegs()](
+            BitVector &Regs, unsigned RegClassID) {
+          for (const CalleeSavedInfo &I : CSI)
+            if (!Reserved[I.getReg()] &&
+                TRI.getRegClass(RegClassID)->contains(I.getReg()))
+              Regs.set(I.getReg());
+        };
+
+    // Registers free to scavenge in the function body.
+    ScavengeableRegs ScavengeableRegsBody = ScavengeableRegsFrameSetup;
+    MarkSavedRegistersAsAvailable(ScavengeableRegsBody.ZPRRegs,
+                                  AArch64::ZPRRegClassID);
+    MarkSavedRegistersAsAvailable(ScavengeableRegsBody.PPR3bRegs,
+                                  AArch64::PPR_3bRegClassID);
+    MarkSavedRegistersAsAvailable(ScavengeableRegsBody.GPRRegs,
+                                  AArch64::GPR64RegClassID);
+
+    // p4 (CSR) is reloaded last in the epilogue, so if it is saved, it can be
+    // used to reload other predicates.
     if (ScavengeableRegsBody.PPR3bRegs[AArch64::P4])
       ScavengeableRegsFrameSetup.PPR3bRegs.set(AArch64::P4);
 

>From 213d5aa9007243a7c8610e561763b2ed3361a356 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 24 Jan 2025 17:53:41 +0000
Subject: [PATCH 5/5] Fixups

---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index d1e2d5ab67c4cd..7d3c79f269008d 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4439,7 +4439,8 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
 
     auto ComputeScavengeableRegisters = [&](unsigned RegClassID) {
       BitVector Regs = TRI.getAllocatableSet(MF, TRI.getRegClass(RegClassID));
-      Regs.clearBitsInMask(CSRMask);
+      if (CSRMask)
+        Regs.clearBitsInMask(CSRMask);
       assert(Regs.count() > 0 && "Expected scavengeable registers");
       return Regs;
     };



More information about the llvm-commits mailing list