[llvm] [AArch64][SME] Move saving VG for the unwinder out of frame lowering (PR #145748)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 25 10:50:19 PDT 2025


https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/145748

This patch moves the computing and storing of VG in functions with streaming mode changes to a new pre-RA pass (MachineSMEABI). The goal is to make saving VG simpler, as computing VG may require calling `__arm_get_current_vg` -- which requires saving X0 and the LR around the call (among other complexities in frame lowering). Doing this pre-RA allows the register allocator to handle these saves (rather than relying on manual register scavenging).

The MachineSMEABI pass saves VG to the AArch64::SAVED_STREAMING_VG_SLOT and AArch64::SAVED_VG_SLOT target indices. These will be resolved to actual frame indices during PEI (as the frame indices are not known before then).
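
To illustrate, here is a condensed sketch of the emit side (taken from the MachineSMEABIPass.cpp added below); `VGReg` holds the current VG, either computed with `CNTD` or returned in X0 by `__arm_get_current_vg`, and the locally-streaming case stores a second value to `SAVED_STREAMING_VG_SLOT` in the same way:

```c++
// Pre-RA: store VG to a target index rather than a concrete frame object.
// The frame index backing this slot is only assigned later, during PEI.
BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
    .addReg(VGReg)
    .addTargetIndex(AArch64::SAVED_VG_SLOT)
    .addImm(0);
```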

For the most part this does not significantly change codegen; however, one downside is that resolving the frame indices outside of the prologue may need extra instructions (to step past later allocations on the stack, such as scalable vectors).
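
The resolution itself is a simple operand rewrite during PEI (condensed from the AArch64FrameLowering.cpp changes below); the extra instructions mentioned above come from the subsequent frame-index elimination, which may need an `addvl`/`addsvl` to step over scalable allocations between SP and the slot:

```c++
// PEI: rewrite the target-index operands into the frame indices that
// assignCalleeSavedSpillSlots() reserved for the VG save(s).
for (MachineOperand &MO : MI.explicit_operands()) {
  if (!MO.isTargetIndex())
    continue;
  if (MO.getIndex() == AArch64::SAVED_STREAMING_VG_SLOT)
    MO.ChangeToFrameIndex(AFI->getStreamingVGIdx());
  else if (MO.getIndex() == AArch64::SAVED_VG_SLOT)
    MO.ChangeToFrameIndex(AFI->getVGIdx());
}
```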

Fixes #145635

>From 29a752f95ffe47dd669195c8215d4787a5e839b9 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 25 Jun 2025 17:01:25 +0000
Subject: [PATCH] [AArch64][SME] Move saving VG for the unwinder out of frame
 lowering

This patch moves the computing and storing of VG in functions with
streaming mode changes to a new pre-RA pass (MachineSMEABI). The goal
is to make saving VG simpler, as computing VG may require calling
`__arm_get_current_vg` -- which requires saving X0 and the LR around
the call (among other complexities in frame lowering). Doing this
pre-RA allows the register allocator to handle these saves (rather
than relying on manual register scavenging).

The MachineSMEABI pass saves VG to the AArch64::SAVED_STREAMING_VG_SLOT
and AArch64::SAVED_VG_SLOT target indices. These will be resolved to
actual frame indices during PEI (as the frame indices are not known
before then).

For the most part this does not significantly change codegen; however,
one downside is that resolving the frame indices outside of the
prologue may need extra instructions (to step past later allocations
on the stack, such as scalable vectors).

Fixes #145635
---
 llvm/lib/CodeGen/MachineVerifier.cpp          |   4 +-
 llvm/lib/Target/AArch64/AArch64.h             |   7 +
 .../Target/AArch64/AArch64FrameLowering.cpp   | 158 +---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |   8 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   3 +
 .../Target/AArch64/AArch64RegisterInfo.cpp    |  12 +
 llvm/lib/Target/AArch64/AArch64RegisterInfo.h |   2 +
 .../Target/AArch64/AArch64TargetMachine.cpp   |   3 +
 llvm/lib/Target/AArch64/CMakeLists.txt        |   1 +
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 113 +++
 llvm/test/CodeGen/AArch64/O0-pipeline.ll      |   1 +
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |   1 +
 .../outlining-with-streaming-mode-changes.ll  |  24 +-
 llvm/test/CodeGen/AArch64/sme-agnostic-za.ll  |  20 +-
 .../sme-avoid-coalescing-locally-streaming.ll |  39 +-
 ...compatible-to-normal-fn-wihout-sme-attr.ll |  14 +-
 .../AArch64/sme-callee-save-restore-pairs.ll  |  64 +-
 .../test/CodeGen/AArch64/sme-darwin-sve-vg.ll |  36 +-
 .../AArch64/sme-disable-gisel-fisel.ll        |  55 +-
 .../CodeGen/AArch64/sme-lazy-save-call.ll     |   8 +-
 .../AArch64/sme-must-save-lr-for-vg.ll        |  49 ++
 .../test/CodeGen/AArch64/sme-peephole-opts.ll |  91 +--
 ...ate-sm-changing-call-disable-coalescing.ll | 543 +++++++-------
 ...ing-body-streaming-compatible-interface.ll |  40 +-
 .../CodeGen/AArch64/sme-streaming-body.ll     | 138 ++--
 .../sme-streaming-compatible-interface.ll     |  93 +--
 .../AArch64/sme-streaming-interface.ll        |  53 +-
 ...nging-call-disable-stackslot-scavenging.ll |  28 +-
 llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll  | 245 +++----
 llvm/test/CodeGen/AArch64/stack-hazard.ll     | 691 +++++++++---------
 .../streaming-compatible-memory-ops.ll        |  75 +-
 .../CodeGen/AArch64/sve-stack-frame-layout.ll |  44 +-
 32 files changed, 1429 insertions(+), 1234 deletions(-)
 create mode 100644 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
 create mode 100644 llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll

diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 109988246d0ab..39f13f5907190 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2529,8 +2529,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
       }
 
       // Check that an instruction has register operands only as expected.
-      if (MCOI.OperandType == MCOI::OPERAND_REGISTER &&
-          !MO->isReg() && !MO->isFI())
+      if (MCOI.OperandType == MCOI::OPERAND_REGISTER && !MO->isReg() &&
+          !MO->isFI() && !MO->isTargetIndex())
         report("Expected a register operand.", MO, MONum);
       if (MO->isReg()) {
         if (MCOI.OperandType == MCOI::OPERAND_IMMEDIATE ||
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 5496ebd495a55..558cec0e373da 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -60,6 +60,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 FunctionPass *createAArch64CollectLOHPass();
 FunctionPass *createSMEABIPass();
 FunctionPass *createSMEPeepholeOptPass();
+FunctionPass *createMachineSMEABIPass();
 ModulePass *createSVEIntrinsicOptsPass();
 InstructionSelector *
 createAArch64InstructionSelector(const AArch64TargetMachine &,
@@ -111,8 +112,14 @@ void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 void initializeSMEABIPass(PassRegistry &);
 void initializeSMEPeepholeOptPass(PassRegistry &);
+void initializeMachineSMEABIPass(PassRegistry &);
 void initializeSVEIntrinsicOptsPass(PassRegistry &);
 void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &);
+
+namespace AArch64 {
+enum TargetIndex { SAVED_VG_SLOT, SAVED_STREAMING_VG_SLOT };
+}
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index a71668e71c235..ddcdb59cc8ec5 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -333,7 +333,10 @@ static bool needsWinCFI(const MachineFunction &MF);
 static StackOffset getSVEStackSize(const MachineFunction &MF);
 static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
                                                  bool HasCall = false);
-static bool requiresSaveVG(const MachineFunction &MF);
+static bool requiresSaveVG(const MachineFunction &MF) {
+  return MF.getSubtarget<AArch64Subtarget>().getRegisterInfo()->requiresSaveVG(
+      MF);
+}
 
 /// Returns true if a homogeneous prolog or epilog code can be emitted
 /// for the size optimization. If possible, a frame helper call is injected.
@@ -1105,8 +1108,7 @@ bool AArch64FrameLowering::canUseAsPrologue(
 
   // May need a scratch register (for return value) if require making a special
   // call
-  if (requiresSaveVG(*MF) ||
-      windowsRequiresStackProbe(*MF, std::numeric_limits<uint64_t>::max()))
+  if (windowsRequiresStackProbe(*MF, std::numeric_limits<uint64_t>::max()))
     if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister)
       return false;
 
@@ -1391,38 +1393,6 @@ bool requiresGetVGCall(MachineFunction &MF) {
          !MF.getSubtarget<AArch64Subtarget>().hasSVE();
 }
 
-static bool requiresSaveVG(const MachineFunction &MF) {
-  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  // For Darwin platforms we don't save VG for non-SVE functions, even if SME
-  // is enabled with streaming mode changes.
-  if (!AFI->hasStreamingModeChanges())
-    return false;
-  auto &ST = MF.getSubtarget<AArch64Subtarget>();
-  if (ST.isTargetDarwin())
-    return ST.hasSVE();
-  return true;
-}
-
-bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
-  unsigned Opc = MBBI->getOpcode();
-  if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
-      Opc == AArch64::UBFMXri)
-    return true;
-
-  if (requiresGetVGCall(*MBBI->getMF())) {
-    if (Opc == AArch64::ORRXrr)
-      return true;
-
-    if (Opc == AArch64::BL) {
-      auto Op1 = MBBI->getOperand(0);
-      return Op1.isSymbol() &&
-             (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
-    }
-  }
-
-  return false;
-}
-
 // Convert callee-save register save/restore instruction to do stack pointer
 // decrement/increment to allocate/deallocate the callee-save stack area by
 // converting store/load to use pre/post increment version.
@@ -1434,15 +1404,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     int CFAOffset = 0) {
   unsigned NewOpc;
 
-  // If the function contains streaming mode changes, we expect instructions
-  // to calculate the value of VG before spilling. For locally-streaming
-  // functions, we need to do this for both the streaming and non-streaming
-  // vector length. Move past these instructions if necessary.
-  MachineFunction &MF = *MBB.getParent();
-  if (requiresSaveVG(MF))
-    while (isVGInstruction(MBBI))
-      ++MBBI;
-
   switch (MBBI->getOpcode()) {
   default:
     llvm_unreachable("Unexpected callee-save save/restore opcode!");
@@ -1979,9 +1940,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // pointer bump above.
   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
          !IsSVECalleeSave(MBBI)) {
-    if (CombineSPBump &&
-        // Only fix-up frame-setup load/store instructions.
-        (!requiresSaveVG(MF) || !isVGInstruction(MBBI)))
+    if (CombineSPBump)
       fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
                                         NeedsWinCFI, &HasWinCFI);
     ++MBBI;
@@ -3403,66 +3362,19 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       StrOpc =
           Size == 16 ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
       break;
-    case RegPairInfo::VG:
-      StrOpc = AArch64::STRXui;
-      break;
-    }
-
-    unsigned X0Scratch = AArch64::NoRegister;
-    if (Reg1 == AArch64::VG) {
-      // Find an available register to store value of VG to.
-      Reg1 = findScratchNonCalleeSaveRegister(&MBB, true);
-      assert(Reg1 != AArch64::NoRegister);
+    case RegPairInfo::VG: {
       SMEAttrs Attrs = AFI->getSMEFnAttrs();
-
       if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
           AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
         // For locally-streaming functions, we need to store both the streaming
-        // & non-streaming VG. Spill the streaming value first.
-        BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
-            .addImm(1)
-            .setMIFlag(MachineInstr::FrameSetup);
-        BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1)
-            .addReg(Reg1)
-            .addImm(3)
-            .addImm(63)
-            .setMIFlag(MachineInstr::FrameSetup);
-
+        // & non-streaming VG.
         AFI->setStreamingVGIdx(RPI.FrameIdx);
-      } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
-        BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
-            .addImm(31)
-            .addImm(1)
-            .setMIFlag(MachineInstr::FrameSetup);
-        AFI->setVGIdx(RPI.FrameIdx);
       } else {
-        const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
-        if (llvm::any_of(
-                MBB.liveins(),
-                [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
-                  return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
-                      AArch64::X0, LiveIn.PhysReg);
-                }))
-          X0Scratch = Reg1;
-
-        if (X0Scratch != AArch64::NoRegister)
-          BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
-              .addReg(AArch64::XZR)
-              .addReg(AArch64::X0, RegState::Undef)
-              .addReg(AArch64::X0, RegState::Implicit)
-              .setMIFlag(MachineInstr::FrameSetup);
-
-        const uint32_t *RegMask = TRI->getCallPreservedMask(
-            MF,
-            CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
-        BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
-            .addExternalSymbol("__arm_get_current_vg")
-            .addRegMask(RegMask)
-            .addReg(AArch64::X0, RegState::ImplicitDefine)
-            .setMIFlag(MachineInstr::FrameSetup);
-        Reg1 = AArch64::X0;
         AFI->setVGIdx(RPI.FrameIdx);
       }
+      // VG will be written to the frame indices immediately after the prologue.
+      continue;
+    }
     }
 
     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
@@ -3556,13 +3468,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       if (RPI.isPaired())
         MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
     }
-
-    if (X0Scratch != AArch64::NoRegister)
-      BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0)
-          .addReg(AArch64::XZR)
-          .addReg(X0Scratch, RegState::Undef)
-          .addReg(X0Scratch, RegState::Implicit)
-          .setMIFlag(MachineInstr::FrameSetup);
   }
   return true;
 }
@@ -4092,31 +3997,19 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
       MaxCSFrameIndex = FrameIdx;
   }
 
-  // Insert VG into the list of CSRs, immediately before LR if saved.
   if (requiresSaveVG(MF)) {
-    std::vector<CalleeSavedInfo> VGSaves;
-    SMEAttrs Attrs = AFI->getSMEFnAttrs();
-
-    auto VGInfo = CalleeSavedInfo(AArch64::VG);
+    CalleeSavedInfo VGInfo(AArch64::VG);
     VGInfo.setRestored(false);
-    VGSaves.push_back(VGInfo);
+    SmallVector<CalleeSavedInfo, 2> VGSaves{VGInfo};
 
     // Add VG again if the function is locally-streaming, as we will spill two
     // values.
+    SMEAttrs Attrs = AFI->getSMEFnAttrs();
     if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
       VGSaves.push_back(VGInfo);
 
-    bool InsertBeforeLR = false;
-
-    for (unsigned I = 0; I < CSI.size(); I++)
-      if (CSI[I].getReg() == AArch64::LR) {
-        InsertBeforeLR = true;
-        CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end());
-        break;
-      }
-
-    if (!InsertBeforeLR)
-      llvm::append_range(CSI, VGSaves);
+    // Insert the VG saves at the start of the CSI (alongside GPRs).
+    CSI.insert(CSI.begin(), VGSaves.begin(), VGSaves.end());
   }
 
   Register LastReg = 0;
@@ -5135,13 +5028,28 @@ static void emitVGSaveRestore(MachineBasicBlock::iterator II,
   MI.eraseFromParent();
 }
 
+static void replaceVGTargetIndices(MachineBasicBlock::iterator II,
+                                   AArch64FunctionInfo *AFI) {
+  for (auto &MO : II->explicit_operands()) {
+    if (MO.isTargetIndex()) {
+      if (MO.getIndex() == AArch64::SAVED_STREAMING_VG_SLOT)
+        MO.ChangeToFrameIndex(AFI->getStreamingVGIdx());
+      if (MO.getIndex() == AArch64::SAVED_VG_SLOT)
+        MO.ChangeToFrameIndex(AFI->getVGIdx());
+    }
+  }
+}
+
 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
     MachineFunction &MF, RegScavenger *RS = nullptr) const {
+  bool VGSaved = requiresSaveVG(MF);
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   for (auto &BB : MF)
     for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
-      if (requiresSaveVG(MF))
+      if (VGSaved) {
+        replaceVGTargetIndices(II, AFI);
         emitVGSaveRestore(II++, this);
-      else if (StackTaggingMergeSetTag)
+      } else if (StackTaggingMergeSetTag)
         II = tryMergeAdjacentSTG(II, this, RS);
     }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 951cb93ea8f8c..ec42718954ba6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3688,6 +3688,14 @@ static unsigned offsetExtendOpcode(unsigned Opcode) {
   }
 }
 
+ArrayRef<std::pair<int, const char *>>
+AArch64InstrInfo::getSerializableTargetIndices() const {
+  static constexpr std::pair<int, const char *> TargetIndices[] = {
+      {AArch64::SAVED_VG_SLOT, "saved-vg-slot"},
+      {AArch64::SAVED_STREAMING_VG_SLOT, "saved-streaming-vg-slot"}};
+  return TargetIndices;
+}
+
 MachineInstr *AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI,
                                                  const ExtAddrMode &AM) const {
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 7c255da333e4b..7d12271424664 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -299,6 +299,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   MachineInstr *emitLdStWithAddr(MachineInstr &MemI,
                                  const ExtAddrMode &AM) const override;
 
+  ArrayRef<std::pair<int, const char *>>
+  getSerializableTargetIndices() const override;
+
   bool getMemOperandsWithOffsetWidth(
       const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
       int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index fb472ddc719fc..9563b1eeacc1d 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -673,6 +673,18 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
   return false;
 }
 
+bool AArch64RegisterInfo::requiresSaveVG(const MachineFunction &MF) const {
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  // For Darwin platforms we don't save VG for non-SVE functions, even if SME
+  // is enabled with streaming mode changes.
+  if (!AFI->hasStreamingModeChanges())
+    return false;
+  auto &ST = MF.getSubtarget<AArch64Subtarget>();
+  if (ST.isTargetDarwin())
+    return ST.hasSVE();
+  return true;
+}
+
 bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF,
                                              MCRegister Reg) const {
   CallingConv::ID CC = MF.getFunction().getCallingConv();
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index cc94be611a2ea..29fa6df4c7875 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -127,6 +127,8 @@ class AArch64RegisterInfo final : public AArch64GenRegisterInfo {
   bool hasBasePointer(const MachineFunction &MF) const;
   unsigned getBaseRegister() const;
 
+  bool requiresSaveVG(const MachineFunction &MF) const;
+
   bool isArgumentRegister(const MachineFunction &MF,
                           MCRegister Reg) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 281a1457fe57e..4fc1f850e4e09 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -803,6 +803,9 @@ bool AArch64PassConfig::addILPOpts() {
 }
 
 void AArch64PassConfig::addPreRegAlloc() {
+  // Insert VG saves for the unwinder.
+  addPass(createMachineSMEABIPass());
+
   // Change dead register definitions to refer to the zero register.
   if (TM->getOptLevel() != CodeGenOptLevel::None &&
       EnableDeadRegisterElimination)
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index 66136a464f05d..909f638d03beb 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -87,6 +87,7 @@ add_llvm_target(AArch64CodeGen
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
   SMEABIPass.cpp
+  MachineSMEABIPass.cpp
   SMEPeepholeOpt.cpp
   SVEIntrinsicOpts.cpp
   AArch64SIMDInstrOpt.cpp
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
new file mode 100644
index 0000000000000..bbdf6cab5222e
--- /dev/null
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -0,0 +1,113 @@
+//===- MachineSMEABIPass.cpp - MIR SME ABI lowerings ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This pass implements aspects of the SME ABI not known to be required till
+// this stage in lowering. Currently this is just:
+// * Saving VG for the unwinder (which depends on SMEPeepholeOpt running first).
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-machine-sme-abi"
+
+namespace {
+
+struct MachineSMEABI : public MachineFunctionPass {
+  inline static char ID = 0;
+
+  MachineSMEABI() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "Machine SME ABI pass"; }
+
+  bool insertVGSaveForUnwinder(MachineFunction &MF) const;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+bool MachineSMEABI::insertVGSaveForUnwinder(MachineFunction &MF) const {
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  if (!TRI.requiresSaveVG(MF))
+    return false;
+
+  SMEAttrs Attrs = AFI->getSMEFnAttrs();
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+
+  if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface()) {
+    // For locally-streaming functions, we need to store both the streaming
+    // & non-streaming VG. Spill the streaming value first.
+    Register RDSVLReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    Register StreamingVGReg =
+        MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::RDSVLI_XI), RDSVLReg).addImm(1);
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::UBFMXri), StreamingVGReg)
+        .addReg(RDSVLReg)
+        .addImm(3)
+        .addImm(63);
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+        .addReg(StreamingVGReg)
+        .addTargetIndex(AArch64::SAVED_STREAMING_VG_SLOT)
+        .addImm(0);
+  }
+
+  Register VGReg;
+  if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+    VGReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::CNTD_XPiI), VGReg)
+        .addImm(31)
+        .addImm(1);
+  } else {
+    VGReg = AArch64::X0;
+    const uint32_t *RegMask = TRI.getCallPreservedMask(
+        MF, CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::BL))
+        .addExternalSymbol("__arm_get_current_vg")
+        .addRegMask(RegMask)
+        .addReg(AArch64::X0, RegState::ImplicitDefine);
+  }
+
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+      .addReg(VGReg)
+      .addTargetIndex(AArch64::SAVED_VG_SLOT)
+      .addImm(0);
+  return true;
+}
+
+INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi",
+                "Machine SME ABI pass", false, false)
+
+bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
+  assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
+  return insertVGSaveForUnwinder(MF);
+}
+
+FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
index abc67eec32391..f226a0b73d7a4 100644
--- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -54,6 +54,7 @@
 ; CHECK-NEXT:       AArch64 Instruction Selection
 ; CHECK-NEXT:       Finalize ISel and expand pseudo-instructions
 ; CHECK-NEXT:       Local Stack Slot Allocation
+; CHECK-NEXT:       Machine SME ABI pass
 ; CHECK-NEXT:       Eliminate PHI nodes for register allocation
 ; CHECK-NEXT:       Two-Address instruction pass
 ; CHECK-NEXT:       Fast Register Allocator
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index e1481667a4ab7..961ff923491b8 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -161,6 +161,7 @@
 ; CHECK-NEXT:       Peephole Optimizations
 ; CHECK-NEXT:       Remove dead machine instructions
 ; CHECK-NEXT:       AArch64 MI Peephole Optimization pass
+; CHECK-NEXT:       Machine SME ABI pass
 ; CHECK-NEXT:       AArch64 Dead register definitions
 ; CHECK-NEXT:       Detect Dead Lanes
 ; CHECK-NEXT:       Init Undef Pass
diff --git a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
index 94fe06733347a..2466a0b3bf162 100644
--- a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
+++ b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
@@ -7,11 +7,12 @@ define void @streaming_mode_change1() #0 {
 ; CHECK-LABEL: streaming_mode_change1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -21,10 +22,9 @@ define void @streaming_mode_change1() #0 {
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
-;
+
 ; OUTLINER-LABEL: streaming_mode_change1:
 ; OUTLINER-NOT: OUTLINED_FUNCTION_
-;
   call void @callee();
   ret void;
 }
@@ -33,11 +33,12 @@ define void @streaming_mode_change2() #0 {
 ; CHECK-LABEL: streaming_mode_change2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -47,10 +48,9 @@ define void @streaming_mode_change2() #0 {
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
-;
+
 ; OUTLINER-LABEL: streaming_mode_change2:
 ; OUTLINER-NOT: OUTLINED_FUNCTION_
-;
   call void @callee();
   ret void;
 }
@@ -59,11 +59,12 @@ define void @streaming_mode_change3() #0 {
 ; CHECK-LABEL: streaming_mode_change3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -73,10 +74,9 @@ define void @streaming_mode_change3() #0 {
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
-;
+
 ; OUTLINER-LABEL: streaming_mode_change3:
 ; OUTLINER-NOT: OUTLINED_FUNCTION_
-;
   call void @callee();
   ret void;
 }
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 1f68815411097..b75accee2899f 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -89,16 +89,14 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x9, x0
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __arm_get_current_vg
-; CHECK-NEXT:    str x0, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    mov x0, x9
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    bl __arm_get_current_vg
 ; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    str x0, [x29, #32]
 ; CHECK-NEXT:    bl __arm_sme_state_size
 ; CHECK-NEXT:    sub sp, sp, x0
 ; CHECK-NEXT:    mov x20, sp
@@ -122,7 +120,7 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 ; CHECK-NEXT:    bl __arm_sme_restore
 ; CHECK-NEXT:    mov x0, x1
 ; CHECK-NEXT:    sub sp, x29, #64
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
@@ -140,16 +138,14 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x9, x0
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __arm_get_current_vg
-; CHECK-NEXT:    str x0, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    mov x0, x9
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    bl __arm_get_current_vg
 ; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    str x0, [x29, #32]
 ; CHECK-NEXT:    bl __arm_sme_state_size
 ; CHECK-NEXT:    sub sp, sp, x0
 ; CHECK-NEXT:    mov x19, sp
@@ -189,7 +185,7 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK-NEXT:    bl __arm_sme_restore
 ; CHECK-NEXT:    mov x0, x1
 ; CHECK-NEXT:    sub sp, x29, #64
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll b/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll
index 8e3866fcec89a..60be329ab8567 100644
--- a/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll
+++ b/llvm/test/CodeGen/AArch64/sme-avoid-coalescing-locally-streaming.ll
@@ -11,28 +11,33 @@ define void @dont_coalesce_args(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind
   ; CHECK-COALESCER-BARRIER-NEXT: {{  $}}
   ; CHECK-COALESCER-BARRIER-NEXT:   [[COPY:%[0-9]+]]:fpr128 = COPY $q0
   ; CHECK-COALESCER-BARRIER-NEXT:   [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]]
-  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-COALESCER-BARRIER-NEXT:   [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF
   ; CHECK-COALESCER-BARRIER-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub
   ; CHECK-COALESCER-BARRIER-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-COALESCER-BARRIER-NEXT:   $z0 = COPY [[INSERT_SUBREG]]
   ; CHECK-COALESCER-BARRIER-NEXT:   BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
   ; CHECK-COALESCER-BARRIER-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
-  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-COALESCER-BARRIER-NEXT:   RET_ReallyLR
   ;
   ; CHECK-REGALLOC-LABEL: name: dont_coalesce_args
   ; CHECK-REGALLOC: bb.0 (%ir-block.0):
   ; CHECK-REGALLOC-NEXT:   liveins: $q0
   ; CHECK-REGALLOC-NEXT: {{  $}}
+  ; CHECK-REGALLOC-NEXT:   renamable $x8 = RDSVLI_XI 1, implicit $vg
+  ; CHECK-REGALLOC-NEXT:   renamable $x8 = UBFMXri killed renamable $x8, 3, 63
+  ; CHECK-REGALLOC-NEXT:   STRXui killed renamable $x8, target-index(saved-streaming-vg-slot), 0
+  ; CHECK-REGALLOC-NEXT:   BL &__arm_get_current_vg, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def dead $lr, implicit $sp, implicit-def $x0
   ; CHECK-REGALLOC-NEXT:   STRQui $q0, %stack.0, 0 :: (store (s128) into %stack.0)
-  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT:   STRXui killed $x0, target-index(saved-vg-slot), 0
+  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-REGALLOC-NEXT:   renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
   ; CHECK-REGALLOC-NEXT:   renamable $q0 = KILL killed renamable $q0, implicit-def $z0
   ; CHECK-REGALLOC-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-REGALLOC-NEXT:   BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
   ; CHECK-REGALLOC-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
-  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-REGALLOC-NEXT:   RET_ReallyLR
   %sa = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %a, i64 0)
   call void @scalable_args(<vscale x 2 x i64> %sa)
@@ -42,26 +47,31 @@ define void @dont_coalesce_args(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind
 define <2 x i64> @dont_coalesce_res() "aarch64_pstate_sm_body" nounwind {
   ; CHECK-COALESCER-BARRIER-LABEL: name: dont_coalesce_res
   ; CHECK-COALESCER-BARRIER: bb.0 (%ir-block.0):
-  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-COALESCER-BARRIER-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-COALESCER-BARRIER-NEXT:   BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
   ; CHECK-COALESCER-BARRIER-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-COALESCER-BARRIER-NEXT:   [[COPY:%[0-9]+]]:zpr = COPY $z0
   ; CHECK-COALESCER-BARRIER-NEXT:   [[COPY1:%[0-9]+]]:fpr128 = COPY [[COPY]].zsub
   ; CHECK-COALESCER-BARRIER-NEXT:   [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY1]]
-  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-COALESCER-BARRIER-NEXT:   $q0 = COPY [[COALESCER_BARRIER_FPR128_]]
   ; CHECK-COALESCER-BARRIER-NEXT:   RET_ReallyLR implicit $q0
   ;
   ; CHECK-REGALLOC-LABEL: name: dont_coalesce_res
   ; CHECK-REGALLOC: bb.0 (%ir-block.0):
-  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT:   renamable $x8 = RDSVLI_XI 1, implicit $vg
+  ; CHECK-REGALLOC-NEXT:   renamable $x8 = UBFMXri killed renamable $x8, 3, 63
+  ; CHECK-REGALLOC-NEXT:   STRXui killed renamable $x8, target-index(saved-streaming-vg-slot), 0
+  ; CHECK-REGALLOC-NEXT:   BL &__arm_get_current_vg, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def dead $lr, implicit $sp, implicit-def $x0
+  ; CHECK-REGALLOC-NEXT:   STRXui killed $x0, target-index(saved-vg-slot), 0
+  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-REGALLOC-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-REGALLOC-NEXT:   BL @scalable_res, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
   ; CHECK-REGALLOC-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-REGALLOC-NEXT:   renamable $q0 = KILL renamable $q0, implicit killed $z0
   ; CHECK-REGALLOC-NEXT:   STRQui killed renamable $q0, %stack.0, 0 :: (store (s128) into %stack.0)
-  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-REGALLOC-NEXT:   $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
   ; CHECK-REGALLOC-NEXT:   RET_ReallyLR implicit $q0
   %sa = call <vscale x 2 x i64> @scalable_res()
@@ -76,7 +86,7 @@ define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_psta
   ; CHECK-COALESCER-BARRIER-NEXT: {{  $}}
   ; CHECK-COALESCER-BARRIER-NEXT:   [[COPY:%[0-9]+]]:fpr128 = COPY $q0
   ; CHECK-COALESCER-BARRIER-NEXT:   [[COALESCER_BARRIER_FPR128_:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COPY]]
-  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-COALESCER-BARRIER-NEXT:   [[DEF:%[0-9]+]]:zpr = IMPLICIT_DEF
   ; CHECK-COALESCER-BARRIER-NEXT:   [[INSERT_SUBREG:%[0-9]+]]:zpr = INSERT_SUBREG [[DEF]], [[COALESCER_BARRIER_FPR128_]], %subreg.zsub
   ; CHECK-COALESCER-BARRIER-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
@@ -84,7 +94,7 @@ define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_psta
   ; CHECK-COALESCER-BARRIER-NEXT:   BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
   ; CHECK-COALESCER-BARRIER-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-COALESCER-BARRIER-NEXT:   [[COALESCER_BARRIER_FPR128_1:%[0-9]+]]:fpr128 = COALESCER_BARRIER_FPR128 [[COALESCER_BARRIER_FPR128_]]
-  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg
+  ; CHECK-COALESCER-BARRIER-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $q0, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-COALESCER-BARRIER-NEXT:   $q0 = COPY [[COALESCER_BARRIER_FPR128_1]]
   ; CHECK-COALESCER-BARRIER-NEXT:   RET_ReallyLR implicit $q0
   ;
@@ -92,14 +102,19 @@ define <2 x i64> @dont_coalesce_arg_that_is_also_res(<2 x i64> %a) "aarch64_psta
   ; CHECK-REGALLOC: bb.0 (%ir-block.0):
   ; CHECK-REGALLOC-NEXT:   liveins: $q0
   ; CHECK-REGALLOC-NEXT: {{  $}}
+  ; CHECK-REGALLOC-NEXT:   renamable $x8 = RDSVLI_XI 1, implicit $vg
+  ; CHECK-REGALLOC-NEXT:   renamable $x8 = UBFMXri killed renamable $x8, 3, 63
+  ; CHECK-REGALLOC-NEXT:   STRXui killed renamable $x8, target-index(saved-streaming-vg-slot), 0
+  ; CHECK-REGALLOC-NEXT:   BL &__arm_get_current_vg, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def dead $lr, implicit $sp, implicit-def $x0
   ; CHECK-REGALLOC-NEXT:   STRQui $q0, %stack.0, 0 :: (store (s128) into %stack.0)
-  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT:   STRXui killed $x0, target-index(saved-vg-slot), 0
+  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-REGALLOC-NEXT:   renamable $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
   ; CHECK-REGALLOC-NEXT:   renamable $q0 = KILL killed renamable $q0, implicit-def $z0
   ; CHECK-REGALLOC-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-REGALLOC-NEXT:   BL @scalable_args, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
   ; CHECK-REGALLOC-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
-  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg
+  ; CHECK-REGALLOC-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def dead $q0, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-REGALLOC-NEXT:   $q0 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
   ; CHECK-REGALLOC-NEXT:   RET_ReallyLR implicit $q0
   %sa = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %a, i64 0)
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index c4440e7bcc3ff..0545b3110e461 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -14,9 +14,9 @@ define void @streaming_compatible() #0 {
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_get_current_vg
-; CHECK-NEXT:    stp x0, x19, [sp, #72] // 16-byte Folded Spill
+; CHECK-NEXT:    str x0, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB0_2
@@ -28,10 +28,9 @@ define void @streaming_compatible() #0 {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -52,10 +51,10 @@ define void @streaming_compatible_arg(float %f) #0 {
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_get_current_vg
-; CHECK-NEXT:    stp x0, x19, [sp, #88] // 16-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    str x0, [sp, #96]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB1_2
@@ -68,10 +67,9 @@ define void @streaming_compatible_arg(float %f) #0 {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index 980144d6ca584..a5f51926899dd 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -9,9 +9,8 @@ declare void @my_func2(<vscale x 16 x i8> %v)
 define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; NOPAIR-LABEL: fbyte:
 ; NOPAIR:       // %bb.0:
-; NOPAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; NOPAIR-NEXT:    cntd x9
-; NOPAIR-NEXT:    stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; NOPAIR-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; NOPAIR-NEXT:    stp x30, x19, [sp, #8] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    addvl sp, sp, #-18
 ; NOPAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; NOPAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -42,7 +41,10 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; NOPAIR-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    addvl sp, sp, #-1
+; NOPAIR-NEXT:    cntd x8
+; NOPAIR-NEXT:    addvl x9, sp, #19
 ; NOPAIR-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; NOPAIR-NEXT:    str x8, [x9, #24]
 ; NOPAIR-NEXT:    bl __arm_sme_state
 ; NOPAIR-NEXT:    and x19, x0, #0x1
 ; NOPAIR-NEXT:    tbz w19, #0, .LBB0_2
@@ -85,15 +87,14 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; NOPAIR-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; NOPAIR-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; NOPAIR-NEXT:    addvl sp, sp, #18
-; NOPAIR-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
-; NOPAIR-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; NOPAIR-NEXT:    ldp x30, x19, [sp, #8] // 16-byte Folded Reload
+; NOPAIR-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; NOPAIR-NEXT:    ret
 ;
 ; PAIR-LABEL: fbyte:
 ; PAIR:       // %bb.0:
-; PAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; PAIR-NEXT:    cntd x9
-; PAIR-NEXT:    stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; PAIR-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; PAIR-NEXT:    stp x30, x19, [sp, #8] // 16-byte Folded Spill
 ; PAIR-NEXT:    addvl sp, sp, #-18
 ; PAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -124,7 +125,10 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; PAIR-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; PAIR-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; PAIR-NEXT:    addvl sp, sp, #-1
+; PAIR-NEXT:    cntd x8
+; PAIR-NEXT:    addvl x9, sp, #19
 ; PAIR-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; PAIR-NEXT:    str x8, [x9, #24]
 ; PAIR-NEXT:    bl __arm_sme_state
 ; PAIR-NEXT:    and x19, x0, #0x1
 ; PAIR-NEXT:    tbz w19, #0, .LBB0_2
@@ -167,8 +171,8 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; PAIR-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; PAIR-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; PAIR-NEXT:    addvl sp, sp, #18
-; PAIR-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
-; PAIR-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; PAIR-NEXT:    ldp x30, x19, [sp, #8] // 16-byte Folded Reload
+; PAIR-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; PAIR-NEXT:    ret
   call void @my_func2(<vscale x 16 x i8> %v)
   ret void
@@ -178,9 +182,9 @@ define void @fhalf(<vscale x 8 x half> %v) #1{
 ; NOPAIR-LABEL: fhalf:
 ; NOPAIR:       // %bb.0:
 ; NOPAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; NOPAIR-NEXT:    cntd x9
-; NOPAIR-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
 ; NOPAIR-NEXT:    addvl sp, sp, #-18
+; NOPAIR-NEXT:    cntd x8
+; NOPAIR-NEXT:    addvl x9, sp, #18
 ; NOPAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; NOPAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; NOPAIR-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -209,6 +213,7 @@ define void @fhalf(<vscale x 8 x half> %v) #1{
 ; NOPAIR-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT:    str x8, [x9, #16]
 ; NOPAIR-NEXT:    smstop sm
 ; NOPAIR-NEXT:    bl my_func
 ; NOPAIR-NEXT:    smstart sm
@@ -247,21 +252,21 @@ define void @fhalf(<vscale x 8 x half> %v) #1{
 ; PAIR-LABEL: fhalf:
 ; PAIR:       // %bb.0:
 ; PAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; PAIR-NEXT:    cntd x9
-; PAIR-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
 ; PAIR-NEXT:    addvl sp, sp, #-18
 ; PAIR-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    ptrue pn8.b
+; PAIR-NEXT:    cntd x8
+; PAIR-NEXT:    addvl x9, sp, #18
 ; PAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
 ; PAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT:    st1b { z20.b, z21.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
 ; PAIR-NEXT:    st1b { z18.b, z19.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
 ; PAIR-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT:    st1b { z16.b, z17.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
 ; PAIR-NEXT:    st1b { z14.b, z15.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill
-; PAIR-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
 ; PAIR-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT:    st1b { z12.b, z13.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
 ; PAIR-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill
 ; PAIR-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
@@ -272,6 +277,7 @@ define void @fhalf(<vscale x 8 x half> %v) #1{
 ; PAIR-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; PAIR-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT:    str x8, [x9, #16]
 ; PAIR-NEXT:    smstop sm
 ; PAIR-NEXT:    bl my_func
 ; PAIR-NEXT:    smstart sm
@@ -308,14 +314,13 @@ define void @ffloat(<vscale x 4 x i32> %v) #2 {
 ; NOPAIR-LABEL: ffloat:
 ; NOPAIR:       // %bb.0:
 ; NOPAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; NOPAIR-NEXT:    rdsvl x9, #1
-; NOPAIR-NEXT:    lsr x9, x9, #3
-; NOPAIR-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
-; NOPAIR-NEXT:    cntd x9
-; NOPAIR-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
 ; NOPAIR-NEXT:    addsvl sp, sp, #-18
+; NOPAIR-NEXT:    rdsvl x8, #1
+; NOPAIR-NEXT:    addsvl x10, sp, #18
 ; NOPAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT:    lsr x8, x8, #3
 ; NOPAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; NOPAIR-NEXT:    cntd x9
 ; NOPAIR-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
 ; NOPAIR-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
 ; NOPAIR-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
@@ -342,6 +347,9 @@ define void @ffloat(<vscale x 4 x i32> %v) #2 {
 ; NOPAIR-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; NOPAIR-NEXT:    str x8, [x10, #16]
+; NOPAIR-NEXT:    addsvl x8, sp, #18
+; NOPAIR-NEXT:    str x9, [x8, #24]
 ; NOPAIR-NEXT:    smstart sm
 ; NOPAIR-NEXT:    smstop sm
 ; NOPAIR-NEXT:    bl my_func
@@ -380,14 +388,13 @@ define void @ffloat(<vscale x 4 x i32> %v) #2 {
 ; PAIR-LABEL: ffloat:
 ; PAIR:       // %bb.0:
 ; PAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; PAIR-NEXT:    rdsvl x9, #1
-; PAIR-NEXT:    lsr x9, x9, #3
-; PAIR-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
-; PAIR-NEXT:    cntd x9
-; PAIR-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
 ; PAIR-NEXT:    addsvl sp, sp, #-18
+; PAIR-NEXT:    rdsvl x8, #1
+; PAIR-NEXT:    addsvl x10, sp, #18
 ; PAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT:    lsr x8, x8, #3
 ; PAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; PAIR-NEXT:    cntd x9
 ; PAIR-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
@@ -414,6 +421,9 @@ define void @ffloat(<vscale x 4 x i32> %v) #2 {
 ; PAIR-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; PAIR-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; PAIR-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; PAIR-NEXT:    str x8, [x10, #16]
+; PAIR-NEXT:    addsvl x8, sp, #18
+; PAIR-NEXT:    str x9, [x8, #24]
 ; PAIR-NEXT:    smstart sm
 ; PAIR-NEXT:    smstop sm
 ; PAIR-NEXT:    bl my_func
diff --git a/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll b/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll
index a08e4896f5ee9..17a85f580bf40 100644
--- a/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll
+++ b/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll
@@ -1,20 +1,17 @@
-; RUN: llc -mtriple=aarch64-darwin -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme -enable-aarch64-sme-peephole-opt=false -verify-machineinstrs < %s | FileCheck %s
 
 declare void @normal_callee();
 
 define void @locally_streaming_fn() #0 {
 ; CHECK-LABEL: locally_streaming_fn:
-; CHECK:       ; %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! ; 16-byte Folded Spill
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    stp d13, d12, [sp, #16] ; 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    stp d11, d10, [sp, #32] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #48] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] ; 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] ; 8-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
@@ -25,18 +22,23 @@ define void @locally_streaming_fn() #0 {
 ; CHECK-NEXT:    .cfi_offset b13, -80
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    str x9, [sp, #80]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    bl _normal_callee
+; CHECK-NEXT:    bl normal_callee
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] ; 8-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    .cfi_restore b8
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 4a52bf27a7591..ce73f895738e8 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -17,8 +17,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
 ; CHECK-FISEL-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-FISEL-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-FISEL-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-FISEL-NEXT:    cntd x9
-; CHECK-FISEL-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
+; CHECK-FISEL-NEXT:    cntd x8
+; CHECK-FISEL-NEXT:    str x8, [sp, #88]
 ; CHECK-FISEL-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-FISEL-NEXT:    smstart sm
 ; CHECK-FISEL-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -45,8 +45,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
 ; CHECK-GISEL-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GISEL-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GISEL-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-GISEL-NEXT:    cntd x9
-; CHECK-GISEL-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
+; CHECK-GISEL-NEXT:    cntd x8
+; CHECK-GISEL-NEXT:    str x8, [sp, #88]
 ; CHECK-GISEL-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-GISEL-NEXT:    smstart sm
 ; CHECK-GISEL-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -80,8 +80,8 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    cntd x8
+; CHECK-COMMON-NEXT:    str x8, [sp, #88]
 ; CHECK-COMMON-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -114,11 +114,11 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    rdsvl x9, #1
-; CHECK-COMMON-NEXT:    lsr x9, x9, #3
-; CHECK-COMMON-NEXT:    str x9, [sp, #104] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    str x9, [sp, #112] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    lsr x8, x8, #3
+; CHECK-COMMON-NEXT:    str x8, [sp, #104]
+; CHECK-COMMON-NEXT:    cntd x8
+; CHECK-COMMON-NEXT:    str x8, [sp, #112]
 ; CHECK-COMMON-NEXT:    str d0, [sp, #24] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    ldr d0, [sp, #24] // 8-byte Folded Reload
@@ -182,11 +182,11 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    rdsvl x9, #1
-; CHECK-COMMON-NEXT:    lsr x9, x9, #3
-; CHECK-COMMON-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    lsr x8, x8, #3
+; CHECK-COMMON-NEXT:    str x8, [sp, #72]
+; CHECK-COMMON-NEXT:    cntd x8
+; CHECK-COMMON-NEXT:    str x8, [sp, #80]
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    blr x0
 ; CHECK-COMMON-NEXT:    smstop sm
@@ -208,8 +208,8 @@ define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    cntd x8
+; CHECK-COMMON-NEXT:    str x8, [sp, #72]
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    blr x0
 ; CHECK-COMMON-NEXT:    smstop sm
@@ -339,13 +339,14 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
 ; CHECK-COMMON-LABEL: f128_call_sm:
 ; CHECK-COMMON:       // %bb.0:
 ; CHECK-COMMON-NEXT:    sub sp, sp, #112
-; CHECK-COMMON-NEXT:    cntd x9
+; CHECK-COMMON-NEXT:    cntd x8
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x8, [sp, #104]
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
 ; CHECK-COMMON-NEXT:    bl __addtf3
@@ -403,13 +404,14 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
 ; CHECK-COMMON-LABEL: frem_call_sm:
 ; CHECK-COMMON:       // %bb.0:
 ; CHECK-COMMON-NEXT:    sub sp, sp, #96
-; CHECK-COMMON-NEXT:    cntd x9
+; CHECK-COMMON-NEXT:    cntd x8
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x8, [sp, #88]
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    bl fmodf
@@ -432,14 +434,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
 ; CHECK-COMMON-LABEL: frem_call_sm_compat:
 ; CHECK-COMMON:       // %bb.0:
 ; CHECK-COMMON-NEXT:    sub sp, sp, #112
-; CHECK-COMMON-NEXT:    cntd x9
+; CHECK-COMMON-NEXT:    cntd x8
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x8, [sp, #96]
 ; CHECK-COMMON-NEXT:    bl __arm_sme_state
 ; CHECK-COMMON-NEXT:    and x19, x0, #0x1
 ; CHECK-COMMON-NEXT:    tbz w19, #0, .LBB12_2
@@ -454,10 +456,9 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:  .LBB12_4:
 ; CHECK-COMMON-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    add sp, sp, #112
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index e463e833bdbde..018f5f801d09c 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -128,18 +128,18 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    cntd x10
 ; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    str x10, [x29, #32]
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-80]
 ; CHECK-NEXT:    sub x9, x29, #80
@@ -167,7 +167,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:  .LBB3_6:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    sub sp, x29, #64
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll b/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll
new file mode 100644
index 0000000000000..2d49ea542b96e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -O0 < %s | FileCheck %s
+
+; Example of a locally streaming function that (at -O0) must preserve the LR (X30)
+; before calling __arm_get_current_vg.
+define void @foo() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: foo:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    .cfi_offset b8, -40
+; CHECK-NEXT:    .cfi_offset b9, -48
+; CHECK-NEXT:    .cfi_offset b10, -56
+; CHECK-NEXT:    .cfi_offset b11, -64
+; CHECK-NEXT:    .cfi_offset b12, -72
+; CHECK-NEXT:    .cfi_offset b13, -80
+; CHECK-NEXT:    .cfi_offset b14, -88
+; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    bl __arm_get_current_vg
+; CHECK-NEXT:    str x0, [sp, #80]
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore b8
+; CHECK-NEXT:    .cfi_restore b9
+; CHECK-NEXT:    .cfi_restore b10
+; CHECK-NEXT:    .cfi_restore b11
+; CHECK-NEXT:    .cfi_restore b12
+; CHECK-NEXT:    .cfi_restore b13
+; CHECK-NEXT:    .cfi_restore b14
+; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    ret
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 130a316bcc2ba..d6096ce86233e 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -11,11 +11,12 @@ define void @test0(ptr %callee) nounwind {
 ; CHECK-LABEL: test0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl callee_sm
 ; CHECK-NEXT:    bl callee_sm
@@ -36,11 +37,12 @@ define void @test1() nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    bl callee
@@ -62,12 +64,12 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB2_2
@@ -90,10 +92,9 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:  // %bb.7:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB2_8:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -107,12 +108,12 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbnz w19, #0, .LBB3_2
@@ -146,10 +147,9 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:  // %bb.11:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB3_12:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -164,11 +164,12 @@ define void @test4() nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    fmov s0, wzr
 ; CHECK-NEXT:    bl callee_farg
@@ -191,13 +192,14 @@ define void @test5(float %f) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test5:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #88]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    bl callee_farg
@@ -220,13 +222,14 @@ define float @test6(float %f) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test6:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #88]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    bl callee_farg_fret
@@ -279,11 +282,12 @@ define void @test8() nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -322,14 +326,10 @@ define void @test10() "aarch64_pstate_sm_body" {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    rdsvl x9, #1
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
@@ -340,6 +340,11 @@ define void @test10() "aarch64_pstate_sm_body" {
 ; CHECK-NEXT:    .cfi_offset b13, -80
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    str x9, [sp, #80]
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    .cfi_restore vg
@@ -375,13 +380,13 @@ define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test11:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -390,10 +395,9 @@ define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -411,14 +415,10 @@ define void @test12() "aarch64_pstate_sm_body" {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    rdsvl x9, #1
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
@@ -429,6 +429,11 @@ define void @test12() "aarch64_pstate_sm_body" {
 ; CHECK-NEXT:    .cfi_offset b13, -80
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    str x9, [sp, #80]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    .cfi_offset vg, -24
@@ -467,15 +472,17 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test13:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [x9, #88]
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
@@ -490,8 +497,8 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index 5ea5e3e7766e8..622877d48fb35 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -16,15 +16,17 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    str x8, [x9, #88]
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl use_i8
@@ -32,8 +34,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -49,15 +51,17 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    str x8, [x9, #88]
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl use_i16
@@ -65,8 +69,8 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -82,15 +86,17 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    str x8, [x9, #88]
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl use_i32
@@ -98,8 +104,8 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -115,15 +121,17 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    str x8, [x9, #88]
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl use_i64
@@ -131,8 +139,8 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -148,19 +156,21 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr h0, [sp, #14] // 2-byte Folded Reload
@@ -171,8 +181,8 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -188,19 +198,21 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
@@ -211,8 +223,8 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -228,19 +240,21 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
@@ -251,8 +265,8 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -273,19 +287,21 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
@@ -296,8 +312,8 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -314,19 +330,21 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
@@ -337,8 +355,8 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -355,19 +373,21 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
@@ -378,8 +398,8 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -396,19 +416,21 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
@@ -419,8 +441,8 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -437,19 +459,21 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr h0, [sp, #14] // 2-byte Folded Reload
@@ -460,8 +484,8 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -478,19 +502,21 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
@@ -501,8 +527,8 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -519,19 +545,21 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
@@ -542,8 +570,8 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -564,19 +592,21 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -587,8 +617,8 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -604,19 +634,21 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -627,8 +659,8 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -644,19 +676,21 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -667,8 +701,8 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -684,19 +718,21 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -707,8 +743,8 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -724,19 +760,21 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -747,8 +785,8 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -764,19 +802,21 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -787,8 +827,8 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -804,19 +844,21 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -827,8 +869,8 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -844,19 +886,21 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -867,8 +911,8 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -887,23 +931,25 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v8i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
 ; CHECK-NEXT:    mov z1.d, z0.d
 ; CHECK-NEXT:    ptrue p0.b
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    str p0, [x8, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
@@ -914,8 +960,8 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str p0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -935,22 +981,21 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_i8
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -964,22 +1009,21 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_i16
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -993,22 +1037,21 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_i32
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    fmov s0, w0
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1022,22 +1065,21 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_i64
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    fmov d0, x0
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -1051,25 +1093,24 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_f16
 ; CHECK-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr h0, [sp, #14] // 2-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1084,24 +1125,23 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_f32
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1116,24 +1156,23 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_f64
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1152,24 +1191,23 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1i8
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1185,24 +1223,23 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1i16
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1218,24 +1255,23 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1i32
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1251,24 +1287,23 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1i64
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1284,25 +1319,24 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1f16
 ; CHECK-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr h0, [sp, #14] // 2-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1318,24 +1352,23 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1f32
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1351,24 +1384,23 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1f64
 ; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1388,25 +1420,24 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v16i8
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1421,25 +1452,24 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v8i16
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1454,25 +1484,24 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v4i32
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1487,25 +1516,24 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v2i64
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1520,25 +1548,24 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v8f16
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1553,25 +1580,24 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v4f32
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
@@ -1586,25 +1612,24 @@ define void @dont_coalesce_res_v2f64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x8, [sp, #96]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v2f64
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #112
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index 1a49da84c00ce..036baff0b6e5a 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -9,14 +9,15 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
 ; CHECK-LABEL: sm_body_sm_compatible_simple:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
+; CHECK-NEXT:    lsr x8, x8, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    str x9, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x8, x0, #0x1
 ; CHECK-NEXT:    tbnz w8, #0, .LBB0_2
@@ -41,14 +42,15 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
 ; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
+; CHECK-NEXT:    lsr x8, x8, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
+; CHECK-NEXT:    str x9, [sp, #88]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbnz w19, #0, .LBB1_2
@@ -62,10 +64,9 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -79,14 +80,15 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    cntd x10
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #80]
+; CHECK-NEXT:    str x10, [sp, #88]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbnz w19, #0, .LBB2_2
@@ -100,10 +102,9 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
 ; CHECK-NEXT:  // %bb.4: // %if.else
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB2_5: // %if.else
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -115,10 +116,9 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
 ; CHECK-NEXT:  // %bb.7: // %if.then
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB2_8: // %if.then
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
index dd336e0f2e686..5fe9066192084 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -9,14 +9,15 @@ define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body
 ; CHECK-LABEL: locally_streaming_caller_streaming_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
+; CHECK-NEXT:    lsr x8, x8, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    str x9, [sp, #80]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_compatible_callee
 ; CHECK-NEXT:    bl streaming_compatible_callee
@@ -51,33 +52,31 @@ define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_
 define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: locally_streaming_multiple_exit:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    str x9, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #64]
+; CHECK-NEXT:    str x9, [sp, #72]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    cmp x0, #1
 ; CHECK-NEXT:    b.ne .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %if.else
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_2: // %if.end
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 
 entry:
@@ -99,16 +98,16 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
 ; CHECK-LABEL: locally_streaming_caller_no_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
+; CHECK-NEXT:    str x9, [sp, #88]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    index z0.d, #0, #1
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
@@ -118,11 +117,11 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
 
@@ -156,16 +155,17 @@ define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i
 ; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
+; CHECK-NEXT:    lsr x8, x8, #3
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #88]
+; CHECK-NEXT:    str x9, [sp, #96]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl streaming_compatible_callee_vec_args_ret
@@ -189,16 +189,17 @@ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct
 ; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
+; CHECK-NEXT:    lsr x8, x8, #3
 ; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #112] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
 ; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #104]
+; CHECK-NEXT:    str x9, [sp, #112]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl streaming_compatible_callee_vec_arg_struct_ret
@@ -225,16 +226,18 @@ define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body"
 ; CHECK-LABEL: locally_streaming_caller_alloca:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; CHECK-NEXT:    addsvl sp, sp, #-1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    addsvl x10, sp, #1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    str x8, [x10, #80]
+; CHECK-NEXT:    addsvl x8, sp, #1
+; CHECK-NEXT:    str x9, [x8, #88]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    mov x0, sp
 ; CHECK-NEXT:    bl use_ptr
@@ -272,16 +275,16 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat
 ; CHECK-LABEL: test_arg_survives_loop:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
+; CHECK-NEXT:    str x9, [sp, #88]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB9_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -293,11 +296,11 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat
 ; CHECK-NEXT:    fadd s0, s1, s0
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
 entry:
@@ -319,14 +322,15 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: disable_tailcallopt:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
+; CHECK-NEXT:    lsr x8, x8, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    str x9, [sp, #80]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_compatible_callee
 ; CHECK-NEXT:    smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index e967f3b7be5e8..17f08ddfa04ef 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -37,12 +37,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
 ; CHECK-LABEL: streaming_compatible_caller_normal_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB1_2
@@ -54,10 +54,9 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -76,12 +75,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
 ; CHECK-LABEL: streaming_compatible_caller_streaming_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbnz w19, #0, .LBB2_2
@@ -93,10 +92,9 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -130,17 +128,19 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
 ; CHECK-LABEL: streaming_compatible_with_neon_vectors:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    ldr z0, [x8] // 16-byte Folded Reload
@@ -167,8 +167,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -183,9 +183,8 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>)
 define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: streaming_compatible_with_scalable_vectors:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #8] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-18
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -216,7 +215,10 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
 ; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #20
 ; CHECK-NEXT:    str z0, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [x9, #24]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB5_2
@@ -263,8 +265,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
-; CHECK-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg)
   %fadd = fadd <vscale x 2 x double> %res, %arg
@@ -276,9 +278,8 @@ declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x doub
 define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: streaming_compatible_with_predicate_vectors:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #8] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-18
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -309,7 +310,10 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
 ; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #19
 ; CHECK-NEXT:    str p0, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str x8, [x9, #24]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB6_2
@@ -356,8 +360,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
-; CHECK-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
   %and = and <vscale x 2 x i1> %res, %arg
@@ -370,12 +374,12 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
 ; CHECK-LABEL: conditional_smstart_unreachable_block:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbnz w19, #0, .LBB7_2
@@ -394,15 +398,15 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
 define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: conditional_smstart_no_successor_block:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    tbz w0, #0, .LBB8_6
 ; CHECK-NEXT:  // %bb.1: // %if.then
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbnz w19, #0, .LBB8_3
@@ -414,10 +418,9 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
 ; CHECK-NEXT:  // %bb.4: // %if.then
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB8_5: // %if.then
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:  .LBB8_6: // %exit
@@ -436,12 +439,12 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: disable_tailcallopt:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB9_2
@@ -453,10 +456,9 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB9_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -470,14 +472,12 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #128
 ; CHECK-NEXT:    .cfi_def_cfa_offset 128
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    stp x30, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -24
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
@@ -487,13 +487,15 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK-NEXT:    .cfi_offset b13, -80
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    cntd x10
 ; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x8, x1
-; CHECK-NEXT:    mov x9, x0
 ; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    str x10, [sp, #112]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
-; CHECK-NEXT:    .cfi_offset vg, -24
+; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    tbz w19, #0, .LBB10_2
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    smstop sm
@@ -508,10 +510,9 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB10_4: // %entry
 ; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x30, x19, [sp, #96] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #112] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #128
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 438b941198449..3b03dab5fbbe6 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -22,11 +22,12 @@ define void @normal_caller_streaming_callee() nounwind {
 ; CHECK-LABEL: normal_caller_streaming_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
@@ -48,11 +49,12 @@ define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enable
 ; CHECK-LABEL: streaming_caller_normal_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl normal_callee
 ; CHECK-NEXT:    smstart sm
@@ -105,11 +107,12 @@ define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
 ; CHECK-LABEL: call_to_function_pointer_streaming_enabled:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    blr x0
 ; CHECK-NEXT:    smstop sm
@@ -128,13 +131,14 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
 ; CHECK-LABEL: smstart_clobber_simdfp:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #88]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
@@ -155,8 +159,6 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
 ; CHECK-LABEL: smstart_clobber_sve:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-18
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -187,7 +189,10 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
 ; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #19
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [x9, #16]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
@@ -234,8 +239,6 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
 ; CHECK-LABEL: smstart_clobber_sve_duplicate:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-18
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -266,7 +269,10 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
 ; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #19
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [x9, #16]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    bl streaming_callee
@@ -314,13 +320,15 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
 ; CHECK-LABEL: call_to_intrinsic_without_chain:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d0, d0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #88]
+; CHECK-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
 ; CHECK-NEXT:    bl cos
@@ -349,11 +357,12 @@ define void @disable_tailcallopt() nounwind {
 ; CHECK-LABEL: disable_tailcallopt:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
@@ -371,14 +380,15 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
 ; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #3
 ; CHECK-NEXT:    rdsvl x3, #1
+; CHECK-NEXT:    str x8, [x9, #80]
 ; CHECK-NEXT:    addvl x0, sp, #2
 ; CHECK-NEXT:    addvl x1, sp, #1
 ; CHECK-NEXT:    mov x2, sp
@@ -409,14 +419,15 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK-LABEL: call_to_non_streaming_pass_args:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
 ; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #104]
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d2, d3, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index fe3f493353b50..d087b23a64921 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -15,15 +15,17 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
 ; CHECK-LABEL: test_no_stackslot_scavenging:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x24, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x24, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #1
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
+; CHECK-NEXT:    str x8, [x9, #104]
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    smstop sm
@@ -32,8 +34,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x24, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x24, [sp, #72] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -49,20 +51,20 @@ define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-po
 ; CHECK-LABEL: test_no_stackslot_scavenging_with_fp:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-128]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x28, x25, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x28, x25, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x19, [sp, #96] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    lsl x9, x0, #3
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str s0, [x29, #60] // 4-byte Folded Spill
+; CHECK-NEXT:    str x8, [x29, #48]
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    mov x19, sp
-; CHECK-NEXT:    str s0, [x29, #28] // 4-byte Folded Spill
 ; CHECK-NEXT:    add x9, x9, #15
 ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
 ; CHECK-NEXT:    sub x8, x8, x9
@@ -70,12 +72,12 @@ define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-po
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldr s0, [x29, #28] // 4-byte Folded Reload
+; CHECK-NEXT:    ldr s0, [x29, #60] // 4-byte Folded Reload
 ; CHECK-NEXT:    bl use_f
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    sub sp, x29, #64
-; CHECK-NEXT:    ldp x24, x19, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x28, x25, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x25, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 0853325e449af..91f7ce05b80bc 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -17,11 +17,10 @@ define void @vg_unwind_simple() #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    .cfi_offset b8, -24
 ; CHECK-NEXT:    .cfi_offset b9, -32
@@ -31,6 +30,8 @@ define void @vg_unwind_simple() #0 {
 ; CHECK-NEXT:    .cfi_offset b13, -64
 ; CHECK-NEXT:    .cfi_offset b14, -72
 ; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    .cfi_offset vg, -8
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
@@ -57,12 +58,10 @@ define void @vg_unwind_simple() #0 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 96
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
@@ -75,6 +74,8 @@ define void @vg_unwind_simple() #0 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
+; FP-CHECK-NEXT:    cntd x8
+; FP-CHECK-NEXT:    str x8, [x29, #16]
 ; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
@@ -101,7 +102,6 @@ define void @vg_unwind_simple() #0 {
 ;
 ; OUTLINER-CHECK-LABEL: vg_unwind_simple:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void @callee();
   ret void;
 }
@@ -114,13 +114,11 @@ define void @vg_unwind_needs_gap() #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x20, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    stp x30, x20, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w20, -24
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
@@ -130,17 +128,18 @@ define void @vg_unwind_needs_gap() #0 {
 ; CHECK-NEXT:    .cfi_offset b13, -80
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    .cfi_offset vg, -24
+; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x30, x20, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x20, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
@@ -160,15 +159,14 @@ define void @vg_unwind_needs_gap() #0 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 96
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    stp x9, x20, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT:    str x20, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
-; FP-CHECK-NEXT:    .cfi_offset w20, -8
+; FP-CHECK-NEXT:    .cfi_offset w20, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -179,16 +177,18 @@ define void @vg_unwind_needs_gap() #0 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
+; FP-CHECK-NEXT:    cntd x8
+; FP-CHECK-NEXT:    str x8, [x29, #24]
 ; FP-CHECK-NEXT:    //APP
 ; FP-CHECK-NEXT:    //NO_APP
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
+; FP-CHECK-NEXT:    .cfi_offset vg, -8
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    smstart sm
 ; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; FP-CHECK-NEXT:    ldr x20, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT:    ldr x20, [sp, #80] // 8-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -209,7 +209,6 @@ define void @vg_unwind_needs_gap() #0 {
 ;
 ; OUTLINER-CHECK-LABEL: vg_unwind_needs_gap:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void asm sideeffect "", "~{x20}"()
   call void @callee();
   ret void;
@@ -220,12 +219,11 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    .cfi_offset b8, -24
 ; CHECK-NEXT:    .cfi_offset b9, -32
@@ -235,7 +233,9 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ; CHECK-NEXT:    .cfi_offset b13, -64
 ; CHECK-NEXT:    .cfi_offset b14, -72
 ; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #88]
 ; CHECK-NEXT:    .cfi_offset vg, -8
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -264,13 +264,11 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    sub sp, sp, #112
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 112
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #80] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    str x9, [sp, #96] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #80
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
@@ -283,7 +281,9 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
+; FP-CHECK-NEXT:    cntd x8
 ; FP-CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; FP-CHECK-NEXT:    str x8, [x29, #16]
 ; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -312,7 +312,6 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ;
 ; OUTLINER-CHECK-LABEL: vg_unwind_with_fixed_args:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void @fixed_callee(<4 x i32> %x);
   ret void;
 }
@@ -320,11 +319,10 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-LABEL: vg_unwind_with_sve_args:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w28, -8
+; CHECK-NEXT:    stp x30, x28, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w28, -16
 ; CHECK-NEXT:    .cfi_offset w30, -24
 ; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    addvl sp, sp, #-18
@@ -361,10 +359,13 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    addvl x9, sp, #19
 ; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str x8, [x9, #24]
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset vg, -8
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl scalable_callee
@@ -404,8 +405,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-NEXT:    .cfi_restore z13
 ; CHECK-NEXT:    .cfi_restore z14
 ; CHECK-NEXT:    .cfi_restore z15
-; CHECK-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x28, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w28
 ; CHECK-NEXT:    .cfi_restore w30
@@ -416,13 +417,11 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 48
-; FP-CHECK-NEXT:    cntd x9
-; FP-CHECK-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; FP-CHECK-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    mov x29, sp
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 48
-; FP-CHECK-NEXT:    .cfi_offset w27, -8
-; FP-CHECK-NEXT:    .cfi_offset w28, -16
+; FP-CHECK-NEXT:    .cfi_offset w27, -24
+; FP-CHECK-NEXT:    .cfi_offset w28, -32
 ; FP-CHECK-NEXT:    .cfi_offset w30, -40
 ; FP-CHECK-NEXT:    .cfi_offset w29, -48
 ; FP-CHECK-NEXT:    addvl sp, sp, #-18
@@ -457,10 +456,12 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
 ; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
 ; FP-CHECK-NEXT:    addvl sp, sp, #-1
+; FP-CHECK-NEXT:    cntd x8
 ; FP-CHECK-NEXT:    str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT:    str x8, [x29, #32]
 ; FP-CHECK-NEXT:    //APP
 ; FP-CHECK-NEXT:    //NO_APP
-; FP-CHECK-NEXT:    .cfi_offset vg, -32
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    bl scalable_callee
@@ -499,7 +500,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; FP-CHECK-NEXT:    .cfi_restore z14
 ; FP-CHECK-NEXT:    .cfi_restore z15
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 48
-; FP-CHECK-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; FP-CHECK-NEXT:    .cfi_restore w27
@@ -510,7 +511,6 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ;
 ; OUTLINER-CHECK-LABEL: vg_unwind_with_sve_args:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void asm sideeffect "", "~{x28}"()
   call void @scalable_callee(<vscale x 2 x i64> %x);
   ret void;
@@ -524,12 +524,10 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w30, -24
 ; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
@@ -550,8 +548,12 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; CHECK-NEXT:    b.ne .LBB4_1
 ; CHECK-NEXT:  // %bb.2: // %entry
 ; CHECK-NEXT:    .cfi_def_cfa_register wsp
-; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add x10, sp, #72, lsl #12 // =294912
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    add x10, x10, #88
+; CHECK-NEXT:    str x8, [x10, #32760]
+; CHECK-NEXT:    str x9, [x0]
 ; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
@@ -581,15 +583,14 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; FP-CHECK:       // %bb.0: // %entry
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 96
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT:    str x28, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
-; FP-CHECK-NEXT:    .cfi_offset w28, -8
+; FP-CHECK-NEXT:    .cfi_offset w28, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -608,9 +609,11 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; FP-CHECK-NEXT:    str xzr, [sp]
 ; FP-CHECK-NEXT:    b.ne .LBB4_1
 ; FP-CHECK-NEXT:  // %bb.2: // %entry
-; FP-CHECK-NEXT:    mov x8, sp
-; FP-CHECK-NEXT:    str x8, [x0]
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
+; FP-CHECK-NEXT:    cntd x8
+; FP-CHECK-NEXT:    mov x9, sp
+; FP-CHECK-NEXT:    str x8, [x29, #24]
+; FP-CHECK-NEXT:    str x9, [x0]
+; FP-CHECK-NEXT:    .cfi_offset vg, -8
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    smstart sm
@@ -618,7 +621,7 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; FP-CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; FP-CHECK-NEXT:    ldr x28, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT:    ldr x28, [sp, #80] // 8-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -639,7 +642,6 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ;
 ; OUTLINER-CHECK-LABEL: vg_unwind_multiple_scratch_regs:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
 entry:
   %v = alloca i8, i64 327680, align 1
   store ptr %v, ptr %out, align 8
@@ -655,14 +657,10 @@ define void @vg_locally_streaming_fn() #3 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    rdsvl x9, #1
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
@@ -673,6 +671,11 @@ define void @vg_locally_streaming_fn() #3 {
 ; CHECK-NEXT:    .cfi_offset b13, -80
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    lsr x8, x8, #3
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    str x9, [sp, #80]
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    .cfi_restore vg
@@ -701,15 +704,10 @@ define void @vg_locally_streaming_fn() #3 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 96
-; FP-CHECK-NEXT:    rdsvl x9, #1
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    lsr x9, x9, #3
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; FP-CHECK-NEXT:    .cfi_offset vg, -8
@@ -723,6 +721,11 @@ define void @vg_locally_streaming_fn() #3 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
+; FP-CHECK-NEXT:    rdsvl x8, #1
+; FP-CHECK-NEXT:    cntd x9
+; FP-CHECK-NEXT:    lsr x8, x8, #3
+; FP-CHECK-NEXT:    str x8, [x29, #16]
+; FP-CHECK-NEXT:    str x9, [x29, #24]
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    smstart sm
 ; FP-CHECK-NEXT:    .cfi_restore vg
@@ -751,7 +754,6 @@ define void @vg_locally_streaming_fn() #3 {
 ;
 ; OUTLINER-CHECK-LABEL: vg_locally_streaming_fn:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void @callee()
   call void @streaming_callee()
   call void @callee()
@@ -763,13 +765,11 @@ define void @streaming_compatible_to_streaming() #4 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -24
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
@@ -779,9 +779,11 @@ define void @streaming_compatible_to_streaming() #4 {
 ; CHECK-NEXT:    .cfi_offset b13, -80
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
-; CHECK-NEXT:    .cfi_offset vg, -24
+; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    tbnz w19, #0, .LBB6_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstart sm
@@ -792,10 +794,9 @@ define void @streaming_compatible_to_streaming() #4 {
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB6_4:
 ; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
@@ -815,15 +816,14 @@ define void @streaming_compatible_to_streaming() #4 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 96
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
-; FP-CHECK-NEXT:    .cfi_offset w19, -8
+; FP-CHECK-NEXT:    .cfi_offset w19, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -834,9 +834,11 @@ define void @streaming_compatible_to_streaming() #4 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
+; FP-CHECK-NEXT:    cntd x8
+; FP-CHECK-NEXT:    str x8, [x29, #24]
 ; FP-CHECK-NEXT:    bl __arm_sme_state
 ; FP-CHECK-NEXT:    and x19, x0, #0x1
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
+; FP-CHECK-NEXT:    .cfi_offset vg, -8
 ; FP-CHECK-NEXT:    tbnz w19, #0, .LBB6_2
 ; FP-CHECK-NEXT:  // %bb.1:
 ; FP-CHECK-NEXT:    smstart sm
@@ -849,7 +851,7 @@ define void @streaming_compatible_to_streaming() #4 {
 ; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; FP-CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -870,7 +872,6 @@ define void @streaming_compatible_to_streaming() #4 {
 ;
 ; OUTLINER-CHECK-LABEL: streaming_compatible_to_streaming:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void @streaming_callee()
   ret void
 }
@@ -880,13 +881,11 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -24
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
@@ -896,9 +895,11 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; CHECK-NEXT:    .cfi_offset b13, -80
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str x8, [sp, #80]
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
-; CHECK-NEXT:    .cfi_offset vg, -24
+; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    tbz w19, #0, .LBB7_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
@@ -909,10 +910,9 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB7_4:
 ; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
@@ -932,15 +932,14 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 96
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
-; FP-CHECK-NEXT:    .cfi_offset w19, -8
+; FP-CHECK-NEXT:    .cfi_offset w19, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -951,9 +950,11 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
+; FP-CHECK-NEXT:    cntd x8
+; FP-CHECK-NEXT:    str x8, [x29, #24]
 ; FP-CHECK-NEXT:    bl __arm_sme_state
 ; FP-CHECK-NEXT:    and x19, x0, #0x1
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
+; FP-CHECK-NEXT:    .cfi_offset vg, -8
 ; FP-CHECK-NEXT:    tbz w19, #0, .LBB7_2
 ; FP-CHECK-NEXT:  // %bb.1:
 ; FP-CHECK-NEXT:    smstop sm
@@ -966,7 +967,7 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; FP-CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -987,7 +988,6 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ;
 ; OUTLINER-CHECK-LABEL: streaming_compatible_to_non_streaming:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void @callee()
   ret void
 }
@@ -1003,16 +1003,13 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; NO-SVE-CHECK-NEXT:    .cfi_def_cfa_offset 96
 ; NO-SVE-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT:    mov x9, x0
 ; NO-SVE-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; NO-SVE-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; NO-SVE-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT:    bl __arm_get_current_vg
-; NO-SVE-CHECK-NEXT:    stp x0, x19, [sp, #80] // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT:    mov x0, x9
+; NO-SVE-CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; NO-SVE-CHECK-NEXT:    add x29, sp, #64
 ; NO-SVE-CHECK-NEXT:    .cfi_def_cfa w29, 32
-; NO-SVE-CHECK-NEXT:    .cfi_offset w19, -8
+; NO-SVE-CHECK-NEXT:    .cfi_offset w19, -16
 ; NO-SVE-CHECK-NEXT:    .cfi_offset w30, -24
 ; NO-SVE-CHECK-NEXT:    .cfi_offset w29, -32
 ; NO-SVE-CHECK-NEXT:    .cfi_offset b8, -40
@@ -1023,10 +1020,12 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:    .cfi_offset b13, -80
 ; NO-SVE-CHECK-NEXT:    .cfi_offset b14, -88
 ; NO-SVE-CHECK-NEXT:    .cfi_offset b15, -96
+; NO-SVE-CHECK-NEXT:    bl __arm_get_current_vg
 ; NO-SVE-CHECK-NEXT:    mov w8, w0
+; NO-SVE-CHECK-NEXT:    str x0, [x29, #24]
 ; NO-SVE-CHECK-NEXT:    bl __arm_sme_state
 ; NO-SVE-CHECK-NEXT:    and x19, x0, #0x1
-; NO-SVE-CHECK-NEXT:    .cfi_offset vg, -16
+; NO-SVE-CHECK-NEXT:    .cfi_offset vg, -8
 ; NO-SVE-CHECK-NEXT:    tbnz w19, #0, .LBB8_2
 ; NO-SVE-CHECK-NEXT:  // %bb.1:
 ; NO-SVE-CHECK-NEXT:    smstart sm
@@ -1040,7 +1039,7 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:    .cfi_restore vg
 ; NO-SVE-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; NO-SVE-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; NO-SVE-CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; NO-SVE-CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; NO-SVE-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -1058,10 +1057,8 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:    .cfi_restore b14
 ; NO-SVE-CHECK-NEXT:    .cfi_restore b15
 ; NO-SVE-CHECK-NEXT:    ret
-;
 ; OUTLINER-CHECK-LABEL: streaming_compatible_no_sve:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void @streaming_callee_with_arg(i32 %x)
   ret void
 }
@@ -1072,30 +1069,28 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; user-code as if it is part of the frame-setup when doing so.
 define void @test_rdsvl_right_after_prologue(i64 %x0) nounwind {
 ; NO-SVE-CHECK-LABEL: test_rdsvl_right_after_prologue:
-; NO-SVE-CHECK:     // %bb.0:
-; NO-SVE-CHECK-NEXT: stp     d15, d14, [sp, #-96]!           // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: stp     d13, d12, [sp, #16]             // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: mov     x9, x0
-; NO-SVE-CHECK-NEXT: stp     d11, d10, [sp, #32]             // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: stp     d9, d8, [sp, #48]               // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: stp     x29, x30, [sp, #64]             // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: bl      __arm_get_current_vg
-; NO-SVE-CHECK-NEXT: str     x0, [sp, #80]                   // 8-byte Folded Spill
-; NO-SVE-CHECK-NEXT: mov     x0, x9
-; NO-SVE-CHECK-NEXT: rdsvl   x8, #1
-; NO-SVE-CHECK-NEXT: add     x29, sp, #64
-; NO-SVE-CHECK-NEXT: lsr     x8, x8, #3
-; NO-SVE-CHECK-NEXT: mov     x1, x0
-; NO-SVE-CHECK-NEXT: smstart sm
-; NO-SVE-CHECK-NEXT: mov     x0, x8
-; NO-SVE-CHECK-NEXT: bl      bar
-; NO-SVE-CHECK-NEXT: smstop  sm
-; NO-SVE-CHECK-NEXT: ldp     x29, x30, [sp, #64]             // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldp     d9, d8, [sp, #48]               // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldp     d11, d10, [sp, #32]             // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldp     d13, d12, [sp, #16]             // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldp     d15, d14, [sp], #96             // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ret
+; NO-SVE-CHECK:       // %bb.0:
+; NO-SVE-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    add x29, sp, #64
+; NO-SVE-CHECK-NEXT:    bl __arm_get_current_vg
+; NO-SVE-CHECK-NEXT:    rdsvl x8, #1
+; NO-SVE-CHECK-NEXT:    mov x1, x0
+; NO-SVE-CHECK-NEXT:    lsr x8, x8, #3
+; NO-SVE-CHECK-NEXT:    str x0, [x29, #16]
+; NO-SVE-CHECK-NEXT:    smstart sm
+; NO-SVE-CHECK-NEXT:    mov x0, x8
+; NO-SVE-CHECK-NEXT:    bl bar
+; NO-SVE-CHECK-NEXT:    smstop sm
+; NO-SVE-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ret
   %some_alloc = alloca i64, align 8
   %rdsvl = tail call i64 @llvm.aarch64.sme.cntsd()
   call void @bar(i64 %rdsvl, i64 %x0)
@@ -1112,11 +1107,10 @@ define void @vg_unwind_noasync() #5 {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    .cfi_offset b8, -24
 ; CHECK-NEXT:    .cfi_offset b9, -32
@@ -1126,6 +1120,8 @@ define void @vg_unwind_noasync() #5 {
 ; CHECK-NEXT:    .cfi_offset b13, -64
 ; CHECK-NEXT:    .cfi_offset b14, -72
 ; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    str x8, [sp, #72]
 ; CHECK-NEXT:    .cfi_offset vg, -8
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
@@ -1152,12 +1148,10 @@ define void @vg_unwind_noasync() #5 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 96
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
@@ -1170,6 +1164,8 @@ define void @vg_unwind_noasync() #5 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
+; FP-CHECK-NEXT:    cntd x8
+; FP-CHECK-NEXT:    str x8, [x29, #16]
 ; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
@@ -1193,6 +1189,7 @@ define void @vg_unwind_noasync() #5 {
 ; FP-CHECK-NEXT:    .cfi_restore b14
 ; FP-CHECK-NEXT:    .cfi_restore b15
 ; FP-CHECK-NEXT:    ret
+;
 ; OUTLINER-CHECK-LABEL: vg_unwind_noasync:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
 ;
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 3a33405200132..52854b8b01aa4 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -616,36 +616,36 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK0:       // %bb.0: // %entry
 ; CHECK0-NEXT:    sub sp, sp, #176
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 176
-; CHECK0-NEXT:    rdsvl x9, #1
-; CHECK0-NEXT:    stp d15, d14, [sp, #48] // 16-byte Folded Spill
-; CHECK0-NEXT:    lsr x9, x9, #3
-; CHECK0-NEXT:    stp d13, d12, [sp, #64] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp d11, d10, [sp, #80] // 16-byte Folded Spill
-; CHECK0-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK0-NEXT:    str x25, [sp, #96] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp x24, x23, [sp, #104] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x22, x21, [sp, #120] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x20, x19, [sp, #136] // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_offset vg, -16
+; CHECK0-NEXT:    .cfi_offset w19, -32
+; CHECK0-NEXT:    .cfi_offset w20, -40
+; CHECK0-NEXT:    .cfi_offset w21, -48
+; CHECK0-NEXT:    .cfi_offset w22, -56
+; CHECK0-NEXT:    .cfi_offset w23, -64
+; CHECK0-NEXT:    .cfi_offset w24, -72
+; CHECK0-NEXT:    .cfi_offset w25, -80
+; CHECK0-NEXT:    .cfi_offset b8, -88
+; CHECK0-NEXT:    .cfi_offset b9, -96
+; CHECK0-NEXT:    .cfi_offset b10, -104
+; CHECK0-NEXT:    .cfi_offset b11, -112
+; CHECK0-NEXT:    .cfi_offset b12, -120
+; CHECK0-NEXT:    .cfi_offset b13, -128
+; CHECK0-NEXT:    .cfi_offset b14, -136
+; CHECK0-NEXT:    .cfi_offset b15, -144
+; CHECK0-NEXT:    rdsvl x8, #1
 ; CHECK0-NEXT:    cntd x9
-; CHECK0-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; CHECK0-NEXT:    stp d9, d8, [sp, #96] // 16-byte Folded Spill
-; CHECK0-NEXT:    str x25, [sp, #112] // 8-byte Folded Spill
-; CHECK0-NEXT:    stp x24, x23, [sp, #128] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w20, -16
-; CHECK0-NEXT:    .cfi_offset w21, -24
-; CHECK0-NEXT:    .cfi_offset w22, -32
-; CHECK0-NEXT:    .cfi_offset w23, -40
-; CHECK0-NEXT:    .cfi_offset w24, -48
-; CHECK0-NEXT:    .cfi_offset w25, -64
-; CHECK0-NEXT:    .cfi_offset b8, -72
-; CHECK0-NEXT:    .cfi_offset b9, -80
-; CHECK0-NEXT:    .cfi_offset b10, -88
-; CHECK0-NEXT:    .cfi_offset b11, -96
-; CHECK0-NEXT:    .cfi_offset b12, -104
-; CHECK0-NEXT:    .cfi_offset b13, -112
-; CHECK0-NEXT:    .cfi_offset b14, -120
-; CHECK0-NEXT:    .cfi_offset b15, -128
-; CHECK0-NEXT:    .cfi_offset vg, -136
 ; CHECK0-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
+; CHECK0-NEXT:    lsr x8, x8, #3
+; CHECK0-NEXT:    str x8, [sp, #152]
+; CHECK0-NEXT:    str x9, [sp, #160]
 ; CHECK0-NEXT:    smstart sm
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
@@ -655,15 +655,15 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK0-NEXT:    str x0, [sp, #24]
 ; CHECK0-NEXT:    str d0, [sp, #16]
 ; CHECK0-NEXT:    smstop sm
-; CHECK0-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x20, x19, [sp, #136] // 16-byte Folded Reload
 ; CHECK0-NEXT:    mov w0, wzr
-; CHECK0-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldr x25, [sp, #112] // 8-byte Folded Reload
-; CHECK0-NEXT:    ldp x24, x23, [sp, #128] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d9, d8, [sp, #96] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d11, d10, [sp, #80] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d13, d12, [sp, #64] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d15, d14, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x22, x21, [sp, #120] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr x25, [sp, #96] // 8-byte Folded Reload
+; CHECK0-NEXT:    ldp x24, x23, [sp, #104] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
 ; CHECK0-NEXT:    add sp, sp, #176
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
@@ -687,37 +687,37 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK64:       // %bb.0: // %entry
 ; CHECK64-NEXT:    sub sp, sp, #304
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 304
-; CHECK64-NEXT:    rdsvl x9, #1
-; CHECK64-NEXT:    stp d15, d14, [sp, #112] // 16-byte Folded Spill
-; CHECK64-NEXT:    lsr x9, x9, #3
-; CHECK64-NEXT:    stp d13, d12, [sp, #128] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp d11, d10, [sp, #144] // 16-byte Folded Spill
-; CHECK64-NEXT:    str x9, [sp, #96] // 8-byte Folded Spill
+; CHECK64-NEXT:    stp d15, d14, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp d13, d12, [sp, #112] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp d11, d10, [sp, #128] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp d9, d8, [sp, #144] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x29, x25, [sp, #224] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x24, x23, [sp, #240] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x22, x21, [sp, #256] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x20, x19, [sp, #272] // 16-byte Folded Spill
+; CHECK64-NEXT:    .cfi_offset vg, -8
+; CHECK64-NEXT:    .cfi_offset w19, -24
+; CHECK64-NEXT:    .cfi_offset w20, -32
+; CHECK64-NEXT:    .cfi_offset w21, -40
+; CHECK64-NEXT:    .cfi_offset w22, -48
+; CHECK64-NEXT:    .cfi_offset w23, -56
+; CHECK64-NEXT:    .cfi_offset w24, -64
+; CHECK64-NEXT:    .cfi_offset w25, -72
+; CHECK64-NEXT:    .cfi_offset w29, -80
+; CHECK64-NEXT:    .cfi_offset b8, -152
+; CHECK64-NEXT:    .cfi_offset b9, -160
+; CHECK64-NEXT:    .cfi_offset b10, -168
+; CHECK64-NEXT:    .cfi_offset b11, -176
+; CHECK64-NEXT:    .cfi_offset b12, -184
+; CHECK64-NEXT:    .cfi_offset b13, -192
+; CHECK64-NEXT:    .cfi_offset b14, -200
+; CHECK64-NEXT:    .cfi_offset b15, -208
+; CHECK64-NEXT:    rdsvl x8, #1
 ; CHECK64-NEXT:    cntd x9
-; CHECK64-NEXT:    str x9, [sp, #104] // 8-byte Folded Spill
-; CHECK64-NEXT:    stp d9, d8, [sp, #160] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x29, x25, [sp, #240] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x24, x23, [sp, #256] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x22, x21, [sp, #272] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x20, x19, [sp, #288] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_offset w19, -8
-; CHECK64-NEXT:    .cfi_offset w20, -16
-; CHECK64-NEXT:    .cfi_offset w21, -24
-; CHECK64-NEXT:    .cfi_offset w22, -32
-; CHECK64-NEXT:    .cfi_offset w23, -40
-; CHECK64-NEXT:    .cfi_offset w24, -48
-; CHECK64-NEXT:    .cfi_offset w25, -56
-; CHECK64-NEXT:    .cfi_offset w29, -64
-; CHECK64-NEXT:    .cfi_offset b8, -136
-; CHECK64-NEXT:    .cfi_offset b9, -144
-; CHECK64-NEXT:    .cfi_offset b10, -152
-; CHECK64-NEXT:    .cfi_offset b11, -160
-; CHECK64-NEXT:    .cfi_offset b12, -168
-; CHECK64-NEXT:    .cfi_offset b13, -176
-; CHECK64-NEXT:    .cfi_offset b14, -184
-; CHECK64-NEXT:    .cfi_offset b15, -192
-; CHECK64-NEXT:    .cfi_offset vg, -200
 ; CHECK64-NEXT:    str d0, [sp, #80] // 8-byte Folded Spill
+; CHECK64-NEXT:    lsr x8, x8, #3
+; CHECK64-NEXT:    str x8, [sp, #288]
+; CHECK64-NEXT:    str x9, [sp, #296]
 ; CHECK64-NEXT:    smstart sm
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
@@ -727,15 +727,15 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK64-NEXT:    str x0, [sp, #8]
 ; CHECK64-NEXT:    str d0, [sp, #88]
 ; CHECK64-NEXT:    smstop sm
-; CHECK64-NEXT:    ldp x20, x19, [sp, #288] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x20, x19, [sp, #272] // 16-byte Folded Reload
 ; CHECK64-NEXT:    mov w0, wzr
-; CHECK64-NEXT:    ldp x22, x21, [sp, #272] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp x24, x23, [sp, #256] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp x29, x25, [sp, #240] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d9, d8, [sp, #160] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d11, d10, [sp, #144] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d13, d12, [sp, #128] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d15, d14, [sp, #112] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x22, x21, [sp, #256] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x24, x23, [sp, #240] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x29, x25, [sp, #224] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp d9, d8, [sp, #144] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp d11, d10, [sp, #128] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp d13, d12, [sp, #112] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp d15, d14, [sp, #96] // 16-byte Folded Reload
 ; CHECK64-NEXT:    add sp, sp, #304
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK64-NEXT:    .cfi_restore w19
@@ -758,45 +758,45 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ;
 ; CHECK1024-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming:
 ; CHECK1024:       // %bb.0: // %entry
-; CHECK1024-NEXT:    rdsvl x9, #1
-; CHECK1024-NEXT:    lsr x9, x9, #3
 ; CHECK1024-NEXT:    sub sp, sp, #1168
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1168
-; CHECK1024-NEXT:    str x9, [sp] // 8-byte Folded Spill
-; CHECK1024-NEXT:    cntd x9
-; CHECK1024-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; CHECK1024-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK1024-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK1024-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK1024-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK1024-NEXT:    str x29, [sp, #1104] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x25, [sp, #1112] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x24, [sp, #1120] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x23, [sp, #1128] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x22, [sp, #1136] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x21, [sp, #1144] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x20, [sp, #1152] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1160] // 8-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_offset w19, -8
-; CHECK1024-NEXT:    .cfi_offset w20, -16
-; CHECK1024-NEXT:    .cfi_offset w21, -24
-; CHECK1024-NEXT:    .cfi_offset w22, -32
-; CHECK1024-NEXT:    .cfi_offset w23, -40
-; CHECK1024-NEXT:    .cfi_offset w24, -48
-; CHECK1024-NEXT:    .cfi_offset w25, -56
-; CHECK1024-NEXT:    .cfi_offset w29, -64
-; CHECK1024-NEXT:    .cfi_offset b8, -1096
-; CHECK1024-NEXT:    .cfi_offset b9, -1104
-; CHECK1024-NEXT:    .cfi_offset b10, -1112
-; CHECK1024-NEXT:    .cfi_offset b11, -1120
-; CHECK1024-NEXT:    .cfi_offset b12, -1128
-; CHECK1024-NEXT:    .cfi_offset b13, -1136
-; CHECK1024-NEXT:    .cfi_offset b14, -1144
-; CHECK1024-NEXT:    .cfi_offset b15, -1152
-; CHECK1024-NEXT:    .cfi_offset vg, -1160
+; CHECK1024-NEXT:    stp d15, d14, [sp] // 16-byte Folded Spill
+; CHECK1024-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK1024-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str x29, [sp, #1088] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x25, [sp, #1096] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x24, [sp, #1104] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x23, [sp, #1112] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x22, [sp, #1120] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x21, [sp, #1128] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x20, [sp, #1136] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1144] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .cfi_offset vg, -8
+; CHECK1024-NEXT:    .cfi_offset w19, -24
+; CHECK1024-NEXT:    .cfi_offset w20, -32
+; CHECK1024-NEXT:    .cfi_offset w21, -40
+; CHECK1024-NEXT:    .cfi_offset w22, -48
+; CHECK1024-NEXT:    .cfi_offset w23, -56
+; CHECK1024-NEXT:    .cfi_offset w24, -64
+; CHECK1024-NEXT:    .cfi_offset w25, -72
+; CHECK1024-NEXT:    .cfi_offset w29, -80
+; CHECK1024-NEXT:    .cfi_offset b8, -1112
+; CHECK1024-NEXT:    .cfi_offset b9, -1120
+; CHECK1024-NEXT:    .cfi_offset b10, -1128
+; CHECK1024-NEXT:    .cfi_offset b11, -1136
+; CHECK1024-NEXT:    .cfi_offset b12, -1144
+; CHECK1024-NEXT:    .cfi_offset b13, -1152
+; CHECK1024-NEXT:    .cfi_offset b14, -1160
+; CHECK1024-NEXT:    .cfi_offset b15, -1168
 ; CHECK1024-NEXT:    sub sp, sp, #1056
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 2224
+; CHECK1024-NEXT:    rdsvl x8, #1
+; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    str d0, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT:    lsr x8, x8, #3
+; CHECK1024-NEXT:    str x8, [sp, #2208]
+; CHECK1024-NEXT:    str x9, [sp, #2216]
 ; CHECK1024-NEXT:    smstart sm
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
@@ -809,18 +809,18 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK1024-NEXT:    mov w0, wzr
 ; CHECK1024-NEXT:    add sp, sp, #1056
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1168
-; CHECK1024-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldr x19, [sp, #1160] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldr x20, [sp, #1152] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x21, [sp, #1144] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x22, [sp, #1136] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x23, [sp, #1128] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x24, [sp, #1120] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x25, [sp, #1112] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x29, [sp, #1104] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr x19, [sp, #1144] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr x20, [sp, #1136] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x21, [sp, #1128] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x22, [sp, #1120] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x23, [sp, #1112] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x24, [sp, #1104] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x25, [sp, #1096] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldp d15, d14, [sp] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    add sp, sp, #1168
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK1024-NEXT:    .cfi_restore w19
@@ -1572,18 +1572,17 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK0:       // %bb.0:
 ; CHECK0-NEXT:    sub sp, sp, #176
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 176
-; CHECK0-NEXT:    cntd x9
 ; CHECK0-NEXT:    stp d15, d14, [sp, #64] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp d13, d12, [sp, #80] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp d11, d10, [sp, #96] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp d9, d8, [sp, #112] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x30, x9, [sp, #128] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w20, -16
-; CHECK0-NEXT:    .cfi_offset w21, -24
-; CHECK0-NEXT:    .cfi_offset w22, -32
+; CHECK0-NEXT:    str x30, [sp, #128] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp x22, x21, [sp, #136] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x20, x19, [sp, #152] // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_offset w19, -16
+; CHECK0-NEXT:    .cfi_offset w20, -24
+; CHECK0-NEXT:    .cfi_offset w21, -32
+; CHECK0-NEXT:    .cfi_offset w22, -40
 ; CHECK0-NEXT:    .cfi_offset w30, -48
 ; CHECK0-NEXT:    .cfi_offset b8, -56
 ; CHECK0-NEXT:    .cfi_offset b9, -64
@@ -1593,13 +1592,15 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK0-NEXT:    .cfi_offset b13, -96
 ; CHECK0-NEXT:    .cfi_offset b14, -104
 ; CHECK0-NEXT:    .cfi_offset b15, -112
+; CHECK0-NEXT:    cntd x8
 ; CHECK0-NEXT:    mov w19, w1
 ; CHECK0-NEXT:    mov w20, w0
 ; CHECK0-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
 ; CHECK0-NEXT:    stp q2, q3, [sp, #32] // 32-byte Folded Spill
+; CHECK0-NEXT:    str x8, [sp, #168]
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    and x21, x0, #0x1
-; CHECK0-NEXT:    .cfi_offset vg, -40
+; CHECK0-NEXT:    .cfi_offset vg, -8
 ; CHECK0-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK0-NEXT:  // %bb.1:
 ; CHECK0-NEXT:    smstop sm
@@ -1615,7 +1616,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK0-NEXT:    cset w21, lt
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    and x22, x0, #0x1
-; CHECK0-NEXT:    .cfi_offset vg, -40
+; CHECK0-NEXT:    .cfi_offset vg, -8
 ; CHECK0-NEXT:    tbz w22, #0, .LBB27_6
 ; CHECK0-NEXT:  // %bb.5:
 ; CHECK0-NEXT:    smstop sm
@@ -1631,9 +1632,9 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK0-NEXT:    tst w8, w21
 ; CHECK0-NEXT:    csel w0, w20, w19, ne
 ; CHECK0-NEXT:    .cfi_restore vg
-; CHECK0-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x20, x19, [sp, #152] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr x30, [sp, #128] // 8-byte Folded Reload
-; CHECK0-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x22, x21, [sp, #136] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp d9, d8, [sp, #112] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp d11, d10, [sp, #96] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp d13, d12, [sp, #80] // 16-byte Folded Reload
@@ -1659,19 +1660,17 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64:       // %bb.0:
 ; CHECK64-NEXT:    sub sp, sp, #320
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 320
-; CHECK64-NEXT:    cntd x9
 ; CHECK64-NEXT:    stp d15, d14, [sp, #128] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp d13, d12, [sp, #144] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp d11, d10, [sp, #160] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp d9, d8, [sp, #176] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x29, x30, [sp, #256] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x9, x22, [sp, #272] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x21, x20, [sp, #288] // 16-byte Folded Spill
-; CHECK64-NEXT:    str x19, [sp, #304] // 8-byte Folded Spill
-; CHECK64-NEXT:    .cfi_offset w19, -16
-; CHECK64-NEXT:    .cfi_offset w20, -24
-; CHECK64-NEXT:    .cfi_offset w21, -32
-; CHECK64-NEXT:    .cfi_offset w22, -40
+; CHECK64-NEXT:    stp x22, x21, [sp, #272] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x20, x19, [sp, #288] // 16-byte Folded Spill
+; CHECK64-NEXT:    .cfi_offset w19, -24
+; CHECK64-NEXT:    .cfi_offset w20, -32
+; CHECK64-NEXT:    .cfi_offset w21, -40
+; CHECK64-NEXT:    .cfi_offset w22, -48
 ; CHECK64-NEXT:    .cfi_offset w30, -56
 ; CHECK64-NEXT:    .cfi_offset w29, -64
 ; CHECK64-NEXT:    .cfi_offset b8, -136
@@ -1682,13 +1681,15 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    .cfi_offset b13, -176
 ; CHECK64-NEXT:    .cfi_offset b14, -184
 ; CHECK64-NEXT:    .cfi_offset b15, -192
+; CHECK64-NEXT:    cntd x8
 ; CHECK64-NEXT:    mov w19, w1
 ; CHECK64-NEXT:    mov w20, w0
 ; CHECK64-NEXT:    stp q0, q1, [sp, #64] // 32-byte Folded Spill
 ; CHECK64-NEXT:    stp q2, q3, [sp, #96] // 32-byte Folded Spill
+; CHECK64-NEXT:    str x8, [sp, #304]
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    and x21, x0, #0x1
-; CHECK64-NEXT:    .cfi_offset vg, -48
+; CHECK64-NEXT:    .cfi_offset vg, -16
 ; CHECK64-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK64-NEXT:  // %bb.1:
 ; CHECK64-NEXT:    smstop sm
@@ -1704,7 +1705,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    cset w21, lt
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    and x22, x0, #0x1
-; CHECK64-NEXT:    .cfi_offset vg, -48
+; CHECK64-NEXT:    .cfi_offset vg, -16
 ; CHECK64-NEXT:    tbz w22, #0, .LBB27_6
 ; CHECK64-NEXT:  // %bb.5:
 ; CHECK64-NEXT:    smstop sm
@@ -1720,8 +1721,8 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    tst w8, w21
 ; CHECK64-NEXT:    csel w0, w20, w19, ne
 ; CHECK64-NEXT:    .cfi_restore vg
-; CHECK64-NEXT:    ldp x20, x19, [sp, #296] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp x22, x21, [sp, #280] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x20, x19, [sp, #288] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x22, x21, [sp, #272] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp x29, x30, [sp, #256] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp d9, d8, [sp, #176] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp d11, d10, [sp, #160] // 16-byte Folded Reload
@@ -1749,22 +1750,20 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024:       // %bb.0:
 ; CHECK1024-NEXT:    sub sp, sp, #1152
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1152
-; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    stp d15, d14, [sp] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str x29, [sp, #1088] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1096] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x9, [sp, #1104] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x22, [sp, #1112] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x21, [sp, #1120] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x20, [sp, #1128] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1136] // 8-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_offset w19, -16
-; CHECK1024-NEXT:    .cfi_offset w20, -24
-; CHECK1024-NEXT:    .cfi_offset w21, -32
-; CHECK1024-NEXT:    .cfi_offset w22, -40
+; CHECK1024-NEXT:    str x22, [sp, #1104] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x21, [sp, #1112] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x20, [sp, #1120] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1128] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .cfi_offset w19, -24
+; CHECK1024-NEXT:    .cfi_offset w20, -32
+; CHECK1024-NEXT:    .cfi_offset w21, -40
+; CHECK1024-NEXT:    .cfi_offset w22, -48
 ; CHECK1024-NEXT:    .cfi_offset w30, -56
 ; CHECK1024-NEXT:    .cfi_offset w29, -64
 ; CHECK1024-NEXT:    .cfi_offset b8, -1096
@@ -1777,15 +1776,17 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    .cfi_offset b15, -1152
 ; CHECK1024-NEXT:    sub sp, sp, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 2240
+; CHECK1024-NEXT:    cntd x8
 ; CHECK1024-NEXT:    mov w19, w1
 ; CHECK1024-NEXT:    mov w20, w0
 ; CHECK1024-NEXT:    str q3, [sp, #1072] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str q2, [sp, #1056] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str q1, [sp, #1040] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str q0, [sp, #1024] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str x8, [sp, #2224]
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    and x21, x0, #0x1
-; CHECK1024-NEXT:    .cfi_offset vg, -48
+; CHECK1024-NEXT:    .cfi_offset vg, -16
 ; CHECK1024-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK1024-NEXT:  // %bb.1:
 ; CHECK1024-NEXT:    smstop sm
@@ -1802,7 +1803,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    cset w21, lt
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    and x22, x0, #0x1
-; CHECK1024-NEXT:    .cfi_offset vg, -48
+; CHECK1024-NEXT:    .cfi_offset vg, -16
 ; CHECK1024-NEXT:    tbz w22, #0, .LBB27_6
 ; CHECK1024-NEXT:  // %bb.5:
 ; CHECK1024-NEXT:    smstop sm
@@ -1822,11 +1823,11 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    add sp, sp, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1152
 ; CHECK1024-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldr x19, [sp, #1136] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x19, [sp, #1128] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldr x20, [sp, #1128] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x21, [sp, #1120] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x22, [sp, #1112] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x20, [sp, #1120] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x21, [sp, #1112] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x22, [sp, #1104] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1096] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1088] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -1858,14 +1859,13 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
 ; CHECK0-LABEL: svecc_call:
 ; CHECK0:       // %bb.0: // %entry
-; CHECK0-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK0-NEXT:    str x29, [sp, #-48]! // 8-byte Folded Spill
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 48
-; CHECK0-NEXT:    cntd x9
-; CHECK0-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x27, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w27, -16
-; CHECK0-NEXT:    .cfi_offset w28, -24
+; CHECK0-NEXT:    stp x30, x28, [sp, #8] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x27, x19, [sp, #24] // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_offset w19, -16
+; CHECK0-NEXT:    .cfi_offset w27, -24
+; CHECK0-NEXT:    .cfi_offset w28, -32
 ; CHECK0-NEXT:    .cfi_offset w30, -40
 ; CHECK0-NEXT:    .cfi_offset w29, -48
 ; CHECK0-NEXT:    addvl sp, sp, #-18
@@ -1906,12 +1906,15 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
+; CHECK0-NEXT:    cntd x9
+; CHECK0-NEXT:    addvl x10, sp, #18
 ; CHECK0-NEXT:    mov x8, x0
+; CHECK0-NEXT:    str x9, [x10, #40]
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    and x19, x0, #0x1
-; CHECK0-NEXT:    .cfi_offset vg, -32
+; CHECK0-NEXT:    .cfi_offset vg, -8
 ; CHECK0-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
 ; CHECK0-NEXT:    smstop sm
@@ -1965,9 +1968,9 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK0-NEXT:    .cfi_restore z13
 ; CHECK0-NEXT:    .cfi_restore z14
 ; CHECK0-NEXT:    .cfi_restore z15
-; CHECK0-NEXT:    ldp x27, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK0-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x27, x19, [sp, #24] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x30, x28, [sp, #8] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr x29, [sp], #48 // 8-byte Folded Reload
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
 ; CHECK0-NEXT:    .cfi_restore w27
@@ -1980,13 +1983,12 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK64:       // %bb.0: // %entry
 ; CHECK64-NEXT:    sub sp, sp, #112
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 112
-; CHECK64-NEXT:    cntd x9
 ; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x27, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_offset w19, -8
-; CHECK64-NEXT:    .cfi_offset w27, -16
-; CHECK64-NEXT:    .cfi_offset w28, -24
+; CHECK64-NEXT:    stp x28, x27, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK64-NEXT:    .cfi_offset w19, -16
+; CHECK64-NEXT:    .cfi_offset w27, -24
+; CHECK64-NEXT:    .cfi_offset w28, -32
 ; CHECK64-NEXT:    .cfi_offset w30, -40
 ; CHECK64-NEXT:    .cfi_offset w29, -48
 ; CHECK64-NEXT:    addvl sp, sp, #-18
@@ -2029,12 +2031,15 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 112 - 64 * VG
 ; CHECK64-NEXT:    sub sp, sp, #64
 ; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x01, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 176 + 144 * VG
+; CHECK64-NEXT:    cntd x9
+; CHECK64-NEXT:    addvl x10, sp, #18
 ; CHECK64-NEXT:    mov x8, x0
+; CHECK64-NEXT:    str x9, [x10, #168]
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    and x19, x0, #0x1
-; CHECK64-NEXT:    .cfi_offset vg, -32
+; CHECK64-NEXT:    .cfi_offset vg, -8
 ; CHECK64-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
 ; CHECK64-NEXT:    smstop sm
@@ -2090,9 +2095,9 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK64-NEXT:    .cfi_restore z13
 ; CHECK64-NEXT:    .cfi_restore z14
 ; CHECK64-NEXT:    .cfi_restore z15
-; CHECK64-NEXT:    ldp x27, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldr x28, [sp, #88] // 8-byte Folded Reload
-; CHECK64-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x27, x19, [sp, #88] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK64-NEXT:    ldp x30, x28, [sp, #72] // 16-byte Folded Reload
 ; CHECK64-NEXT:    add sp, sp, #112
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK64-NEXT:    .cfi_restore w19
@@ -2106,16 +2111,14 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK1024:       // %bb.0: // %entry
 ; CHECK1024-NEXT:    sub sp, sp, #1072
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1072
-; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_offset w19, -8
-; CHECK1024-NEXT:    .cfi_offset w27, -16
-; CHECK1024-NEXT:    .cfi_offset w28, -24
+; CHECK1024-NEXT:    str x28, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x27, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .cfi_offset w19, -16
+; CHECK1024-NEXT:    .cfi_offset w27, -24
+; CHECK1024-NEXT:    .cfi_offset w28, -32
 ; CHECK1024-NEXT:    .cfi_offset w30, -40
 ; CHECK1024-NEXT:    .cfi_offset w29, -48
 ; CHECK1024-NEXT:    addvl sp, sp, #-18
@@ -2158,12 +2161,15 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG
 ; CHECK1024-NEXT:    sub sp, sp, #1024
 ; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xb0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2096 + 144 * VG
+; CHECK1024-NEXT:    cntd x9
+; CHECK1024-NEXT:    addvl x10, sp, #18
 ; CHECK1024-NEXT:    mov x8, x0
+; CHECK1024-NEXT:    str x9, [x10, #2088]
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    and x19, x0, #0x1
-; CHECK1024-NEXT:    .cfi_offset vg, -32
+; CHECK1024-NEXT:    .cfi_offset vg, -8
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
 ; CHECK1024-NEXT:    smstop sm
@@ -2219,9 +2225,9 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK1024-NEXT:    .cfi_restore z13
 ; CHECK1024-NEXT:    .cfi_restore z14
 ; CHECK1024-NEXT:    .cfi_restore z15
-; CHECK1024-NEXT:    ldr x19, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x19, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x27, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1040] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    add sp, sp, #1072
@@ -2241,14 +2247,13 @@ entry:
 define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
 ; CHECK0-LABEL: svecc_alloca_call:
 ; CHECK0:       // %bb.0: // %entry
-; CHECK0-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK0-NEXT:    str x29, [sp, #-48]! // 8-byte Folded Spill
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 48
-; CHECK0-NEXT:    cntd x9
-; CHECK0-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x27, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w27, -16
-; CHECK0-NEXT:    .cfi_offset w28, -24
+; CHECK0-NEXT:    stp x30, x28, [sp, #8] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x27, x19, [sp, #24] // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_offset w19, -16
+; CHECK0-NEXT:    .cfi_offset w27, -24
+; CHECK0-NEXT:    .cfi_offset w28, -32
 ; CHECK0-NEXT:    .cfi_offset w30, -40
 ; CHECK0-NEXT:    .cfi_offset w29, -48
 ; CHECK0-NEXT:    addvl sp, sp, #-18
@@ -2291,11 +2296,14 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
 ; CHECK0-NEXT:    sub sp, sp, #48
 ; CHECK0-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x00, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 96 + 144 * VG
+; CHECK0-NEXT:    cntd x8
+; CHECK0-NEXT:    addvl x9, sp, #18
+; CHECK0-NEXT:    str x8, [x9, #88]
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    and x19, x0, #0x1
-; CHECK0-NEXT:    .cfi_offset vg, -32
+; CHECK0-NEXT:    .cfi_offset vg, -8
 ; CHECK0-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
 ; CHECK0-NEXT:    smstop sm
@@ -2351,9 +2359,9 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK0-NEXT:    .cfi_restore z13
 ; CHECK0-NEXT:    .cfi_restore z14
 ; CHECK0-NEXT:    .cfi_restore z15
-; CHECK0-NEXT:    ldp x27, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK0-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x27, x19, [sp, #24] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x30, x28, [sp, #8] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr x29, [sp], #48 // 8-byte Folded Reload
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
 ; CHECK0-NEXT:    .cfi_restore w27
@@ -2366,13 +2374,12 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK64:       // %bb.0: // %entry
 ; CHECK64-NEXT:    sub sp, sp, #112
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 112
-; CHECK64-NEXT:    cntd x9
 ; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x27, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_offset w19, -8
-; CHECK64-NEXT:    .cfi_offset w27, -16
-; CHECK64-NEXT:    .cfi_offset w28, -24
+; CHECK64-NEXT:    stp x28, x27, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK64-NEXT:    .cfi_offset w19, -16
+; CHECK64-NEXT:    .cfi_offset w27, -24
+; CHECK64-NEXT:    .cfi_offset w28, -32
 ; CHECK64-NEXT:    .cfi_offset w30, -40
 ; CHECK64-NEXT:    .cfi_offset w29, -48
 ; CHECK64-NEXT:    addvl sp, sp, #-18
@@ -2415,11 +2422,14 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x90, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 112 - 64 * VG
 ; CHECK64-NEXT:    sub sp, sp, #112
 ; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x01, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 224 + 144 * VG
+; CHECK64-NEXT:    cntd x8
+; CHECK64-NEXT:    addvl x9, sp, #18
+; CHECK64-NEXT:    str x8, [x9, #216]
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    and x19, x0, #0x1
-; CHECK64-NEXT:    .cfi_offset vg, -32
+; CHECK64-NEXT:    .cfi_offset vg, -8
 ; CHECK64-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
 ; CHECK64-NEXT:    smstop sm
@@ -2475,9 +2485,9 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK64-NEXT:    .cfi_restore z13
 ; CHECK64-NEXT:    .cfi_restore z14
 ; CHECK64-NEXT:    .cfi_restore z15
-; CHECK64-NEXT:    ldp x27, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldr x28, [sp, #88] // 8-byte Folded Reload
-; CHECK64-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x27, x19, [sp, #88] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK64-NEXT:    ldp x30, x28, [sp, #72] // 16-byte Folded Reload
 ; CHECK64-NEXT:    add sp, sp, #112
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK64-NEXT:    .cfi_restore w19
@@ -2491,16 +2501,14 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024:       // %bb.0: // %entry
 ; CHECK1024-NEXT:    sub sp, sp, #1072
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1072
-; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_offset w19, -8
-; CHECK1024-NEXT:    .cfi_offset w27, -16
-; CHECK1024-NEXT:    .cfi_offset w28, -24
+; CHECK1024-NEXT:    str x28, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x27, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .cfi_offset w19, -16
+; CHECK1024-NEXT:    .cfi_offset w27, -24
+; CHECK1024-NEXT:    .cfi_offset w28, -32
 ; CHECK1024-NEXT:    .cfi_offset w30, -40
 ; CHECK1024-NEXT:    .cfi_offset w29, -48
 ; CHECK1024-NEXT:    addvl sp, sp, #-18
@@ -2543,11 +2551,14 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xd0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1072 - 64 * VG
 ; CHECK1024-NEXT:    sub sp, sp, #1072
 ; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0xe0, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 2144 + 144 * VG
+; CHECK1024-NEXT:    cntd x8
+; CHECK1024-NEXT:    addvl x9, sp, #18
+; CHECK1024-NEXT:    str x8, [x9, #2136]
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    and x19, x0, #0x1
-; CHECK1024-NEXT:    .cfi_offset vg, -32
+; CHECK1024-NEXT:    .cfi_offset vg, -8
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
 ; CHECK1024-NEXT:    smstop sm
@@ -2603,9 +2614,9 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024-NEXT:    .cfi_restore z13
 ; CHECK1024-NEXT:    .cfi_restore z14
 ; CHECK1024-NEXT:    .cfi_restore z15
-; CHECK1024-NEXT:    ldr x19, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x19, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x27, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1040] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    add sp, sp, #1072
@@ -2805,17 +2816,15 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK0:       // %bb.0: // %entry
 ; CHECK0-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 112
-; CHECK0-NEXT:    cntd x9
 ; CHECK0-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK0-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK0-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK0-NEXT:    add x29, sp, #64
 ; CHECK0-NEXT:    .cfi_def_cfa w29, 48
-; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w20, -16
+; CHECK0-NEXT:    .cfi_offset w19, -24
+; CHECK0-NEXT:    .cfi_offset w20, -32
 ; CHECK0-NEXT:    .cfi_offset w30, -40
 ; CHECK0-NEXT:    .cfi_offset w29, -48
 ; CHECK0-NEXT:    .cfi_offset b8, -56
@@ -2831,6 +2840,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK0-NEXT:    mov x9, sp
 ; CHECK0-NEXT:    mov w20, w0
 ; CHECK0-NEXT:    msub x9, x8, x8, x9
+; CHECK0-NEXT:    cntd x10
+; CHECK0-NEXT:    str x10, [x29, #32]
 ; CHECK0-NEXT:    mov sp, x9
 ; CHECK0-NEXT:    stur x9, [x29, #-80]
 ; CHECK0-NEXT:    sub x9, x29, #80
@@ -2838,7 +2849,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK0-NEXT:    stur wzr, [x29, #-68]
 ; CHECK0-NEXT:    sturh w8, [x29, #-72]
 ; CHECK0-NEXT:    msr TPIDR2_EL0, x9
-; CHECK0-NEXT:    .cfi_offset vg, -32
+; CHECK0-NEXT:    .cfi_offset vg, -16
 ; CHECK0-NEXT:    smstop sm
 ; CHECK0-NEXT:    bl other
 ; CHECK0-NEXT:    smstart sm
@@ -2854,7 +2865,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK0-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK0-NEXT:    sub sp, x29, #64
 ; CHECK0-NEXT:    .cfi_def_cfa wsp, 112
-; CHECK0-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
@@ -2879,17 +2890,15 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK64:       // %bb.0: // %entry
 ; CHECK64-NEXT:    stp d15, d14, [sp, #-176]! // 16-byte Folded Spill
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 176
-; CHECK64-NEXT:    cntd x9
 ; CHECK64-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x29, x30, [sp, #128] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x9, x20, [sp, #144] // 16-byte Folded Spill
-; CHECK64-NEXT:    str x19, [sp, #160] // 8-byte Folded Spill
+; CHECK64-NEXT:    stp x20, x19, [sp, #144] // 16-byte Folded Spill
 ; CHECK64-NEXT:    add x29, sp, #128
 ; CHECK64-NEXT:    .cfi_def_cfa w29, 48
-; CHECK64-NEXT:    .cfi_offset w19, -16
-; CHECK64-NEXT:    .cfi_offset w20, -24
+; CHECK64-NEXT:    .cfi_offset w19, -24
+; CHECK64-NEXT:    .cfi_offset w20, -32
 ; CHECK64-NEXT:    .cfi_offset w30, -40
 ; CHECK64-NEXT:    .cfi_offset w29, -48
 ; CHECK64-NEXT:    .cfi_offset b8, -120
@@ -2905,7 +2914,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK64-NEXT:    mov x9, sp
 ; CHECK64-NEXT:    mov w20, w0
 ; CHECK64-NEXT:    msub x9, x8, x8, x9
+; CHECK64-NEXT:    cntd x10
 ; CHECK64-NEXT:    mov x19, sp
+; CHECK64-NEXT:    str x10, [x29, #32]
 ; CHECK64-NEXT:    mov sp, x9
 ; CHECK64-NEXT:    str x9, [x19]
 ; CHECK64-NEXT:    add x9, x19, #0
@@ -2913,7 +2924,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK64-NEXT:    str wzr, [x19, #12]
 ; CHECK64-NEXT:    strh w8, [x19, #8]
 ; CHECK64-NEXT:    msr TPIDR2_EL0, x9
-; CHECK64-NEXT:    .cfi_offset vg, -32
+; CHECK64-NEXT:    .cfi_offset vg, -16
 ; CHECK64-NEXT:    smstop sm
 ; CHECK64-NEXT:    bl other
 ; CHECK64-NEXT:    smstart sm
@@ -2929,7 +2940,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK64-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK64-NEXT:    sub sp, x29, #128
 ; CHECK64-NEXT:    .cfi_def_cfa wsp, 176
-; CHECK64-NEXT:    ldp x20, x19, [sp, #152] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x20, x19, [sp, #144] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldr d14, [sp, #8] // 8-byte Folded Reload
 ; CHECK64-NEXT:    ldp x29, x30, [sp, #128] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
@@ -2955,22 +2966,20 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK1024:       // %bb.0: // %entry
 ; CHECK1024-NEXT:    sub sp, sp, #1136
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1136
-; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    stp d15, d14, [sp] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str x29, [sp, #1088] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1096] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x9, [sp, #1104] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x28, [sp, #1112] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x20, [sp, #1120] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1128] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x28, [sp, #1104] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x20, [sp, #1112] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1120] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    add x29, sp, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa w29, 48
-; CHECK1024-NEXT:    .cfi_offset w19, -8
-; CHECK1024-NEXT:    .cfi_offset w20, -16
-; CHECK1024-NEXT:    .cfi_offset w28, -24
+; CHECK1024-NEXT:    .cfi_offset w19, -16
+; CHECK1024-NEXT:    .cfi_offset w20, -24
+; CHECK1024-NEXT:    .cfi_offset w28, -32
 ; CHECK1024-NEXT:    .cfi_offset w30, -40
 ; CHECK1024-NEXT:    .cfi_offset w29, -48
 ; CHECK1024-NEXT:    .cfi_offset b8, -1080
@@ -2986,7 +2995,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK1024-NEXT:    mov x9, sp
 ; CHECK1024-NEXT:    mov w20, w0
 ; CHECK1024-NEXT:    msub x9, x8, x8, x9
+; CHECK1024-NEXT:    cntd x10
 ; CHECK1024-NEXT:    mov x19, sp
+; CHECK1024-NEXT:    str x10, [x29, #40]
 ; CHECK1024-NEXT:    mov sp, x9
 ; CHECK1024-NEXT:    str x9, [x19]
 ; CHECK1024-NEXT:    add x9, x19, #0
@@ -2994,7 +3005,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK1024-NEXT:    str wzr, [x19, #12]
 ; CHECK1024-NEXT:    strh w8, [x19, #8]
 ; CHECK1024-NEXT:    msr TPIDR2_EL0, x9
-; CHECK1024-NEXT:    .cfi_offset vg, -32
+; CHECK1024-NEXT:    .cfi_offset vg, -8
 ; CHECK1024-NEXT:    smstop sm
 ; CHECK1024-NEXT:    bl other
 ; CHECK1024-NEXT:    smstart sm
@@ -3011,10 +3022,10 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK1024-NEXT:    sub sp, x29, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1136
 ; CHECK1024-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldr x19, [sp, #1128] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x19, [sp, #1120] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldr x20, [sp, #1120] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x28, [sp, #1112] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x20, [sp, #1112] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1104] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1096] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1088] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -3150,17 +3161,16 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0:       // %bb.0: // %entry
 ; CHECK0-NEXT:    stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 64
-; CHECK0-NEXT:    cntd x9
-; CHECK0-NEXT:    stp x27, x26, [sp, #32] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp x27, x26, [sp, #24] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x20, x19, [sp, #40] // 16-byte Folded Spill
 ; CHECK0-NEXT:    mov x29, sp
 ; CHECK0-NEXT:    .cfi_def_cfa w29, 64
-; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w20, -16
-; CHECK0-NEXT:    .cfi_offset w26, -24
-; CHECK0-NEXT:    .cfi_offset w27, -32
-; CHECK0-NEXT:    .cfi_offset w28, -40
+; CHECK0-NEXT:    .cfi_offset w19, -16
+; CHECK0-NEXT:    .cfi_offset w20, -24
+; CHECK0-NEXT:    .cfi_offset w26, -32
+; CHECK0-NEXT:    .cfi_offset w27, -40
+; CHECK0-NEXT:    .cfi_offset w28, -48
 ; CHECK0-NEXT:    .cfi_offset w30, -56
 ; CHECK0-NEXT:    .cfi_offset w29, -64
 ; CHECK0-NEXT:    addvl sp, sp, #-18
@@ -3201,18 +3211,20 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 64 - 56 * VG
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG
 ; CHECK0-NEXT:    mov w9, w0
-; CHECK0-NEXT:    mov x8, sp
+; CHECK0-NEXT:    cntd x8
 ; CHECK0-NEXT:    mov w2, w1
 ; CHECK0-NEXT:    add x9, x9, #15
-; CHECK0-NEXT:    mov x19, sp
+; CHECK0-NEXT:    str x8, [x29, #56]
+; CHECK0-NEXT:    mov x8, sp
 ; CHECK0-NEXT:    and x9, x9, #0x1fffffff0
+; CHECK0-NEXT:    mov x19, sp
 ; CHECK0-NEXT:    sub x8, x8, x9
 ; CHECK0-NEXT:    mov sp, x8
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    and x20, x0, #0x1
-; CHECK0-NEXT:    .cfi_offset vg, -48
+; CHECK0-NEXT:    .cfi_offset vg, -8
 ; CHECK0-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
 ; CHECK0-NEXT:    smstop sm
@@ -3266,9 +3278,9 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0-NEXT:    .cfi_restore z15
 ; CHECK0-NEXT:    mov sp, x29
 ; CHECK0-NEXT:    .cfi_def_cfa wsp, 64
-; CHECK0-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK0-NEXT:    ldp x27, x26, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x20, x19, [sp, #40] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK0-NEXT:    ldp x27, x26, [sp, #24] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp x29, x30, [sp], #64 // 16-byte Folded Reload
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
@@ -3284,18 +3296,17 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64:       // %bb.0: // %entry
 ; CHECK64-NEXT:    sub sp, sp, #128
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 128
-; CHECK64-NEXT:    cntd x9
 ; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x27, x26, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x20, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x28, x27, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x26, x20, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT:    str x19, [sp, #112] // 8-byte Folded Spill
 ; CHECK64-NEXT:    add x29, sp, #64
 ; CHECK64-NEXT:    .cfi_def_cfa w29, 64
-; CHECK64-NEXT:    .cfi_offset w19, -8
-; CHECK64-NEXT:    .cfi_offset w20, -16
-; CHECK64-NEXT:    .cfi_offset w26, -24
-; CHECK64-NEXT:    .cfi_offset w27, -32
-; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset w19, -16
+; CHECK64-NEXT:    .cfi_offset w20, -24
+; CHECK64-NEXT:    .cfi_offset w26, -32
+; CHECK64-NEXT:    .cfi_offset w27, -40
+; CHECK64-NEXT:    .cfi_offset w28, -48
 ; CHECK64-NEXT:    .cfi_offset w30, -56
 ; CHECK64-NEXT:    .cfi_offset w29, -64
 ; CHECK64-NEXT:    addvl sp, sp, #-18
@@ -3337,18 +3348,20 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG
 ; CHECK64-NEXT:    sub sp, sp, #64
 ; CHECK64-NEXT:    mov w9, w0
-; CHECK64-NEXT:    mov x8, sp
+; CHECK64-NEXT:    cntd x8
 ; CHECK64-NEXT:    mov w2, w1
 ; CHECK64-NEXT:    add x9, x9, #15
-; CHECK64-NEXT:    mov x19, sp
+; CHECK64-NEXT:    str x8, [x29, #56]
+; CHECK64-NEXT:    mov x8, sp
 ; CHECK64-NEXT:    and x9, x9, #0x1fffffff0
+; CHECK64-NEXT:    mov x19, sp
 ; CHECK64-NEXT:    sub x8, x8, x9
 ; CHECK64-NEXT:    mov sp, x8
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    and x20, x0, #0x1
-; CHECK64-NEXT:    .cfi_offset vg, -48
+; CHECK64-NEXT:    .cfi_offset vg, -8
 ; CHECK64-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
 ; CHECK64-NEXT:    smstop sm
@@ -3403,10 +3416,10 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64-NEXT:    .cfi_restore z15
 ; CHECK64-NEXT:    sub sp, x29, #64
 ; CHECK64-NEXT:    .cfi_def_cfa wsp, 128
-; CHECK64-NEXT:    ldp x20, x19, [sp, #112] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldr x28, [sp, #88] // 8-byte Folded Reload
-; CHECK64-NEXT:    ldp x27, x26, [sp, #96] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x20, x19, [sp, #104] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK64-NEXT:    ldp x27, x26, [sp, #88] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x30, x28, [sp, #72] // 16-byte Folded Reload
 ; CHECK64-NEXT:    add sp, sp, #128
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK64-NEXT:    .cfi_restore w19
@@ -3422,22 +3435,20 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024:       // %bb.0: // %entry
 ; CHECK1024-NEXT:    sub sp, sp, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x20, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1080] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x28, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x27, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x26, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x20, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1072] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    add x29, sp, #1024
 ; CHECK1024-NEXT:    .cfi_def_cfa w29, 64
-; CHECK1024-NEXT:    .cfi_offset w19, -8
-; CHECK1024-NEXT:    .cfi_offset w20, -16
-; CHECK1024-NEXT:    .cfi_offset w26, -24
-; CHECK1024-NEXT:    .cfi_offset w27, -32
-; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset w19, -16
+; CHECK1024-NEXT:    .cfi_offset w20, -24
+; CHECK1024-NEXT:    .cfi_offset w26, -32
+; CHECK1024-NEXT:    .cfi_offset w27, -40
+; CHECK1024-NEXT:    .cfi_offset w28, -48
 ; CHECK1024-NEXT:    .cfi_offset w30, -56
 ; CHECK1024-NEXT:    .cfi_offset w29, -64
 ; CHECK1024-NEXT:    addvl sp, sp, #-18
@@ -3479,18 +3490,20 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG
 ; CHECK1024-NEXT:    sub sp, sp, #1024
 ; CHECK1024-NEXT:    mov w9, w0
-; CHECK1024-NEXT:    mov x8, sp
+; CHECK1024-NEXT:    cntd x8
 ; CHECK1024-NEXT:    mov w2, w1
 ; CHECK1024-NEXT:    add x9, x9, #15
-; CHECK1024-NEXT:    mov x19, sp
+; CHECK1024-NEXT:    str x8, [x29, #56]
+; CHECK1024-NEXT:    mov x8, sp
 ; CHECK1024-NEXT:    and x9, x9, #0x1fffffff0
+; CHECK1024-NEXT:    mov x19, sp
 ; CHECK1024-NEXT:    sub x8, x8, x9
 ; CHECK1024-NEXT:    mov sp, x8
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    and x20, x0, #0x1
-; CHECK1024-NEXT:    .cfi_offset vg, -48
+; CHECK1024-NEXT:    .cfi_offset vg, -8
 ; CHECK1024-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
 ; CHECK1024-NEXT:    smstop sm
@@ -3545,11 +3558,11 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024-NEXT:    .cfi_restore z15
 ; CHECK1024-NEXT:    sub sp, x29, #1024
 ; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT:    ldr x19, [sp, #1080] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x20, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x20, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x26, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x27, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1040] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    add sp, sp, #1088
@@ -3575,16 +3588,14 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0:       // %bb.0: // %entry
 ; CHECK0-NEXT:    stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 64
-; CHECK0-NEXT:    cntd x9
-; CHECK0-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
-; CHECK0-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
-; CHECK0-NEXT:    stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x28, x27, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x26, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK0-NEXT:    mov x29, sp
 ; CHECK0-NEXT:    .cfi_def_cfa w29, 64
-; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w26, -16
-; CHECK0-NEXT:    .cfi_offset w27, -24
-; CHECK0-NEXT:    .cfi_offset w28, -32
+; CHECK0-NEXT:    .cfi_offset w19, -24
+; CHECK0-NEXT:    .cfi_offset w26, -32
+; CHECK0-NEXT:    .cfi_offset w27, -40
+; CHECK0-NEXT:    .cfi_offset w28, -48
 ; CHECK0-NEXT:    .cfi_offset w30, -56
 ; CHECK0-NEXT:    .cfi_offset w29, -64
 ; CHECK0-NEXT:    addvl sp, sp, #-18
@@ -3626,12 +3637,14 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x40, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 64 - 64 * VG
 ; CHECK0-NEXT:    sub x9, sp, #1024
 ; CHECK0-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK0-NEXT:    cntd x8
 ; CHECK0-NEXT:    mov w2, w1
+; CHECK0-NEXT:    str x8, [x29, #48]
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    and x19, x0, #0x1
-; CHECK0-NEXT:    .cfi_offset vg, -48
+; CHECK0-NEXT:    .cfi_offset vg, -16
 ; CHECK0-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
 ; CHECK0-NEXT:    smstop sm
@@ -3685,8 +3698,8 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0-NEXT:    .cfi_restore z15
 ; CHECK0-NEXT:    mov sp, x29
 ; CHECK0-NEXT:    .cfi_def_cfa wsp, 64
-; CHECK0-NEXT:    ldp x26, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x26, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x28, x27, [sp, #16] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp x29, x30, [sp], #64 // 16-byte Folded Reload
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
@@ -3701,17 +3714,15 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64:       // %bb.0: // %entry
 ; CHECK64-NEXT:    sub sp, sp, #128
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 128
-; CHECK64-NEXT:    cntd x9
 ; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x27, x26, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT:    str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK64-NEXT:    stp x28, x27, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x26, x19, [sp, #96] // 16-byte Folded Spill
 ; CHECK64-NEXT:    add x29, sp, #64
 ; CHECK64-NEXT:    .cfi_def_cfa w29, 64
-; CHECK64-NEXT:    .cfi_offset w19, -16
-; CHECK64-NEXT:    .cfi_offset w26, -24
-; CHECK64-NEXT:    .cfi_offset w27, -32
-; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset w19, -24
+; CHECK64-NEXT:    .cfi_offset w26, -32
+; CHECK64-NEXT:    .cfi_offset w27, -40
+; CHECK64-NEXT:    .cfi_offset w28, -48
 ; CHECK64-NEXT:    .cfi_offset w30, -56
 ; CHECK64-NEXT:    .cfi_offset w29, -64
 ; CHECK64-NEXT:    addvl sp, sp, #-18
@@ -3753,12 +3764,14 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0x80, 0x7f, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 128 - 64 * VG
 ; CHECK64-NEXT:    sub x9, sp, #1088
 ; CHECK64-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK64-NEXT:    cntd x8
 ; CHECK64-NEXT:    mov w2, w1
+; CHECK64-NEXT:    str x8, [x29, #48]
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    and x19, x0, #0x1
-; CHECK64-NEXT:    .cfi_offset vg, -48
+; CHECK64-NEXT:    .cfi_offset vg, -16
 ; CHECK64-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
 ; CHECK64-NEXT:    smstop sm
@@ -3813,8 +3826,8 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT:    .cfi_restore z15
 ; CHECK64-NEXT:    sub sp, x29, #64
 ; CHECK64-NEXT:    .cfi_def_cfa wsp, 128
-; CHECK64-NEXT:    ldp x26, x19, [sp, #104] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp x28, x27, [sp, #88] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x26, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x28, x27, [sp, #80] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK64-NEXT:    add sp, sp, #128
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
@@ -3830,20 +3843,18 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024:       // %bb.0: // %entry
 ; CHECK1024-NEXT:    sub sp, sp, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x28, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x27, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x26, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1064] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    add x29, sp, #1024
 ; CHECK1024-NEXT:    .cfi_def_cfa w29, 64
-; CHECK1024-NEXT:    .cfi_offset w19, -16
-; CHECK1024-NEXT:    .cfi_offset w26, -24
-; CHECK1024-NEXT:    .cfi_offset w27, -32
-; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset w19, -24
+; CHECK1024-NEXT:    .cfi_offset w26, -32
+; CHECK1024-NEXT:    .cfi_offset w27, -40
+; CHECK1024-NEXT:    .cfi_offset w28, -48
 ; CHECK1024-NEXT:    .cfi_offset w30, -56
 ; CHECK1024-NEXT:    .cfi_offset w29, -64
 ; CHECK1024-NEXT:    addvl sp, sp, #-18
@@ -3885,12 +3896,14 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x11, 0xc0, 0x77, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 1088 - 64 * VG
 ; CHECK1024-NEXT:    sub x9, sp, #2048
 ; CHECK1024-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK1024-NEXT:    cntd x8
 ; CHECK1024-NEXT:    mov w2, w1
+; CHECK1024-NEXT:    str x8, [x29, #48]
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    and x19, x0, #0x1
-; CHECK1024-NEXT:    .cfi_offset vg, -48
+; CHECK1024-NEXT:    .cfi_offset vg, -16
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
 ; CHECK1024-NEXT:    smstop sm
@@ -3945,10 +3958,10 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024-NEXT:    .cfi_restore z15
 ; CHECK1024-NEXT:    sub sp, x29, #1024
 ; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT:    ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x19, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x26, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x27, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1040] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    add sp, sp, #1088
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index f1e684c86e896..5408d72ecb602 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -22,14 +22,15 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT:    cntd x8
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x8, [sp, #72]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
@@ -72,13 +73,14 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NO-SME-ROUTINES-LABEL: se_memset:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    cntd x8
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x8, [sp, #72]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov w1, #2 // =0x2
@@ -121,14 +123,15 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NO-SME-ROUTINES-LABEL: se_memmove:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT:    cntd x8
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x8, [sp, #72]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
@@ -172,13 +175,13 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT:    cntd x8
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x8, [sp, #80]
 ; CHECK-NO-SME-ROUTINES-NEXT:    bl __arm_sme_state
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x8, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    and x19, x0, #0x1
@@ -195,10 +198,9 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
 ; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_4: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ret
@@ -222,15 +224,16 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: sb_memcpy:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    lsr x9, x9, #3
+; CHECK-NEXT:    lsr x8, x8, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x2, x0
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str x8, [sp, #72]
+; CHECK-NEXT:    str x9, [sp, #80]
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    adrp x0, :got:dst
 ; CHECK-NEXT:    adrp x1, :got:src
@@ -248,15 +251,16 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    rdsvl x9, #1
+; CHECK-NO-SME-ROUTINES-NEXT:    rdsvl x8, #1
+; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT:    lsr x9, x9, #3
+; CHECK-NO-SME-ROUTINES-NEXT:    lsr x8, x8, #3
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
-; CHECK-NO-SME-ROUTINES-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x8, [sp, #72]
+; CHECK-NO-SME-ROUTINES-NEXT:    str x9, [sp, #80]
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
@@ -273,15 +277,15 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ;
 ; CHECK-MOPS-LABEL: sb_memcpy:
 ; CHECK-MOPS:       // %bb.0: // %entry
-; CHECK-MOPS-NEXT:    rdsvl x9, #1
-; CHECK-MOPS-NEXT:    lsr x9, x9, #3
-; CHECK-MOPS-NEXT:    str x9, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    rdsvl x8, #1
 ; CHECK-MOPS-NEXT:    cntd x9
-; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    lsr x8, x8, #3
+; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    str x8, [sp, #64]
+; CHECK-MOPS-NEXT:    str x9, [sp, #72]
 ; CHECK-MOPS-NEXT:    smstart sm
 ; CHECK-MOPS-NEXT:    adrp x8, :got:src
 ; CHECK-MOPS-NEXT:    adrp x9, :got:dst
@@ -291,11 +295,10 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
 ; CHECK-MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
 ; CHECK-MOPS-NEXT:    smstop sm
-; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    add sp, sp, #80
+; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index c5cf4593cc86d..3ed6d719fd556 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -372,14 +372,13 @@ entry:
 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: svecc_call:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #-48]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 48
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x27, x19, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w27, -16
-; CHECK-NEXT:    .cfi_offset w28, -24
+; CHECK-NEXT:    stp x30, x28, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x27, x19, [sp, #24] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w27, -24
+; CHECK-NEXT:    .cfi_offset w28, -32
 ; CHECK-NEXT:    .cfi_offset w30, -40
 ; CHECK-NEXT:    .cfi_offset w29, -48
 ; CHECK-NEXT:    addvl sp, sp, #-18
@@ -420,12 +419,15 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
+; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    addvl x10, sp, #18
 ; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    str x9, [x10, #40]
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    and x19, x0, #0x1
-; CHECK-NEXT:    .cfi_offset vg, -32
+; CHECK-NEXT:    .cfi_offset vg, -8
 ; CHECK-NEXT:    tbz w19, #0, .LBB7_2
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    smstop sm
@@ -479,9 +481,9 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK-NEXT:    .cfi_restore z13
 ; CHECK-NEXT:    .cfi_restore z14
 ; CHECK-NEXT:    .cfi_restore z15
-; CHECK-NEXT:    ldp x27, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x27, x19, [sp, #24] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x28, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp], #48 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
 ; CHECK-NEXT:    .cfi_restore w27
@@ -500,9 +502,9 @@ declare ptr @memset(ptr, i32, i32)
 ; objects, we emit correct offsets for all objects except for these VLA objects.
 
 ; CHECK-FRAMELAYOUT-LABEL: Function: vastate
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8
@@ -521,17 +523,15 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 112
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    add x29, sp, #64
 ; CHECK-NEXT:    .cfi_def_cfa w29, 48
-; CHECK-NEXT:    .cfi_offset w19, -8
-; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset w19, -24
+; CHECK-NEXT:    .cfi_offset w20, -32
 ; CHECK-NEXT:    .cfi_offset w30, -40
 ; CHECK-NEXT:    .cfi_offset w29, -48
 ; CHECK-NEXT:    .cfi_offset b8, -56
@@ -547,6 +547,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    mov w20, w0
 ; CHECK-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEXT:    cntd x10
+; CHECK-NEXT:    str x10, [x29, #32]
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-80]
 ; CHECK-NEXT:    sub x9, x29, #80
@@ -554,7 +556,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEXT:    stur wzr, [x29, #-68]
 ; CHECK-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
-; CHECK-NEXT:    .cfi_offset vg, -32
+; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl other
 ; CHECK-NEXT:    smstart sm
@@ -570,7 +572,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    sub sp, x29, #64
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 112
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload


