[llvm] 478b4b0 - [AArch64][SME] Rework VG CFI information for streaming-mode changes (#152283)

via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 20 06:06:17 PDT 2025


Author: Benjamin Maxwell
Date: 2025-08-20T14:06:12+01:00
New Revision: 478b4b012f6c70f799ffeb3523b5a160aed8726b

URL: https://github.com/llvm/llvm-project/commit/478b4b012f6c70f799ffeb3523b5a160aed8726b
DIFF: https://github.com/llvm/llvm-project/commit/478b4b012f6c70f799ffeb3523b5a160aed8726b.diff

LOG: [AArch64][SME] Rework VG CFI information for streaming-mode changes (#152283)

This patch reworks how VG is handled around streaming mode changes.

Previously, for functions with streaming mode changes, we would:

- Save the incoming VG in the prologue
- Emit `.cfi_offset vg, <offset>` and `.cfi_restore vg` around streaming
  mode changes

Additionally, for locally streaming functions, we would:

- Also save the streaming VG in the prologue
- Emit `.cfi_offset vg, <incoming VG offset>` in the prologue
- Emit `.cfi_offset vg, <streaming VG offset>` and `.cfi_restore vg`
  around streaming mode changes

In both cases, this ends up doing more than necessary and would be hard
for an unwinder to parse, as using `.cfi_offset` in this way does not
follow the semantics of the underlying DWARF CFI opcodes.

So the new scheme in this patch is to:

In functions with streaming mode changes (inc locally streaming)

- Save the incoming VG in the prologue
- Emit `.cfi_offset vg, <offset>` in the prologue (not at streaming mode
  changes)
- Emit `.cfi_restore vg` after the saved VG has been deallocated 
- This will be in the function epilogue, where VG is always the same as
  the entry VG
- Explicitly reference the incoming VG expressions for SVE callee-saves
in functions with streaming mode changes
- Ensure the CFA is not described in terms of VG in functions with
  streaming mode changes

A more in-depth discussion of this scheme is available in:
https://gist.github.com/MacDue/b7a5c45d131d2440858165bfc903e97b

But the TLDR is that following this scheme, SME unwinding can be
implemented with minimal changes to existing unwinders. All unwinders
need to do is initialize VG to `CNTD` at the start of unwinding, then
everything else is handled by standard opcodes (which don't need changes
to handle VG).

Added: 
    llvm/test/CodeGen/AArch64/sme-streaming-mode-changes-unwindinfo.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/lib/Target/AArch64/AArch64InstrInfo.h
    llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
    llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
    llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
    llvm/test/CodeGen/AArch64/aarch64-sme-za-call-lowering.ll
    llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
    llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
    llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
    llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
    llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll
    llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
    llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
    llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll
    llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
    llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
    llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
    llvm/test/CodeGen/AArch64/sme-streaming-body.ll
    llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
    llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
    llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
    llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
    llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
    llvm/test/CodeGen/AArch64/stack-hazard.ll
    llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
    llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index fddde668b7f1a..7725fa4f1ccb1 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -338,7 +338,7 @@ static bool requiresSaveVG(const MachineFunction &MF);
 // Conservatively, returns true if the function is likely to have an SVE vectors
 // on the stack. This function is safe to be called before callee-saves or
 // object offsets have been determined.
-static bool isLikelyToHaveSVEStack(MachineFunction &MF) {
+static bool isLikelyToHaveSVEStack(const MachineFunction &MF) {
   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   if (AFI->isSVECC())
     return true;
@@ -532,6 +532,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
 bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
 
   // Win64 EH requires a frame pointer if funclets are present, as the locals
   // are accessed off the frame pointer in both the parent function and the
@@ -545,6 +546,29 @@ bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const {
       MFI.hasStackMap() || MFI.hasPatchPoint() ||
       RegInfo->hasStackRealignment(MF))
     return true;
+
+  // If we:
+  //
+  //   1. Have streaming mode changes
+  //     OR:
+  //   2. Have a streaming body with SVE stack objects
+  //
+  // Then the value of VG restored when unwinding to this function may not match
+  // the value of VG used to set up the stack.
+  //
+  // This is a problem as the CFA can be described with an expression of the
+  // form: CFA = SP + NumBytes + VG * NumScalableBytes.
+  //
+  // If the value of VG used in that expression does not match the value used to
+  // set up the stack, an incorrect address for the CFA will be computed, and
+  // unwinding will fail.
+  //
+  // We work around this issue by ensuring the frame-pointer can describe the
+  // CFA in either of these cases.
+  if (AFI.needsDwarfUnwindInfo(MF) &&
+      ((requiresSaveVG(MF) || AFI.getSMEFnAttrs().hasStreamingBody()) &&
+       (!AFI.hasCalculatedStackSizeSVE() || AFI.getStackSizeSVE() > 0)))
+    return true;
   // With large callframes around we may need to use FP to access the scavenging
   // emergency spillslot.
   //
@@ -663,10 +687,6 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  SMEAttrs Attrs = AFI->getSMEFnAttrs();
-  bool LocallyStreaming =
-      Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface();
 
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
   if (CSI.empty())
@@ -680,14 +700,6 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
 
     assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
     int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
-
-    // The location of VG will be emitted before each streaming-mode change in
-    // the function. Only locally-streaming functions require emitting the
-    // non-streaming VG location here.
-    if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) ||
-        (!LocallyStreaming && Info.getReg() == AArch64::VG))
-      continue;
-
     CFIBuilder.buildOffset(Info.getReg(), Offset);
   }
 }
@@ -707,8 +719,16 @@ void AArch64FrameLowering::emitCalleeSavedSVELocations(
   AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
   CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup);
 
+  std::optional<int64_t> IncomingVGOffsetFromDefCFA;
+  if (requiresSaveVG(MF)) {
+    auto IncomingVG = *find_if(
+        reverse(CSI), [](auto &Info) { return Info.getReg() == AArch64::VG; });
+    IncomingVGOffsetFromDefCFA =
+        MFI.getObjectOffset(IncomingVG.getFrameIdx()) - getOffsetOfLocalArea();
+  }
+
   for (const auto &Info : CSI) {
-    if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
+    if (MFI.getStackID(Info.getFrameIdx()) != TargetStackID::ScalableVector)
       continue;
 
     // Not all unwinders may know about SVE registers, so assume the lowest
@@ -722,7 +742,8 @@ void AArch64FrameLowering::emitCalleeSavedSVELocations(
         StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
         StackOffset::getFixed(AFI.getCalleeSavedStackSize(MFI));
 
-    CFIBuilder.insertCFIInst(createCFAOffset(TRI, Reg, Offset));
+    CFIBuilder.insertCFIInst(
+        createCFAOffset(TRI, Reg, Offset, IncomingVGOffsetFromDefCFA));
   }
 }
 
@@ -783,9 +804,6 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
         !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
       continue;
 
-    if (!Info.isRestored())
-      continue;
-
     CFIBuilder.buildRestore(Info.getReg());
   }
 }
@@ -1465,10 +1483,10 @@ bool requiresGetVGCall(MachineFunction &MF) {
 
 static bool requiresSaveVG(const MachineFunction &MF) {
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  if (!AFI->needsDwarfUnwindInfo(MF) || !AFI->hasStreamingModeChanges())
+    return false;
   // For Darwin platforms we don't save VG for non-SVE functions, even if SME
   // is enabled with streaming mode changes.
-  if (!AFI->hasStreamingModeChanges())
-    return false;
   auto &ST = MF.getSubtarget<AArch64Subtarget>();
   if (ST.isTargetDarwin())
     return ST.hasSVE();
@@ -1484,8 +1502,7 @@ static bool matchLibcall(const TargetLowering &TLI, const MachineOperand &MO,
 bool isVGInstruction(MachineBasicBlock::iterator MBBI,
                      const TargetLowering &TLI) {
   unsigned Opc = MBBI->getOpcode();
-  if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
-      Opc == AArch64::UBFMXri)
+  if (Opc == AArch64::CNTD_XPiI)
     return true;
 
   if (!requiresGetVGCall(*MBBI->getMF()))
@@ -1494,7 +1511,7 @@ bool isVGInstruction(MachineBasicBlock::iterator MBBI,
   if (Opc == AArch64::BL)
     return matchLibcall(TLI, MBBI->getOperand(0), RTLIB::SMEABI_GET_CURRENT_VG);
 
-  return Opc == AArch64::ORRXrr;
+  return Opc == TargetOpcode::COPY;
 }
 
 // Convert callee-save register save/restore instruction to do stack pointer
@@ -1509,9 +1526,8 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   unsigned NewOpc;
 
   // If the function contains streaming mode changes, we expect instructions
-  // to calculate the value of VG before spilling. For locally-streaming
-  // functions, we need to do this for both the streaming and non-streaming
-  // vector length. Move past these instructions if necessary.
+  // to calculate the value of VG before spilling. Move past these instructions
+  // if necessary.
   MachineFunction &MF = *MBB.getParent();
   if (requiresSaveVG(MF)) {
     auto &TLI = *MF.getSubtarget().getTargetLowering();
@@ -3475,7 +3491,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
   MachineFunction &MF = *MBB.getParent();
   auto &TLI = *MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   bool NeedsWinCFI = needsWinCFI(MF);
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
@@ -3544,48 +3559,34 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     }
 
     unsigned X0Scratch = AArch64::NoRegister;
+    auto RestoreX0 = make_scope_exit([&] {
+      if (X0Scratch != AArch64::NoRegister)
+        BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), AArch64::X0)
+            .addReg(X0Scratch)
+            .setMIFlag(MachineInstr::FrameSetup);
+    });
+
     if (Reg1 == AArch64::VG) {
       // Find an available register to store value of VG to.
       Reg1 = findScratchNonCalleeSaveRegister(&MBB, true);
       assert(Reg1 != AArch64::NoRegister);
-      SMEAttrs Attrs = AFI->getSMEFnAttrs();
-
-      if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
-          AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
-        // For locally-streaming functions, we need to store both the streaming
-        // & non-streaming VG. Spill the streaming value first.
-        BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
-            .addImm(1)
-            .setMIFlag(MachineInstr::FrameSetup);
-        BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1)
-            .addReg(Reg1)
-            .addImm(3)
-            .addImm(63)
-            .setMIFlag(MachineInstr::FrameSetup);
-
-        AFI->setStreamingVGIdx(RPI.FrameIdx);
-      } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+      if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
         BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
             .addImm(31)
             .addImm(1)
             .setMIFlag(MachineInstr::FrameSetup);
-        AFI->setVGIdx(RPI.FrameIdx);
       } else {
         const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
-        if (llvm::any_of(
-                MBB.liveins(),
-                [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
-                  return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
-                      AArch64::X0, LiveIn.PhysReg);
-                }))
+        if (any_of(MBB.liveins(),
+                   [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+                     return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+                         AArch64::X0, LiveIn.PhysReg);
+                   })) {
           X0Scratch = Reg1;
-
-        if (X0Scratch != AArch64::NoRegister)
-          BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
-              .addReg(AArch64::XZR)
-              .addReg(AArch64::X0, RegState::Undef)
-              .addReg(AArch64::X0, RegState::Implicit)
+          BuildMI(MBB, MI, DL, TII.get(TargetOpcode::COPY), X0Scratch)
+              .addReg(AArch64::X0)
               .setMIFlag(MachineInstr::FrameSetup);
+        }
 
         RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG;
         const uint32_t *RegMask =
@@ -3596,7 +3597,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
             .addReg(AArch64::X0, RegState::ImplicitDefine)
             .setMIFlag(MachineInstr::FrameSetup);
         Reg1 = AArch64::X0;
-        AFI->setVGIdx(RPI.FrameIdx);
       }
     }
 
@@ -3691,13 +3691,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       if (RPI.isPaired())
         MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
     }
-
-    if (X0Scratch != AArch64::NoRegister)
-      BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0)
-          .addReg(AArch64::XZR)
-          .addReg(X0Scratch, RegState::Undef)
-          .addReg(X0Scratch, RegState::Implicit)
-          .setMIFlag(MachineInstr::FrameSetup);
   }
   return true;
 }
@@ -4076,15 +4069,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 
   // Increase the callee-saved stack size if the function has streaming mode
   // changes, as we will need to spill the value of the VG register.
-  // For locally streaming functions, we spill both the streaming and
-  // non-streaming VG value.
-  SMEAttrs Attrs = AFI->getSMEFnAttrs();
-  if (requiresSaveVG(MF)) {
-    if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
-      CSStackSize += 16;
-    else
-      CSStackSize += 8;
-  }
+  if (requiresSaveVG(MF))
+    CSStackSize += 8;
 
   // Determine if a Hazard slot should be used, and increase the CSStackSize by
   // StackHazardSize if so.
@@ -4235,29 +4221,13 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
 
   // Insert VG into the list of CSRs, immediately before LR if saved.
   if (requiresSaveVG(MF)) {
-    std::vector<CalleeSavedInfo> VGSaves;
-    SMEAttrs Attrs = AFI->getSMEFnAttrs();
-
-    auto VGInfo = CalleeSavedInfo(AArch64::VG);
-    VGInfo.setRestored(false);
-    VGSaves.push_back(VGInfo);
-
-    // Add VG again if the function is locally-streaming, as we will spill two
-    // values.
-    if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
-      VGSaves.push_back(VGInfo);
-
-    bool InsertBeforeLR = false;
-
-    for (unsigned I = 0; I < CSI.size(); I++)
-      if (CSI[I].getReg() == AArch64::LR) {
-        InsertBeforeLR = true;
-        CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end());
-        break;
-      }
-
-    if (!InsertBeforeLR)
-      llvm::append_range(CSI, VGSaves);
+    CalleeSavedInfo VGInfo(AArch64::VG);
+    auto It =
+        find_if(CSI, [](auto &Info) { return Info.getReg() == AArch64::LR; });
+    if (It != CSI.end())
+      CSI.insert(It, VGInfo);
+    else
+      CSI.push_back(VGInfo);
   }
 
   Register LastReg = 0;
@@ -5260,46 +5230,11 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
 }
 } // namespace
 
-static void emitVGSaveRestore(MachineBasicBlock::iterator II,
-                              const AArch64FrameLowering *TFI) {
-  MachineInstr &MI = *II;
-  MachineBasicBlock *MBB = MI.getParent();
-  MachineFunction *MF = MBB->getParent();
-
-  if (MI.getOpcode() != AArch64::VGSavePseudo &&
-      MI.getOpcode() != AArch64::VGRestorePseudo)
-    return;
-
-  auto *AFI = MF->getInfo<AArch64FunctionInfo>();
-  SMEAttrs FuncAttrs = AFI->getSMEFnAttrs();
-  bool LocallyStreaming =
-      FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
-
-  int64_t VGFrameIdx =
-      LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
-  assert(VGFrameIdx != std::numeric_limits<int>::max() &&
-         "Expected FrameIdx for VG");
-
-  CFIInstBuilder CFIBuilder(*MBB, II, MachineInstr::NoFlags);
-  if (MI.getOpcode() == AArch64::VGSavePseudo) {
-    const MachineFrameInfo &MFI = MF->getFrameInfo();
-    int64_t Offset =
-        MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea();
-    CFIBuilder.buildOffset(AArch64::VG, Offset);
-  } else {
-    CFIBuilder.buildRestore(AArch64::VG);
-  }
-
-  MI.eraseFromParent();
-}
-
 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
     MachineFunction &MF, RegScavenger *RS = nullptr) const {
   for (auto &BB : MF)
     for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
-      if (requiresSaveVG(MF))
-        emitVGSaveRestore(II++, this);
-      else if (StackTaggingMergeSetTag)
+      if (StackTaggingMergeSetTag)
         II = tryMergeAdjacentSTG(II, this, RS);
     }
 

diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index fd91e52282475..d168cc8d1bd06 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9517,17 +9517,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   SDValue InGlue;
   if (RequiresSMChange) {
-    if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
-      Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
-                          DAG.getVTList(MVT::Other, MVT::Glue), Chain);
-      InGlue = Chain.getValue(1);
-    }
-
-    SDValue NewChain =
+    Chain =
         changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
                             Chain, InGlue, getSMToggleCondition(CallAttrs));
-    Chain = NewChain.getValue(0);
-    InGlue = NewChain.getValue(1);
+    InGlue = Chain.getValue(1);
   }
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
@@ -9712,13 +9705,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     Result = changeStreamingMode(
         DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
         getSMToggleCondition(CallAttrs));
-
-    if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
-      InGlue = Result.getValue(1);
-      Result =
-          DAG.getNode(AArch64ISD::VG_RESTORE, DL,
-                      DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
-    }
   }
 
   if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index d15f90deba74e..db028b4b7677c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5900,6 +5900,18 @@ static void appendReadRegExpr(SmallVectorImpl<char> &Expr, unsigned RegNum) {
   Expr.push_back(0);
 }
 
+// Convenience function to create a DWARF expression for loading a register from
+// a CFA offset.
+static void appendLoadRegExpr(SmallVectorImpl<char> &Expr,
+                              int64_t OffsetFromDefCFA) {
+  // This assumes the top of the DWARF stack contains the CFA.
+  Expr.push_back(dwarf::DW_OP_dup);
+  // Add the offset to the register.
+  appendConstantExpr(Expr, OffsetFromDefCFA, dwarf::DW_OP_plus);
+  // Dereference the address (loads a 64 bit value)..
+  Expr.push_back(dwarf::DW_OP_deref);
+}
+
 // Convenience function to create a comment for
 //  (+/-) NumBytes (* RegScale)?
 static void appendOffsetComment(int NumBytes, llvm::raw_string_ostream &Comment,
@@ -5968,9 +5980,10 @@ MCCFIInstruction llvm::createDefCFA(const TargetRegisterInfo &TRI,
   return MCCFIInstruction::cfiDefCfa(nullptr, DwarfReg, (int)Offset.getFixed());
 }
 
-MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
-                                       unsigned Reg,
-                                       const StackOffset &OffsetFromDefCFA) {
+MCCFIInstruction
+llvm::createCFAOffset(const TargetRegisterInfo &TRI, unsigned Reg,
+                      const StackOffset &OffsetFromDefCFA,
+                      std::optional<int64_t> IncomingVGOffsetFromDefCFA) {
   int64_t NumBytes, NumVGScaledBytes;
   AArch64InstrInfo::decomposeStackOffsetForDwarfOffsets(
       OffsetFromDefCFA, NumBytes, NumVGScaledBytes);
@@ -5989,9 +6002,16 @@ MCCFIInstruction llvm::createCFAOffset(const TargetRegisterInfo &TRI,
   assert(NumVGScaledBytes && "Expected scalable offset");
   SmallString<64> OffsetExpr;
   // + VG * NumVGScaledBytes
-  appendOffsetComment(NumVGScaledBytes, Comment, "* VG");
-  appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
+  StringRef VGRegScale;
+  if (IncomingVGOffsetFromDefCFA) {
+    appendLoadRegExpr(OffsetExpr, *IncomingVGOffsetFromDefCFA);
+    VGRegScale = "* IncomingVG";
+  } else {
+    appendReadRegExpr(OffsetExpr, TRI.getDwarfRegNum(AArch64::VG, true));
+    VGRegScale = "* VG";
+  }
   appendConstantExpr(OffsetExpr, NumVGScaledBytes, dwarf::DW_OP_mul);
+  appendOffsetComment(NumVGScaledBytes, Comment, VGRegScale);
   OffsetExpr.push_back(dwarf::DW_OP_plus);
   if (NumBytes) {
     // + NumBytes

diff  --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 70c814a3a48c9..179574a73aa01 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -646,8 +646,10 @@ bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
 MCCFIInstruction createDefCFA(const TargetRegisterInfo &TRI, unsigned FrameReg,
                               unsigned Reg, const StackOffset &Offset,
                               bool LastAdjustmentWasScalable = true);
-MCCFIInstruction createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg,
-                                 const StackOffset &OffsetFromDefCFA);
+MCCFIInstruction
+createCFAOffset(const TargetRegisterInfo &MRI, unsigned Reg,
+                const StackOffset &OffsetFromDefCFA,
+                std::optional<int64_t> IncomingVGOffsetFromDefCFA);
 
 /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
 /// plus Offset.  This is intended to be used from within the prolog/epilog

diff  --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index ed3374ae68d00..1fde87e65a34b 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -235,10 +235,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
   unsigned PredicateRegForFillSpill = 0;
 
-  // The stack slots where VG values are stored to.
-  int64_t VGIdx = std::numeric_limits<int>::max();
-  int64_t StreamingVGIdx = std::numeric_limits<int>::max();
-
   // Holds the SME function attributes (streaming mode, ZA/ZT0 state).
   SMEAttrs SMEFnAttrs;
 
@@ -280,12 +276,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   unsigned isPStateSMRegUsed() const { return PStateSMRegUsed; };
   void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; };
 
-  int64_t getVGIdx() const { return VGIdx; };
-  void setVGIdx(unsigned Idx) { VGIdx = Idx; };
-
-  int64_t getStreamingVGIdx() const { return StreamingVGIdx; };
-  void setStreamingVGIdx(unsigned FrameIdx) { StreamingVGIdx = FrameIdx; };
-
   bool isSVECC() const { return IsSVECC; };
   void setIsSVECC(bool s) { IsSVECC = s; };
 

diff  --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 5c4e0c1093187..125225df15464 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -48,12 +48,6 @@ let usesCustomInserter = 1 in {
 }
 def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>;
 
-def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
-                           [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
-
-def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
-                              [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
-
 //===----------------------------------------------------------------------===//
 // Old SME ABI lowering ISD nodes/pseudos (deprecated)
 //===----------------------------------------------------------------------===//
@@ -362,16 +356,6 @@ def : Pat<(AArch64_smstart (i32 svcr_op:$pstate)),
 def : Pat<(AArch64_smstop (i32 svcr_op:$pstate)),
           (MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
 
-
-// Pseudo to insert cfi_offset/cfi_restore instructions. Used to save or restore
-// the streaming value of VG around streaming-mode changes in locally-streaming
-// functions.
-def VGSavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
-def : Pat<(AArch64VGSave), (VGSavePseudo)>;
-
-def VGRestorePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
-def : Pat<(AArch64VGRestore), (VGRestorePseudo)>;
-
 //===----------------------------------------------------------------------===//
 // SME2 Instructions
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index 564af6708e1ed..85cca1de47b78 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -128,13 +128,6 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
 
   bool Changed = false;
   MachineInstr *Prev = nullptr;
-  SmallVector<MachineInstr *, 4> ToBeRemoved;
-
-  // Convenience function to reset the matching of a sequence.
-  auto Reset = [&]() {
-    Prev = nullptr;
-    ToBeRemoved.clear();
-  };
 
   // Walk through instructions in the block trying to find pairs of smstart
   // and smstop nodes that cancel each other out. We only permit a limited
@@ -156,14 +149,10 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
         // that we marked for deletion in between.
         Prev->eraseFromParent();
         MI.eraseFromParent();
-        for (MachineInstr *TBR : ToBeRemoved)
-          TBR->eraseFromParent();
-        ToBeRemoved.clear();
         Prev = nullptr;
         Changed = true;
         NumSMChangesRemoved += 2;
       } else {
-        Reset();
         Prev = &MI;
       }
       continue;
@@ -179,7 +168,7 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
     // of streaming mode. If not, the algorithm should reset.
     switch (MI.getOpcode()) {
     default:
-      Reset();
+      Prev = nullptr;
       break;
     case AArch64::COALESCER_BARRIER_FPR16:
     case AArch64::COALESCER_BARRIER_FPR32:
@@ -193,7 +182,7 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
       // concrete example/test-case.
       if (isSVERegOp(TRI, MRI, MI.getOperand(0)) ||
           isSVERegOp(TRI, MRI, MI.getOperand(1)))
-        Reset();
+        Prev = nullptr;
       break;
     case AArch64::ADJCALLSTACKDOWN:
     case AArch64::ADJCALLSTACKUP:
@@ -201,12 +190,6 @@ bool SMEPeepholeOpt::optimizeStartStopPairs(
     case AArch64::ADDXri:
       // We permit these as they don't generate SVE/NEON instructions.
       break;
-    case AArch64::VGRestorePseudo:
-    case AArch64::VGSavePseudo:
-      // When the smstart/smstop are removed, we should also remove
-      // the pseudos that save/restore the VG value for CFI info.
-      ToBeRemoved.push_back(&MI);
-      break;
     case AArch64::MSRpstatesvcrImm1:
     case AArch64::MSRpstatePseudo:
       llvm_unreachable("Should have been handled");

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-sme-za-call-lowering.ll b/llvm/test/CodeGen/AArch64/aarch64-sme-za-call-lowering.ll
index eb23a0f77accf..0b8645f66b5f3 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-sme-za-call-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-sme-za-call-lowering.ll
@@ -46,12 +46,10 @@ define void @requires_za_save_streaming_mode_change() nounwind "aarch64_inout_za
   ; CHECK-BEFORE-SMEABI: bb.0 (%ir-block.0):
   ; CHECK-BEFORE-SMEABI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-BEFORE-SMEABI-NEXT:   RequiresZASavePseudo
-  ; CHECK-BEFORE-SMEABI-NEXT:   VGSavePseudo
   ; CHECK-BEFORE-SMEABI-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-BEFORE-SMEABI-NEXT:   BL @private_za_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
   ; CHECK-BEFORE-SMEABI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-BEFORE-SMEABI-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
-  ; CHECK-BEFORE-SMEABI-NEXT:   VGRestorePseudo
   ; CHECK-BEFORE-SMEABI-NEXT:   RET_ReallyLR
   ;
   ; CHECK-AFTER-SMEABI-LABEL: name: requires_za_save_streaming_mode_change
@@ -66,12 +64,10 @@ define void @requires_za_save_streaming_mode_change() nounwind "aarch64_inout_za
   ; CHECK-AFTER-SMEABI-NEXT:   MSR 56965, [[COPY1]]
   ; CHECK-AFTER-SMEABI-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-AFTER-SMEABI-NEXT:   RequiresZASavePseudo
-  ; CHECK-AFTER-SMEABI-NEXT:   VGSavePseudo
   ; CHECK-AFTER-SMEABI-NEXT:   MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit $vg, implicit-def $vg, implicit-def $fpmr
   ; CHECK-AFTER-SMEABI-NEXT:   BL @private_za_callee, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
   ; CHECK-AFTER-SMEABI-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
   ; CHECK-AFTER-SMEABI-NEXT:   MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
-  ; CHECK-AFTER-SMEABI-NEXT:   VGRestorePseudo
   ; CHECK-AFTER-SMEABI-NEXT:   MSRpstatesvcrImm1 2, 1, implicit-def $nzcv
   ; CHECK-AFTER-SMEABI-NEXT:   [[MRS:%[0-9]+]]:gpr64 = MRS 56965, implicit-def $nzcv
   ; CHECK-AFTER-SMEABI-NEXT:   $x0 = ADDXri %stack.0, 0, 0

diff  --git a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
index 94fe06733347a..22774ebf1a662 100644
--- a/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
+++ b/llvm/test/CodeGen/AArch64/outlining-with-streaming-mode-changes.ll
@@ -7,11 +7,10 @@ define void @streaming_mode_change1() #0 {
 ; CHECK-LABEL: streaming_mode_change1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -24,7 +23,6 @@ define void @streaming_mode_change1() #0 {
 ;
 ; OUTLINER-LABEL: streaming_mode_change1:
 ; OUTLINER-NOT: OUTLINED_FUNCTION_
-;
   call void @callee();
   ret void;
 }
@@ -33,11 +31,10 @@ define void @streaming_mode_change2() #0 {
 ; CHECK-LABEL: streaming_mode_change2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -50,7 +47,6 @@ define void @streaming_mode_change2() #0 {
 ;
 ; OUTLINER-LABEL: streaming_mode_change2:
 ; OUTLINER-NOT: OUTLINED_FUNCTION_
-;
   call void @callee();
   ret void;
 }
@@ -59,11 +55,10 @@ define void @streaming_mode_change3() #0 {
 ; CHECK-LABEL: streaming_mode_change3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -76,7 +71,6 @@ define void @streaming_mode_change3() #0 {
 ;
 ; OUTLINER-LABEL: streaming_mode_change3:
 ; OUTLINER-NOT: OUTLINED_FUNCTION_
-;
   call void @callee();
   ret void;
 }

diff  --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 3579baae1d7d8..25a7b87d37d9e 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -88,18 +88,14 @@ define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "a
 define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __arm_get_current_vg
-; CHECK-NEXT:    str x0, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    mov x0, x9
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state_size
 ; CHECK-NEXT:    sub sp, sp, x0
 ; CHECK-NEXT:    mov x20, sp
@@ -123,12 +119,12 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 ; CHECK-NEXT:    bl __arm_sme_restore
 ; CHECK-NEXT:    mov x0, x1
 ; CHECK-NEXT:    sub sp, x29, #64
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call i64 @private_za_decl(i64 %v)
   %res2 = call i64 @private_za_decl(i64 %res)
@@ -139,18 +135,14 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nounwind "aarch64_za_state_agnostic" "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x9, x0
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    bl __arm_get_current_vg
-; CHECK-NEXT:    str x0, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    mov x0, x9
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    bl __arm_sme_state_size
@@ -188,12 +180,12 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK-NEXT:    bl __arm_sme_restore
 ; CHECK-NEXT:    mov x0, x1
 ; CHECK-NEXT:    sub sp, x29, #64
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call i64 @private_za_decl(i64 %v)
   %res2 = call i64 @private_za_decl(i64 %res)

diff  --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 1567ca258cccb..9bc5ee6988bcf 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -10,13 +10,11 @@ target triple = "aarch64"
 define void @streaming_compatible() #0 {
 ; CHECK-LABEL: streaming_compatible:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT:    bl __arm_get_current_vg
-; CHECK-NEXT:    stp x0, x19, [sp, #72] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB0_2
@@ -28,12 +26,11 @@ define void @streaming_compatible() #0 {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @non_streaming()
   ret void
@@ -47,14 +44,12 @@ declare void @non_streaming()
 define void @streaming_compatible_arg(float %f) #0 {
 ; CHECK-LABEL: streaming_compatible_arg:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    bl __arm_get_current_vg
-; CHECK-NEXT:    stp x0, x19, [sp, #88] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
@@ -68,13 +63,12 @@ define void @streaming_compatible_arg(float %f) #0 {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   call void @non_streaming(float %f)
   ret void

diff  --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index 1933eb85b77f2..8d6432ced8e1d 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -9,9 +9,8 @@ declare void @my_func2(<vscale x 16 x i8> %v)
 define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; NOPAIR-LABEL: fbyte:
 ; NOPAIR:       // %bb.0:
-; NOPAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; NOPAIR-NEXT:    cntd x9
-; NOPAIR-NEXT:    stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; NOPAIR-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; NOPAIR-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    addvl sp, sp, #-18
 ; NOPAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; NOPAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -85,15 +84,14 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; NOPAIR-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; NOPAIR-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; NOPAIR-NEXT:    addvl sp, sp, #18
-; NOPAIR-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
-; NOPAIR-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; NOPAIR-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; NOPAIR-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; NOPAIR-NEXT:    ret
 ;
 ; PAIR-LABEL: fbyte:
 ; PAIR:       // %bb.0:
-; PAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; PAIR-NEXT:    cntd x9
-; PAIR-NEXT:    stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; PAIR-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; PAIR-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
 ; PAIR-NEXT:    addvl sp, sp, #-18
 ; PAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -167,8 +165,8 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; PAIR-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; PAIR-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; PAIR-NEXT:    addvl sp, sp, #18
-; PAIR-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
-; PAIR-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; PAIR-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; PAIR-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; PAIR-NEXT:    ret
   call void @my_func2(<vscale x 16 x i8> %v)
   ret void
@@ -177,9 +175,7 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 define void @fhalf(<vscale x 8 x half> %v) #1{
 ; NOPAIR-LABEL: fhalf:
 ; NOPAIR:       // %bb.0:
-; NOPAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; NOPAIR-NEXT:    cntd x9
-; NOPAIR-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; NOPAIR-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; NOPAIR-NEXT:    addvl sp, sp, #-18
 ; NOPAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; NOPAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -241,14 +237,12 @@ define void @fhalf(<vscale x 8 x half> %v) #1{
 ; NOPAIR-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; NOPAIR-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; NOPAIR-NEXT:    addvl sp, sp, #18
-; NOPAIR-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; NOPAIR-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; NOPAIR-NEXT:    ret
 ;
 ; PAIR-LABEL: fhalf:
 ; PAIR:       // %bb.0:
-; PAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; PAIR-NEXT:    cntd x9
-; PAIR-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; PAIR-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; PAIR-NEXT:    addvl sp, sp, #-18
 ; PAIR-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    ptrue pn8.b
@@ -298,7 +292,7 @@ define void @fhalf(<vscale x 8 x half> %v) #1{
 ; PAIR-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; PAIR-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; PAIR-NEXT:    addvl sp, sp, #18
-; PAIR-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; PAIR-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; PAIR-NEXT:    ret
   call void @my_func()
   ret void
@@ -307,12 +301,7 @@ define void @fhalf(<vscale x 8 x half> %v) #1{
 define void @ffloat(<vscale x 4 x i32> %v) #2 {
 ; NOPAIR-LABEL: ffloat:
 ; NOPAIR:       // %bb.0:
-; NOPAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; NOPAIR-NEXT:    rdsvl x9, #1
-; NOPAIR-NEXT:    lsr x9, x9, #3
-; NOPAIR-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
-; NOPAIR-NEXT:    cntd x9
-; NOPAIR-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
+; NOPAIR-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; NOPAIR-NEXT:    addsvl sp, sp, #-18
 ; NOPAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; NOPAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -374,17 +363,12 @@ define void @ffloat(<vscale x 4 x i32> %v) #2 {
 ; NOPAIR-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; NOPAIR-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; NOPAIR-NEXT:    addsvl sp, sp, #18
-; NOPAIR-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; NOPAIR-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; NOPAIR-NEXT:    ret
 ;
 ; PAIR-LABEL: ffloat:
 ; PAIR:       // %bb.0:
-; PAIR-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; PAIR-NEXT:    rdsvl x9, #1
-; PAIR-NEXT:    lsr x9, x9, #3
-; PAIR-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
-; PAIR-NEXT:    cntd x9
-; PAIR-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
+; PAIR-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; PAIR-NEXT:    addsvl sp, sp, #-18
 ; PAIR-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; PAIR-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -446,7 +430,7 @@ define void @ffloat(<vscale x 4 x i32> %v) #2 {
 ; PAIR-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; PAIR-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; PAIR-NEXT:    addsvl sp, sp, #18
-; PAIR-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; PAIR-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; PAIR-NEXT:    ret
   call void @my_func()
   ret void

diff  --git a/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll b/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll
index a08e4896f5ee9..48ac156a43875 100644
--- a/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll
+++ b/llvm/test/CodeGen/AArch64/sme-darwin-sve-vg.ll
@@ -5,18 +5,17 @@ declare void @normal_callee();
 define void @locally_streaming_fn() #0 {
 ; CHECK-LABEL: locally_streaming_fn:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! ; 16-byte Folded Spill
+; CHECK-NEXT:    stp     d15, d14, [sp, #-96]!           ; 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    stp d13, d12, [sp, #16] ; 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    stp d11, d10, [sp, #32] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #48] ; 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] ; 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] ; 8-byte Folded Spill
+; CHECK-NEXT:    cntd    x9
+; CHECK-NEXT:    stp     d13, d12, [sp, #16]             ; 16-byte Folded Spill
+; CHECK-NEXT:    stp     d11, d10, [sp, #32]             ; 16-byte Folded Spill
+; CHECK-NEXT:    stp     d9, d8, [sp, #48]               ; 16-byte Folded Spill
+; CHECK-NEXT:    stp     x29, x30, [sp, #64]             ; 16-byte Folded Spill
+; CHECK-NEXT:    str     x9, [sp, #80]                   ; 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -26,19 +25,19 @@ define void @locally_streaming_fn() #0 {
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_offset vg, -24
-; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    bl _normal_callee
+; CHECK-NEXT:    smstop  sm
+; CHECK-NEXT:    bl      _normal_callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
-; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #48] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] ; 8-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #32] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #16] ; 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 ; 16-byte Folded Reload
+; CHECK-NEXT:    smstop  sm
+; CHECK-NEXT:    ldp     x29, x30, [sp, #64]             ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp     d9, d8, [sp, #48]               ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp     d11, d10, [sp, #32]             ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp     d13, d12, [sp, #16]             ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp     d15, d14, [sp], #96             ; 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10

diff  --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 759f3ee609e58..e1bfdddaba923 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -17,8 +17,6 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
 ; CHECK-FISEL-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-FISEL-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-FISEL-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-FISEL-NEXT:    cntd x9
-; CHECK-FISEL-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; CHECK-FISEL-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-FISEL-NEXT:    smstart sm
 ; CHECK-FISEL-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -45,8 +43,6 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
 ; CHECK-GISEL-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-GISEL-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-GISEL-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-GISEL-NEXT:    cntd x9
-; CHECK-GISEL-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; CHECK-GISEL-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-GISEL-NEXT:    smstart sm
 ; CHECK-GISEL-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -80,8 +76,6 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -108,17 +102,12 @@ entry:
 define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" {
 ; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee:
 ; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    sub sp, sp, #128
+; CHECK-COMMON-NEXT:    sub sp, sp, #112
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    rdsvl x9, #1
-; CHECK-COMMON-NEXT:    lsr x9, x9, #3
-; CHECK-COMMON-NEXT:    str x9, [sp, #104] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    str x9, [sp, #112] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str d0, [sp, #24] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    ldr d0, [sp, #24] // 8-byte Folded Reload
@@ -140,7 +129,7 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli
 ; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    add sp, sp, #128
+; CHECK-COMMON-NEXT:    add sp, sp, #112
 ; CHECK-COMMON-NEXT:    ret
   %call = call double  @normal_callee(double %x);
   %add = fadd double %call, 4.200000e+01
@@ -177,16 +166,11 @@ define double @normal_caller_to_locally_streaming_callee(double %x) nounwind noi
 define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noinline optnone "aarch64_pstate_sm_body" {
 ; CHECK-COMMON-LABEL: locally_streaming_caller_streaming_callee_ptr:
 ; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    rdsvl x9, #1
-; CHECK-COMMON-NEXT:    lsr x9, x9, #3
-; CHECK-COMMON-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    blr x0
 ; CHECK-COMMON-NEXT:    smstop sm
@@ -194,7 +178,7 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin
 ; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
  call void %p() "aarch64_pstate_sm_enabled"
   ret void
@@ -208,8 +192,6 @@ define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:    blr x0
 ; CHECK-COMMON-NEXT:    smstop sm
@@ -339,22 +321,21 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
 ; CHECK-COMMON-LABEL: f128_call_sm:
 ; CHECK-COMMON:       // %bb.0:
 ; CHECK-COMMON-NEXT:    sub sp, sp, #112
-; CHECK-COMMON-NEXT:    cntd x9
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldp q0, q1, [sp] // 32-byte Folded Reload
 ; CHECK-COMMON-NEXT:    bl __addtf3
 ; CHECK-COMMON-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
-; CHECK-COMMON-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    add sp, sp, #112
@@ -403,22 +384,21 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
 ; CHECK-COMMON-LABEL: frem_call_sm:
 ; CHECK-COMMON:       // %bb.0:
 ; CHECK-COMMON-NEXT:    sub sp, sp, #96
-; CHECK-COMMON-NEXT:    cntd x9
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstop sm
 ; CHECK-COMMON-NEXT:    ldp s0, s1, [sp, #8] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    bl fmodf
 ; CHECK-COMMON-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-COMMON-NEXT:    smstart sm
-; CHECK-COMMON-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    add sp, sp, #96
@@ -431,14 +411,12 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
 define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-COMMON-LABEL: frem_call_sm_compat:
 ; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    sub sp, sp, #112
-; CHECK-COMMON-NEXT:    cntd x9
+; CHECK-COMMON-NEXT:    sub sp, sp, #96
 ; CHECK-COMMON-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    bl __arm_sme_state
 ; CHECK-COMMON-NEXT:    mov x19, x0
@@ -453,14 +431,13 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
 ; CHECK-COMMON-NEXT:  // %bb.3:
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:  .LBB12_4:
+; CHECK-COMMON-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    add sp, sp, #112
+; CHECK-COMMON-NEXT:    add sp, sp, #96
 ; CHECK-COMMON-NEXT:    ret
   %res = frem float %a, %b
   ret float %res

diff  --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index e623d3fb075f7..c57cb8e0873d0 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -213,15 +213,13 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -253,25 +251,23 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:  .LBB3_6:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    sub sp, x29, #64
-; CHECK-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: test_lazy_save_and_conditional_smstart:
 ; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    cntd x9
+; CHECK-NEWLOWERING-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    add x29, sp, #64
-; CHECK-NEWLOWERING-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
 ; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
 ; CHECK-NEWLOWERING-NEXT:    mov x9, sp
@@ -300,12 +296,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEWLOWERING-NEXT:  .LBB3_6:
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
-; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    ret
   call void @private_za_callee()
   ret void

diff  --git a/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll b/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll
index 69f603458670c..2e198ad8f0d05 100644
--- a/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll
+++ b/llvm/test/CodeGen/AArch64/sme-must-save-lr-for-vg.ll
@@ -11,14 +11,12 @@ define void @foo() "aarch64_pstate_sm_body" {
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    str x9, [sp, #72] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_get_current_vg
 ; CHECK-NEXT:    str x0, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -29,13 +27,15 @@ define void @foo() "aarch64_pstate_sm_body" {
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10

diff  --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index ff5b7c047eaf5..ab7c661d27187 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -11,11 +11,10 @@ define void @test0(ptr %callee) nounwind {
 ; CHECK-LABEL: test0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl callee_sm
 ; CHECK-NEXT:    bl callee_sm
@@ -36,11 +35,10 @@ define void @test1() nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    bl callee
@@ -60,13 +58,11 @@ define void @test1() nounwind "aarch64_pstate_sm_enabled" {
 define void @test2() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB2_2
@@ -79,12 +75,11 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee()
   call void @callee()
@@ -95,13 +90,11 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" {
 define void @test3() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: test3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB3_2
@@ -131,12 +124,11 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:  // %bb.11:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB3_12:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee_sm()
   call void @callee()
@@ -149,11 +141,10 @@ define void @test4() nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    fmov s0, wzr
 ; CHECK-NEXT:    bl callee_farg
@@ -176,12 +167,11 @@ define void @test5(float %f) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test5:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
@@ -205,12 +195,11 @@ define float @test6(float %f) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test6:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
@@ -218,10 +207,10 @@ define float @test6(float %f) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-NEXT:    bl callee_farg_fret
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
@@ -264,11 +253,10 @@ define void @test8() nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
@@ -289,11 +277,98 @@ define void @test8() nounwind "aarch64_pstate_sm_enabled" {
 define void @test9() "aarch64_pstate_sm_body" {
 ; CHECK-LABEL: test9:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @callee()
+  ret void
+}
+
+; test that if the 'smstart' and 'smtop' are entirely removed in a locally
+; streaming function, we use the FP, not an expression to describe the CFA.
+define aarch64_sve_vector_pcs void @test9_1() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: test9_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    addsvl sp, sp, #-18
+; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w28, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d8 @ cfa - 8 * VG - 32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d9 @ cfa - 16 * VG - 32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d10 @ cfa - 24 * VG - 32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d11 @ cfa - 32 * VG - 32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d12 @ cfa - 40 * VG - 32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d13 @ cfa - 48 * VG - 32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d14 @ cfa - 56 * VG - 32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d15 @ cfa - 64 * VG - 32
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addsvl sp, sp, #18
+; CHECK-NEXT:    ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee()
   ret void
@@ -307,16 +382,15 @@ define void @test10() "aarch64_pstate_sm_body" {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -327,18 +401,18 @@ define void @test10() "aarch64_pstate_sm_body" {
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    bl callee_sm
-; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -359,13 +433,11 @@ define void @test10() "aarch64_pstate_sm_body" {
 define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test11:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
@@ -375,12 +447,11 @@ define void @test11(ptr %p) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee()
   store <vscale x 16 x i8> zeroinitializer, ptr %p
@@ -396,16 +467,15 @@ define void @test12() "aarch64_pstate_sm_body" {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -416,20 +486,20 @@ define void @test12() "aarch64_pstate_sm_body" {
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    smstop za
-; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -452,12 +522,11 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-LABEL: test13:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    mov z0.s, #0 // =0x0
 ; CHECK-NEXT:    mov x19, x0
@@ -475,8 +544,8 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload

diff  --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index 5ea5e3e7766e8..b947c943ba448 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -16,12 +16,11 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    mov x19, x1
@@ -32,8 +31,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -49,12 +48,11 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    mov x19, x1
@@ -65,8 +63,8 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -82,12 +80,11 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    mov x19, x1
@@ -98,8 +95,8 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -115,12 +112,11 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    mov x19, x1
@@ -131,8 +127,8 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -148,12 +144,11 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
@@ -171,8 +166,8 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -188,12 +183,11 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
@@ -211,8 +205,8 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -228,12 +222,11 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -251,8 +244,8 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -273,12 +266,11 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -296,8 +288,8 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -314,12 +306,11 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -337,8 +328,8 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -355,12 +346,11 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -378,8 +368,8 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -396,12 +386,11 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -419,8 +408,8 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -437,12 +426,11 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
@@ -460,8 +448,8 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -478,12 +466,11 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -501,8 +488,8 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -519,12 +506,11 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v1f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -542,8 +528,8 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -564,12 +550,11 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -587,8 +572,8 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -604,12 +589,11 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -627,8 +611,8 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -644,12 +628,11 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v4i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -667,8 +650,8 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -684,12 +667,11 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -707,8 +689,8 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -724,12 +706,11 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v8f16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -747,8 +728,8 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -764,12 +745,11 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v8bf16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -787,8 +767,8 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -804,12 +784,11 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v4f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -827,8 +806,8 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -844,12 +823,11 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
@@ -867,8 +845,8 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -887,12 +865,11 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_arg_v8i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
@@ -901,10 +878,10 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    add x8, sp, #16
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    and z1.b, z1.b, #0x1
 ; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
 ; CHECK-NEXT:    str p0, [x8, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    bl use_v8i1
@@ -914,8 +891,8 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
 ; CHECK-NEXT:    str p0, [x19]
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -934,25 +911,22 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
 define void @dont_coalesce_res_i8(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_i8
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call i8 @get_i8()
   %vec = insertelement <vscale x 16 x i8> poison, i8 %res, i32 0
@@ -963,25 +937,22 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 {
 define void @dont_coalesce_res_i16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_i16
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call i16 @get_i16()
   %vec = insertelement <vscale x 8 x i16> poison, i16 %res, i32 0
@@ -992,25 +963,22 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 {
 define void @dont_coalesce_res_i32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_i32
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    fmov s0, w0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call i32 @get_i32()
   %vec = insertelement <vscale x 4 x i32> poison, i32 %res, i32 0
@@ -1021,25 +989,22 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 {
 define void @dont_coalesce_res_i64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_i64
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    fmov d0, x0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call i64 @get_i64()
   %vec = insertelement <vscale x 2 x i64> poison, i64 %res, i32 0
@@ -1050,29 +1015,26 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 {
 define void @dont_coalesce_res_f16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_f16
 ; CHECK-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr h0, [sp, #14] // 2-byte Folded Reload
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call half @get_f16()
   %vec = insertelement <vscale x 8 x half> poison, half %res, i32 0
@@ -1083,14 +1045,12 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 {
 define void @dont_coalesce_res_f32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_f32
@@ -1098,13 +1058,12 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call float @get_f32()
   %vec = insertelement <vscale x 4 x float> poison, float %res, i32 0
@@ -1115,14 +1074,12 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
 define void @dont_coalesce_res_f64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_f64
@@ -1130,13 +1087,12 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call double @get_f64()
   %vec = insertelement <vscale x 2 x double> poison, double %res, i32 0
@@ -1151,14 +1107,12 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
 define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1i8
@@ -1166,13 +1120,12 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <1 x i8> @get_v1i8()
   %elt = extractelement <1 x i8> %res, i32 0
@@ -1184,14 +1137,12 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
 define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1i16
@@ -1199,13 +1150,12 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <1 x i16> @get_v1i16()
   %elt = extractelement <1 x i16> %res, i32 0
@@ -1217,14 +1167,12 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
 define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1i32
@@ -1232,13 +1180,12 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <1 x i32> @get_v1i32()
   %elt = extractelement <1 x i32> %res, i32 0
@@ -1250,14 +1197,12 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
 define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1i64
@@ -1265,13 +1210,12 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <1 x i64> @get_v1i64()
   %elt = extractelement <1 x i64> %res, i32 0
@@ -1283,29 +1227,26 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
 define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1f16
 ; CHECK-NEXT:    str h0, [sp, #14] // 2-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr h0, [sp, #14] // 2-byte Folded Reload
-; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <1 x half> @get_v1f16()
   %elt = extractelement <1 x half> %res, i32 0
@@ -1317,14 +1258,12 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
 define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1f32
@@ -1332,13 +1271,12 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <1 x float> @get_v1f32()
   %elt = extractelement <1 x float> %res, i32 0
@@ -1350,14 +1288,12 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
 define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v1f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v1f64
@@ -1365,13 +1301,12 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr d0, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    str z0, [x19]
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <1 x double> @get_v1f64()
   %elt = extractelement <1 x double> %res, i32 0
@@ -1387,29 +1322,26 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
 define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v16i8
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <16 x i8> @get_v16i8()
   %vec = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> %res, i64 0)
@@ -1420,29 +1352,26 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
 define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v8i16
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <8 x i16> @get_v8i16()
   %vec = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> %res, i64 0)
@@ -1453,29 +1382,26 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
 define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v4i32
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <4 x i32> @get_v4i32()
   %vec = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> %res, i64 0)
@@ -1486,29 +1412,26 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
 define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v2i64
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <2 x i64> @get_v2i64()
   %vec = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %res, i64 0)
@@ -1519,29 +1442,26 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
 define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v8f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v8f16
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <8 x half> @get_v8f16()
   %vec = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %res, i64 0)
@@ -1552,29 +1472,26 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
 define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v4f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v4f32
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <4 x float> @get_v4f32()
   %vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %res, i64 0)
@@ -1585,29 +1502,26 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
 define void @dont_coalesce_res_v2f64(ptr %ptr) #0 {
 ; CHECK-LABEL: dont_coalesce_res_v2f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl get_v2f64
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
 ; CHECK-NEXT:    str z0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #96] // 8-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <2 x double> @get_v2f64()
   %vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %res, i64 0)

diff  --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index 52078941aa745..39ea180e7ed81 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -8,15 +8,11 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
 define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: sm_body_sm_compatible_simple:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    tbnz w0, #0, .LBB0_2
 ; CHECK-NEXT:  // %bb.1:
@@ -31,7 +27,7 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   ret float zeroinitializer
 }
@@ -39,15 +35,11 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
 define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB1_2
@@ -61,12 +53,11 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @normal_callee()
   ret void
@@ -76,16 +67,12 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
 define void @streaming_body_and_streaming_compatible_interface_multi_basic_block(i32 noundef %x) "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: streaming_body_and_streaming_compatible_interface_multi_basic_block:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov w8, w0
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB2_2
@@ -99,12 +86,11 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
 ; CHECK-NEXT:  // %bb.4: // %if.else
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB2_5: // %if.else
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_6: // %if.then
 ; CHECK-NEXT:    smstop sm
@@ -114,12 +100,11 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
 ; CHECK-NEXT:  // %bb.7: // %if.then
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB2_8: // %if.then
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   %cmp = icmp eq i32 %x, 0

diff  --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
index dd336e0f2e686..a3ec2ddb2b872 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -8,15 +8,11 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
 define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: locally_streaming_caller_streaming_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_compatible_callee
 ; CHECK-NEXT:    bl streaming_compatible_callee
@@ -25,7 +21,7 @@ define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body
 ; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 
   call void @streaming_compatible_callee();
@@ -51,33 +47,26 @@ define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_
 define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: locally_streaming_multiple_exit:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    str x9, [sp, #-80]! // 8-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    cmp x0, #1
 ; CHECK-NEXT:    b.ne .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %if.else
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_2: // %if.end
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 
 entry:
@@ -98,16 +87,11 @@ if.end:
 define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: locally_streaming_caller_no_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    index z0.d, #0, #1
@@ -118,12 +102,12 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
 
   %add = add <2 x i64> %a, <i64 41, i64 42>;
@@ -155,16 +139,12 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate
 define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    sub sp, sp, #96
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -177,7 +157,7 @@ define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i
 ; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #112
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
   %res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
   ret <2 x i64> %res;
@@ -188,16 +168,12 @@ declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_
 define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #128
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    sub sp, sp, #112
 ; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #112] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
 ; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -210,7 +186,7 @@ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct
 ; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #128
+; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    ret
   %v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
   %res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible"
@@ -224,16 +200,11 @@ declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<
 define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" {
 ; CHECK-LABEL: locally_streaming_caller_alloca:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
 ; CHECK-NEXT:    addsvl sp, sp, #-1
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    mov x0, sp
@@ -244,7 +215,7 @@ define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body"
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %alloca = alloca <vscale x 4 x i32>
   call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible"
@@ -271,16 +242,11 @@ declare double @llvm.cos.f64(double)
 define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" {
 ; CHECK-LABEL: test_arg_survives_loop:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    rdsvl x9, #1
-; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
-; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #24] // 8-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB9_1: // %for.body
@@ -293,12 +259,12 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat
 ; CHECK-NEXT:    fadd s0, s1, s0
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
 ; CHECK-NEXT:    ret
 entry:
   br label %for.body
@@ -318,15 +284,11 @@ for.cond.cleanup:
 define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: disable_tailcallopt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_compatible_callee
 ; CHECK-NEXT:    smstop sm
@@ -334,7 +296,7 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
 ; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   tail call void @streaming_compatible_callee();
   ret void;

diff  --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 636c3ece9d411..ff4f36363edcf 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -36,13 +36,11 @@ define void @normal_caller_streaming_compatible_callee() nounwind {
 define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: streaming_compatible_caller_normal_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB1_2
@@ -54,12 +52,11 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB1_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 
   call void @normal_callee();
@@ -75,13 +72,11 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
 define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: streaming_compatible_caller_streaming_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB2_2
@@ -93,12 +88,11 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB2_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 
   call void @streaming_callee();
@@ -130,12 +124,11 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
 ; CHECK-LABEL: streaming_compatible_with_neon_vectors:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    add x8, sp, #16
@@ -167,8 +160,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -183,9 +176,8 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>)
 define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: streaming_compatible_with_scalable_vectors:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-18
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -263,8 +255,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
-; CHECK-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg)
   %fadd = fadd <vscale x 2 x double> %res, %arg
@@ -276,9 +268,8 @@ declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x doub
 define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: streaming_compatible_with_predicate_vectors:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-18
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -356,8 +347,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
-; CHECK-NEXT:    ldr x19, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
   %and = and <vscale x 2 x i1> %res, %arg
@@ -369,13 +360,11 @@ declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>)
 define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: conditional_smstart_unreachable_block:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB7_2
@@ -383,10 +372,6 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB7_2:
 ; CHECK-NEXT:    bl streaming_callee
-; CHECK-NEXT:    tbnz w19, #0, .LBB7_4
-; CHECK-NEXT:  // %bb.3:
-; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:  .LBB7_4:
   call void @streaming_callee()
   unreachable
 }
@@ -394,16 +379,14 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
 define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: conditional_smstart_no_successor_block:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    tbz w8, #0, .LBB8_6
+; CHECK-NEXT:    tbz w8, #0, .LBB8_5
 ; CHECK-NEXT:  // %bb.1: // %if.then
 ; CHECK-NEXT:    tbnz w0, #0, .LBB8_3
 ; CHECK-NEXT:  // %bb.2: // %if.then
@@ -414,14 +397,12 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
 ; CHECK-NEXT:    tbnz w19, #0, .LBB8_5
 ; CHECK-NEXT:  // %bb.4: // %if.then
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:  .LBB8_5: // %if.then
-; CHECK-NEXT:  .LBB8_6: // %exit
+; CHECK-NEXT:  .LBB8_5: // %exit
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   br i1 %p, label %if.then, label %exit
 
@@ -436,13 +417,11 @@ exit:
 define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: disable_tailcallopt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB9_2
@@ -454,12 +433,11 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB9_4:
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 
   tail call void @normal_callee();
@@ -476,10 +454,12 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    stp x29, x30, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x9, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -494,7 +474,6 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    tbz w19, #0, .LBB10_2
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    smstop sm
@@ -508,17 +487,18 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK-NEXT:  // %bb.3: // %entry
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB10_4: // %entry
-; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x29, x30, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #120] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #112] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #96] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #128
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10

diff  --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 438b941198449..8c4d57e244e03 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -22,11 +22,10 @@ define void @normal_caller_streaming_callee() nounwind {
 ; CHECK-LABEL: normal_caller_streaming_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
@@ -48,11 +47,10 @@ define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enable
 ; CHECK-LABEL: streaming_caller_normal_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl normal_callee
 ; CHECK-NEXT:    smstart sm
@@ -105,11 +103,10 @@ define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
 ; CHECK-LABEL: call_to_function_pointer_streaming_enabled:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    blr x0
 ; CHECK-NEXT:    smstop sm
@@ -128,20 +125,19 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
 ; CHECK-LABEL: smstart_clobber_simdfp:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
@@ -154,9 +150,7 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
 define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
 ; CHECK-LABEL: smstart_clobber_sve:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-18
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -222,7 +216,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @streaming_callee()
   ret <vscale x 4 x i32> %x;
@@ -233,9 +227,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
 define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind {
 ; CHECK-LABEL: smstart_clobber_sve_duplicate:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-18
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -302,7 +294,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @streaming_callee()
   call void @streaming_callee()
@@ -314,12 +306,11 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
 ; CHECK-LABEL: call_to_intrinsic_without_chain:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    stp d0, d0, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr d0, [sp] // 8-byte Folded Reload
@@ -327,11 +318,11 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
 ; CHECK-NEXT:    str d0, [sp] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    ldp d1, d0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    fadd d0, d1, d0
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    fadd d0, d1, d0
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
@@ -349,11 +340,10 @@ define void @disable_tailcallopt() nounwind {
 ; CHECK-LABEL: disable_tailcallopt:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    smstop sm
@@ -370,13 +360,11 @@ define void @disable_tailcallopt() nounwind {
 define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 {
 ; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    addvl sp, sp, #-3
 ; CHECK-NEXT:    rdsvl x3, #1
 ; CHECK-NEXT:    addvl x0, sp, #2
@@ -392,7 +380,7 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   %Data1 = alloca <vscale x 16 x i8>, align 16
@@ -409,12 +397,11 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK-LABEL: call_to_non_streaming_pass_args:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sub sp, sp, #112
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #96] // 8-byte Folded Spill
 ; CHECK-NEXT:    stp d2, d3, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstop sm

diff  --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changes-unwindinfo.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changes-unwindinfo.ll
new file mode 100644
index 0000000000000..991776f11ae40
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changes-unwindinfo.ll
@@ -0,0 +1,308 @@
+; DEFINE: %{compile} = llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sme -mattr=+sve -verify-machineinstrs -enable-aarch64-sme-peephole-opt=false < %s
+; RUN: %{compile} | FileCheck %s
+; RUN: %{compile} -filetype=obj -o %t
+; RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO
+
+; This tests that functions with streaming mode changes use explicitly use the
+; "IncomingVG" (the value of VG on entry to the function) in SVE unwind information.
+;
+; [ ] N  ->  S    (Normal -> Streaming, mode change)
+; [ ] S  ->  N    (Streaming -> Normal, mode change)
+; [ ] N  ->  N    (Normal -> Normal, no mode change)
+; [ ] S  ->  S    (Streaming -> Streaming, no mode change)
+; [ ] LS ->  S    (Locally-streaming -> Streaming, mode change)
+; [ ] SC ->  S    (Streaming-compatible -> Streaming, mode change)
+
+declare void @normal_callee()
+declare void @streaming_callee() "aarch64_pstate_sm_enabled"
+
+; [x] N  ->  S
+; [ ] S  ->  N
+; [ ] N  ->  N
+; [ ] S  ->  S
+; [ ] LS ->  S
+; [ ] SC ->  S
+define aarch64_sve_vector_pcs void @normal_caller_streaming_callee() {
+; CHECK-LABEL: normal_caller_streaming_callee:
+; CHECK:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK:    .cfi_def_cfa_offset 32
+; CHECK:    cntd x9
+; CHECK:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK:    mov x29, sp
+; CHECK:    .cfi_def_cfa w29, 32
+; CHECK:    .cfi_offset vg, -16
+; CHECK:    addvl sp, sp, #-18
+; CHECK:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK:    .cfi_escape 0x10, 0x48, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d8 @ cfa - 8 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x49, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d9 @ cfa - 16 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4a, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d10 @ cfa - 24 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4b, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d11 @ cfa - 32 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4c, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d12 @ cfa - 40 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4d, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d13 @ cfa - 48 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4e, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d14 @ cfa - 56 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4f, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d15 @ cfa - 64 * IncomingVG - 32
+; CHECK:    smstart sm
+; CHECK:    bl streaming_callee
+; CHECK:    smstop sm
+;
+; UNWINDINFO:      DW_CFA_def_cfa: reg29 +32
+; UNWINDINFO:      DW_CFA_offset: reg46 -16
+; UNWINDINFO:      DW_CFA_expression: reg72 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -8, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -40, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -48, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -56, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -64, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+  call void @streaming_callee()
+  ret void
+}
+
+; [ ] N  ->  S
+; [x] S  ->  N
+; [ ] N  ->  N
+; [ ] S  ->  S
+; [ ] LS ->  S
+; [ ] SC ->  S
+define aarch64_sve_vector_pcs void @streaming_caller_normal_callee() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: streaming_caller_normal_callee:
+; CHECK:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK:    .cfi_def_cfa_offset 32
+; CHECK:    cntd x9
+; CHECK:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK:    mov x29, sp
+; CHECK:    .cfi_def_cfa w29, 32
+; CHECK:    .cfi_offset vg, -16
+; CHECK:    addvl sp, sp, #-18
+; CHECK:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK:    .cfi_escape 0x10, 0x48, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d8 @ cfa - 8 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x49, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d9 @ cfa - 16 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4a, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d10 @ cfa - 24 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4b, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d11 @ cfa - 32 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4c, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d12 @ cfa - 40 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4d, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d13 @ cfa - 48 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4e, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d14 @ cfa - 56 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4f, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d15 @ cfa - 64 * IncomingVG - 32
+; CHECK:    smstop sm
+; CHECK:    bl normal_callee
+; CHECK:    smstart sm
+;
+; UNWINDINFO:      DW_CFA_def_cfa: reg29 +32
+; UNWINDINFO:      DW_CFA_offset: reg46 -16
+; UNWINDINFO:      DW_CFA_expression: reg72 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -8, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -40, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -48, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -56, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -64, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+  call void @normal_callee()
+  ret void
+}
+
+; [ ] N  ->  S
+; [ ] S  ->  N
+; [x] N  ->  N
+; [ ] S  ->  S
+; [ ] LS ->  S
+; [ ] SC ->  S
+define aarch64_sve_vector_pcs void @normal_caller_normal_callee() {
+; CHECK-LABEL: normal_caller_normal_callee:
+; CHECK:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK:    addvl sp, sp, #-18
+; CHECK:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 16 + 144 * VG
+; CHECK:    .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4a, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x40, 0x1c // $d10 @ cfa - 24 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4b, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x40, 0x1c // $d11 @ cfa - 32 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4c, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x40, 0x1c // $d12 @ cfa - 40 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4d, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x40, 0x1c // $d13 @ cfa - 48 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
+; CHECK:    bl normal_callee
+;
+; UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_consts +144, DW_OP_mul, DW_OP_plus
+; UNWINDINFO:      DW_CFA_expression: reg72 DW_OP_bregx 0x2e +0, DW_OP_consts -8, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_bregx 0x2e +0, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_bregx 0x2e +0, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_bregx 0x2e +0, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_bregx 0x2e +0, DW_OP_consts -40, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_bregx 0x2e +0, DW_OP_consts -48, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_bregx 0x2e +0, DW_OP_consts -56, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_bregx 0x2e +0, DW_OP_consts -64, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+  call void @normal_callee()
+  ret void
+}
+
+; [ ] N  ->  S
+; [ ] S  ->  N
+; [ ] N  ->  N
+; [x] S  ->  S
+; [ ] LS ->  S
+; [ ] SC ->  S
+define aarch64_sve_vector_pcs void @streaming_caller_streaming_callee() "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: streaming_caller_streaming_callee:
+; CHECK:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK:    addvl sp, sp, #-18
+; CHECK:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 16 + 144 * VG
+; CHECK:    .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4a, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x40, 0x1c // $d10 @ cfa - 24 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4b, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x40, 0x1c // $d11 @ cfa - 32 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4c, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x40, 0x1c // $d12 @ cfa - 40 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4d, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x40, 0x1c // $d13 @ cfa - 48 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16
+; CHECK:    .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16
+; CHECK:    bl streaming_callee
+;
+; UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_consts +144, DW_OP_mul, DW_OP_plus
+; UNWINDINFO:      DW_CFA_expression: reg72 DW_OP_bregx 0x2e +0, DW_OP_consts -8, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_bregx 0x2e +0, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_bregx 0x2e +0, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_bregx 0x2e +0, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_bregx 0x2e +0, DW_OP_consts -40, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_bregx 0x2e +0, DW_OP_consts -48, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_bregx 0x2e +0, DW_OP_consts -56, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_bregx 0x2e +0, DW_OP_consts -64, DW_OP_mul, DW_OP_plus, DW_OP_lit16, DW_OP_minus
+  call void @streaming_callee()
+  ret void
+}
+
+; [ ] N  ->  S
+; [ ] S  ->  N
+; [ ] N  ->  N
+; [ ] S  ->  S
+; [x] LS ->  S
+; [ ] SC ->  S
+define aarch64_sve_vector_pcs void @locally_streaming() "aarch64_pstate_sm_body" {
+; CHECK-LABEL: locally_streaming:
+; CHECK:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK:    .cfi_def_cfa_offset 32
+; CHECK:    cntd x9
+; CHECK:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK:    mov x29, sp
+; CHECK:    .cfi_def_cfa w29, 32
+; CHECK:    .cfi_offset vg, -16
+; CHECK:    addsvl sp, sp, #-18
+; CHECK:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK:    .cfi_escape 0x10, 0x48, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d8 @ cfa - 8 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x49, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d9 @ cfa - 16 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4a, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d10 @ cfa - 24 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4b, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d11 @ cfa - 32 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4c, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d12 @ cfa - 40 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4d, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d13 @ cfa - 48 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4e, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d14 @ cfa - 56 * IncomingVG - 32
+; CHECK:    .cfi_escape 0x10, 0x4f, 0x0b, 0x12, 0x40, 0x1c, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d15 @ cfa - 64 * IncomingVG - 32
+; CHECK:    smstart sm
+; CHECK:    bl streaming_callee
+; CHECK:    smstop sm
+;
+; UNWINDINFO:      DW_CFA_def_cfa: reg29 +32
+; UNWINDINFO:      DW_CFA_offset: reg46 -16
+; UNWINDINFO:      DW_CFA_expression: reg72 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -8, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -40, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -48, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -56, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_dup, DW_OP_lit16, DW_OP_minus, DW_OP_deref, DW_OP_consts -64, DW_OP_mul, DW_OP_plus, DW_OP_consts -32, DW_OP_plus
+  call void @streaming_callee()
+  ret void
+}
+
+; [ ] N  ->  S
+; [ ] S  ->  N
+; [ ] N  ->  N
+; [ ] S  ->  S
+; [ ] LS ->  S
+; [x] SC ->  S
+define aarch64_sve_vector_pcs void @streaming_compatible_caller_conditional_mode_switch() "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: streaming_compatible_caller_conditional_mode_switch:
+; CHECK:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK:    .cfi_def_cfa_offset 48
+; CHECK:    cntd x9
+; CHECK:    stp x28, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK:    mov x29, sp
+; CHECK:    .cfi_def_cfa w29, 48
+; CHECK:    .cfi_offset vg, -32
+; CHECK:    addvl sp, sp, #-18
+; CHECK:    str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK:    str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK:    str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK:    str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK:    str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK:    .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * IncomingVG - 48
+; CHECK:    .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * IncomingVG - 48
+; CHECK:    .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * IncomingVG - 48
+; CHECK:    .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * IncomingVG - 48
+; CHECK:    .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * IncomingVG - 48
+; CHECK:    .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * IncomingVG - 48
+; CHECK:    .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * IncomingVG - 48
+; CHECK:    .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * IncomingVG - 48
+; CHECK:    bl __arm_sme_state
+; CHECK:    mov x19, x0
+; CHECK:    tbnz w19, #0, .LBB5_2
+; CHECK:    smstart sm
+; CHECK:  .LBB5_2:
+; CHECK:    bl streaming_callee
+; CHECK:    tbnz w19, #0, .LBB5_4
+; CHECK:    smstop sm
+; CHECK:  .LBB5_4:
+;
+; UNWINDINFO:      DW_CFA_def_cfa: reg29 +48
+; UNWINDINFO:      DW_CFA_offset: reg46 -32
+; UNWINDINFO:      DW_CFA_expression: reg72 DW_OP_dup, DW_OP_consts -32, DW_OP_plus, DW_OP_deref, DW_OP_consts -8, DW_OP_mul, DW_OP_plus, DW_OP_consts -48, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg73 DW_OP_dup, DW_OP_consts -32, DW_OP_plus, DW_OP_deref, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -48, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg74 DW_OP_dup, DW_OP_consts -32, DW_OP_plus, DW_OP_deref, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -48, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg75 DW_OP_dup, DW_OP_consts -32, DW_OP_plus, DW_OP_deref, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -48, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg76 DW_OP_dup, DW_OP_consts -32, DW_OP_plus, DW_OP_deref, DW_OP_consts -40, DW_OP_mul, DW_OP_plus, DW_OP_consts -48, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg77 DW_OP_dup, DW_OP_consts -32, DW_OP_plus, DW_OP_deref, DW_OP_consts -48, DW_OP_mul, DW_OP_plus, DW_OP_consts -48, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg78 DW_OP_dup, DW_OP_consts -32, DW_OP_plus, DW_OP_deref, DW_OP_consts -56, DW_OP_mul, DW_OP_plus, DW_OP_consts -48, DW_OP_plus
+; UNWINDINFO-NEXT: DW_CFA_expression: reg79 DW_OP_dup, DW_OP_consts -32, DW_OP_plus, DW_OP_deref, DW_OP_consts -64, DW_OP_mul, DW_OP_plus, DW_OP_consts -48, DW_OP_plus
+  call void @streaming_callee()
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index fe3f493353b50..7efa1d8f7a6a7 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -15,12 +15,11 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
 ; CHECK-LABEL: test_no_stackslot_scavenging:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x9, x24, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x30, x24, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
@@ -32,8 +31,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x24, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x30, x24, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x29, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -48,21 +47,20 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
 define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-pointer"="all" {
 ; CHECK-LABEL: test_no_stackslot_scavenging_with_fp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-128]! // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    stp x28, x25, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x24, x19, [sp, #112] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x28, x25, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x24, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    lsl x9, x0, #3
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    mov x19, sp
-; CHECK-NEXT:    str s0, [x29, #28] // 4-byte Folded Spill
+; CHECK-NEXT:    str s0, [x19, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    add x9, x9, #15
 ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
 ; CHECK-NEXT:    sub x8, x8, x9
@@ -70,17 +68,17 @@ define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-po
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldr s0, [x29, #28] // 4-byte Folded Reload
+; CHECK-NEXT:    ldr s0, [x19, #12] // 4-byte Folded Reload
 ; CHECK-NEXT:    bl use_f
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    sub sp, x29, #64
-; CHECK-NEXT:    ldp x24, x19, [sp, #112] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x28, x25, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x24, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x25, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #128 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %ptr2 = alloca i64, i64 %n, align 8
   %ptr = alloca <vscale x 16 x i8>

diff  --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 60d8987334c89..dec8eb0d8a936 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -16,34 +16,37 @@ declare void @streaming_callee_with_arg(i32) #0;
 define void @vg_unwind_simple() #0 {
 ; CHECK-LABEL: vg_unwind_simple:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    .cfi_offset b10, -40
-; CHECK-NEXT:    .cfi_offset b11, -48
-; CHECK-NEXT:    .cfi_offset b12, -56
-; CHECK-NEXT:    .cfi_offset b13, -64
-; CHECK-NEXT:    .cfi_offset b14, -72
-; CHECK-NEXT:    .cfi_offset b15, -80
-; CHECK-NEXT:    .cfi_offset vg, -8
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    .cfi_offset b8, -40
+; CHECK-NEXT:    .cfi_offset b9, -48
+; CHECK-NEXT:    .cfi_offset b10, -56
+; CHECK-NEXT:    .cfi_offset b11, -64
+; CHECK-NEXT:    .cfi_offset b12, -72
+; CHECK-NEXT:    .cfi_offset b13, -80
+; CHECK-NEXT:    .cfi_offset b14, -88
+; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -66,6 +69,7 @@ define void @vg_unwind_simple() #0 {
 ; FP-CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -76,11 +80,9 @@ define void @vg_unwind_simple() #0 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    smstart sm
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
@@ -88,6 +90,7 @@ define void @vg_unwind_simple() #0 {
 ; FP-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    .cfi_restore b8
@@ -102,7 +105,6 @@ define void @vg_unwind_simple() #0 {
 ;
 ; OUTLINER-CHECK-LABEL: vg_unwind_simple:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
-;
   call void @callee();
   ret void;
 }
@@ -119,10 +121,12 @@ define void @vg_unwind_needs_gap() #0 {
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x20, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x9, x20, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w20, -8
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -133,20 +137,20 @@ define void @vg_unwind_needs_gap() #0 {
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x20, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x20, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w20
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -170,6 +174,7 @@ define void @vg_unwind_needs_gap() #0 {
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; FP-CHECK-NEXT:    .cfi_offset w20, -8
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -182,11 +187,9 @@ define void @vg_unwind_needs_gap() #0 {
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
 ; FP-CHECK-NEXT:    //APP
 ; FP-CHECK-NEXT:    //NO_APP
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    smstart sm
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldr x20, [sp, #88] // 8-byte Folded Reload
@@ -196,6 +199,7 @@ define void @vg_unwind_needs_gap() #0 {
 ; FP-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; FP-CHECK-NEXT:    .cfi_restore w20
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    .cfi_restore b8
@@ -219,38 +223,41 @@ define void @vg_unwind_needs_gap() #0 {
 define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ; CHECK-LABEL: vg_unwind_with_fixed_args:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #96
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    sub sp, sp, #112
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    .cfi_offset b10, -40
-; CHECK-NEXT:    .cfi_offset b11, -48
-; CHECK-NEXT:    .cfi_offset b12, -56
-; CHECK-NEXT:    .cfi_offset b13, -64
-; CHECK-NEXT:    .cfi_offset b14, -72
-; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    stp x29, x30, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    .cfi_offset b8, -40
+; CHECK-NEXT:    .cfi_offset b9, -48
+; CHECK-NEXT:    .cfi_offset b10, -56
+; CHECK-NEXT:    .cfi_offset b11, -64
+; CHECK-NEXT:    .cfi_offset b12, -72
+; CHECK-NEXT:    .cfi_offset b13, -80
+; CHECK-NEXT:    .cfi_offset b14, -88
+; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset vg, -8
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl fixed_callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x29, x30, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    add sp, sp, #112
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -274,6 +281,7 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ; FP-CHECK-NEXT:    str x9, [sp, #96] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #80
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -285,12 +293,10 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
 ; FP-CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    bl fixed_callee
 ; FP-CHECK-NEXT:    smstart sm
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 112
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #80] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -299,6 +305,7 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 ; FP-CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    add sp, sp, #112
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    .cfi_restore b8
@@ -321,15 +328,19 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
 define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-LABEL: vg_unwind_with_sve_args:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
 ; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w28, -8
-; CHECK-NEXT:    .cfi_offset w30, -24
-; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 48
+; CHECK-NEXT:    .cfi_offset w27, -8
+; CHECK-NEXT:    .cfi_offset w28, -16
+; CHECK-NEXT:    .cfi_offset vg, -32
+; CHECK-NEXT:    .cfi_offset w30, -40
+; CHECK-NEXT:    .cfi_offset w29, -48
 ; CHECK-NEXT:    addvl sp, sp, #-18
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x20, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 32 + 144 * VG
 ; CHECK-NEXT:    str p8, [sp, #11, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    ptrue pn8.b
 ; CHECK-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
@@ -352,27 +363,23 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d8 @ cfa - 8 * VG - 32
-; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d9 @ cfa - 16 * VG - 32
-; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d10 @ cfa - 24 * VG - 32
-; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d11 @ cfa - 32 * VG - 32
-; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d12 @ cfa - 40 * VG - 32
-; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d13 @ cfa - 48 * VG - 32
-; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d14 @ cfa - 56 * VG - 32
-; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x60, 0x22 // $d15 @ cfa - 64 * VG - 32
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * IncomingVG - 48
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * IncomingVG - 48
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * IncomingVG - 48
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * IncomingVG - 48
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * IncomingVG - 48
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * IncomingVG - 48
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * IncomingVG - 48
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * IncomingVG - 48
 ; CHECK-NEXT:    addvl sp, sp, #-1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x20, 0x92, 0x2e, 0x00, 0x11, 0x98, 0x01, 0x1e, 0x22 // sp + 32 + 152 * VG
-; CHECK-NEXT:    str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:    ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    bl scalable_callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    addvl sp, sp, #1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x20, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 32 + 144 * VG
 ; CHECK-NEXT:    ptrue pn8.b
 ; CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
@@ -396,7 +403,6 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    addvl sp, sp, #18
-; CHECK-NEXT:    .cfi_def_cfa wsp, 32
 ; CHECK-NEXT:    .cfi_restore z8
 ; CHECK-NEXT:    .cfi_restore z9
 ; CHECK-NEXT:    .cfi_restore z10
@@ -405,10 +411,13 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; CHECK-NEXT:    .cfi_restore z13
 ; CHECK-NEXT:    .cfi_restore z14
 ; CHECK-NEXT:    .cfi_restore z15
-; CHECK-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa wsp, 48
+; CHECK-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w27
 ; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    ret
@@ -424,6 +433,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 48
 ; FP-CHECK-NEXT:    .cfi_offset w27, -8
 ; FP-CHECK-NEXT:    .cfi_offset w28, -16
+; FP-CHECK-NEXT:    .cfi_offset vg, -32
 ; FP-CHECK-NEXT:    .cfi_offset w30, -40
 ; FP-CHECK-NEXT:    .cfi_offset w29, -48
 ; FP-CHECK-NEXT:    addvl sp, sp, #-18
@@ -449,24 +459,22 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; FP-CHECK-NEXT:    str p4, [sp, #15, mul vl] // 2-byte Folded Spill
 ; FP-CHECK-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * VG - 48
-; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * VG - 48
-; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * VG - 48
-; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * VG - 48
-; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * VG - 48
-; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * VG - 48
-; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48
-; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
+; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * IncomingVG - 48
+; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * IncomingVG - 48
+; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * IncomingVG - 48
+; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * IncomingVG - 48
+; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * IncomingVG - 48
+; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * IncomingVG - 48
+; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * IncomingVG - 48
+; FP-CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x60, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * IncomingVG - 48
 ; FP-CHECK-NEXT:    addvl sp, sp, #-1
 ; FP-CHECK-NEXT:    str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    //APP
 ; FP-CHECK-NEXT:    //NO_APP
-; FP-CHECK-NEXT:    .cfi_offset vg, -32
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    bl scalable_callee
 ; FP-CHECK-NEXT:    smstart sm
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    addvl sp, sp, #1
 ; FP-CHECK-NEXT:    ptrue pn8.b
 ; FP-CHECK-NEXT:    ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
@@ -505,6 +513,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; FP-CHECK-NEXT:    .cfi_restore w27
 ; FP-CHECK-NEXT:    .cfi_restore w28
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    ret
@@ -530,7 +539,9 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w28, -8
+; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    .cfi_offset w30, -24
 ; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
@@ -553,19 +564,20 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; CHECK-NEXT:    .cfi_def_cfa_register wsp
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    str x8, [x0]
-; CHECK-NEXT:    .cfi_offset vg, -16
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x28, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
@@ -591,6 +603,7 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; FP-CHECK-NEXT:    .cfi_offset w28, -8
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -611,11 +624,9 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; FP-CHECK-NEXT:  // %bb.2: // %entry
 ; FP-CHECK-NEXT:    mov x8, sp
 ; FP-CHECK-NEXT:    str x8, [x0]
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    smstart sm
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -626,6 +637,7 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
 ; FP-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; FP-CHECK-NEXT:    .cfi_restore w28
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    .cfi_restore b8
@@ -648,24 +660,20 @@ entry:
   ret void
 }
 
-; Locally streaming functions require storing both the streaming and
-; non-streaming values of VG.
-;
 define void @vg_locally_streaming_fn() #3 {
 ; CHECK-LABEL: vg_locally_streaming_fn:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 96
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset vg, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -676,18 +684,18 @@ define void @vg_locally_streaming_fn() #3 {
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    bl streaming_callee
-; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -702,18 +710,15 @@ define void @vg_locally_streaming_fn() #3 {
 ; FP-CHECK:       // %bb.0:
 ; FP-CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 96
-; FP-CHECK-NEXT:    rdsvl x9, #1
+; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    lsr x9, x9, #3
 ; FP-CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; FP-CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
-; FP-CHECK-NEXT:    cntd x9
 ; FP-CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT:    str x9, [sp, #88] // 8-byte Folded Spill
+; FP-CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
-; FP-CHECK-NEXT:    .cfi_offset vg, -8
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -726,9 +731,7 @@ define void @vg_locally_streaming_fn() #3 {
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    smstart sm
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    bl streaming_callee
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
@@ -738,6 +741,7 @@ define void @vg_locally_streaming_fn() #3 {
 ; FP-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    .cfi_restore b8
@@ -768,10 +772,12 @@ define void @streaming_compatible_to_streaming() #4 {
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -782,7 +788,6 @@ define void @streaming_compatible_to_streaming() #4 {
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    tbnz w19, #0, .LBB6_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstart sm
@@ -792,16 +797,17 @@ define void @streaming_compatible_to_streaming() #4 {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB6_4:
-; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -825,6 +831,7 @@ define void @streaming_compatible_to_streaming() #4 {
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; FP-CHECK-NEXT:    .cfi_offset w19, -8
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -837,7 +844,6 @@ define void @streaming_compatible_to_streaming() #4 {
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
 ; FP-CHECK-NEXT:    bl __arm_sme_state
 ; FP-CHECK-NEXT:    mov x19, x0
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    tbnz w19, #0, .LBB6_2
 ; FP-CHECK-NEXT:  // %bb.1:
 ; FP-CHECK-NEXT:    smstart sm
@@ -847,7 +853,6 @@ define void @streaming_compatible_to_streaming() #4 {
 ; FP-CHECK-NEXT:  // %bb.3:
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:  .LBB6_4:
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
@@ -857,6 +862,7 @@ define void @streaming_compatible_to_streaming() #4 {
 ; FP-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; FP-CHECK-NEXT:    .cfi_restore w19
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    .cfi_restore b8
@@ -885,10 +891,12 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w19, -16
-; CHECK-NEXT:    .cfi_offset w30, -32
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
 ; CHECK-NEXT:    .cfi_offset b8, -40
 ; CHECK-NEXT:    .cfi_offset b9, -48
 ; CHECK-NEXT:    .cfi_offset b10, -56
@@ -899,7 +907,6 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    mov x19, x0
-; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    tbz w19, #0, .LBB7_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
@@ -909,16 +916,17 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB7_4:
-; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -942,6 +950,7 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; FP-CHECK-NEXT:    .cfi_offset w19, -8
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -954,7 +963,6 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
 ; FP-CHECK-NEXT:    bl __arm_sme_state
 ; FP-CHECK-NEXT:    mov x19, x0
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    tbz w19, #0, .LBB7_2
 ; FP-CHECK-NEXT:  // %bb.1:
 ; FP-CHECK-NEXT:    smstop sm
@@ -964,7 +972,6 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; FP-CHECK-NEXT:  // %bb.3:
 ; FP-CHECK-NEXT:    smstart sm
 ; FP-CHECK-NEXT:  .LBB7_4:
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
@@ -974,6 +981,7 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; FP-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; FP-CHECK-NEXT:    .cfi_restore w19
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    .cfi_restore b8
@@ -1014,6 +1022,7 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:    add x29, sp, #64
 ; NO-SVE-CHECK-NEXT:    .cfi_def_cfa w29, 32
 ; NO-SVE-CHECK-NEXT:    .cfi_offset w19, -8
+; NO-SVE-CHECK-NEXT:    .cfi_offset vg, -16
 ; NO-SVE-CHECK-NEXT:    .cfi_offset w30, -24
 ; NO-SVE-CHECK-NEXT:    .cfi_offset w29, -32
 ; NO-SVE-CHECK-NEXT:    .cfi_offset b8, -40
@@ -1027,7 +1036,6 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:    mov w8, w0
 ; NO-SVE-CHECK-NEXT:    bl __arm_sme_state
 ; NO-SVE-CHECK-NEXT:    mov x19, x0
-; NO-SVE-CHECK-NEXT:    .cfi_offset vg, -16
 ; NO-SVE-CHECK-NEXT:    tbnz w19, #0, .LBB8_2
 ; NO-SVE-CHECK-NEXT:  // %bb.1:
 ; NO-SVE-CHECK-NEXT:    smstart sm
@@ -1038,7 +1046,6 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:  // %bb.3:
 ; NO-SVE-CHECK-NEXT:    smstop sm
 ; NO-SVE-CHECK-NEXT:  .LBB8_4:
-; NO-SVE-CHECK-NEXT:    .cfi_restore vg
 ; NO-SVE-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; NO-SVE-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; NO-SVE-CHECK-NEXT:    ldr x19, [sp, #88] // 8-byte Folded Reload
@@ -1048,6 +1055,7 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; NO-SVE-CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; NO-SVE-CHECK-NEXT:    .cfi_restore w19
+; NO-SVE-CHECK-NEXT:    .cfi_restore vg
 ; NO-SVE-CHECK-NEXT:    .cfi_restore w30
 ; NO-SVE-CHECK-NEXT:    .cfi_restore w29
 ; NO-SVE-CHECK-NEXT:    .cfi_restore b8
@@ -1073,30 +1081,29 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; user-code as if it is part of the frame-setup when doing so.
 define void @test_rdsvl_right_after_prologue(i64 %x0) nounwind {
 ; NO-SVE-CHECK-LABEL: test_rdsvl_right_after_prologue:
-; NO-SVE-CHECK:     // %bb.0:
-; NO-SVE-CHECK-NEXT: stp     d15, d14, [sp, #-96]!           // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: stp     d13, d12, [sp, #16]             // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: mov     x9, x0
-; NO-SVE-CHECK-NEXT: stp     d11, d10, [sp, #32]             // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: stp     d9, d8, [sp, #48]               // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: stp     x29, x30, [sp, #64]             // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: bl      __arm_get_current_vg
-; NO-SVE-CHECK-NEXT: str     x0, [sp, #80]                   // 8-byte Folded Spill
-; NO-SVE-CHECK-NEXT: mov     x0, x9
-; NO-SVE-CHECK-NEXT: rdsvl   x8, #1
-; NO-SVE-CHECK-NEXT: add     x29, sp, #64
-; NO-SVE-CHECK-NEXT: lsr     x8, x8, #3
-; NO-SVE-CHECK-NEXT: mov     x1, x0
-; NO-SVE-CHECK-NEXT: smstart sm
-; NO-SVE-CHECK-NEXT: mov     x0, x8
-; NO-SVE-CHECK-NEXT: bl      bar
-; NO-SVE-CHECK-NEXT: smstop  sm
-; NO-SVE-CHECK-NEXT: ldp     x29, x30, [sp, #64]             // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldp     d9, d8, [sp, #48]               // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldp     d11, d10, [sp, #32]             // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldp     d13, d12, [sp, #16]             // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldp     d15, d14, [sp], #96             // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ret
+; NO-SVE-CHECK:       // %bb.0:
+; NO-SVE-CHECK-NEXT:    sub sp, sp, #96
+; NO-SVE-CHECK-NEXT:    rdsvl x8, #1
+; NO-SVE-CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    mov x1, x0
+; NO-SVE-CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    lsr x8, x8, #3
+; NO-SVE-CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    stp x29, x30, [sp, #80] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT:    add x29, sp, #80
+; NO-SVE-CHECK-NEXT:    smstart sm
+; NO-SVE-CHECK-NEXT:    mov x0, x8
+; NO-SVE-CHECK-NEXT:    bl bar
+; NO-SVE-CHECK-NEXT:    smstop sm
+; NO-SVE-CHECK-NEXT:    ldp x29, x30, [sp, #80] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT:    add sp, sp, #96
+; NO-SVE-CHECK-NEXT:    ret
+;
   %some_alloc = alloca i64, align 8
   %rdsvl = tail call i64 @llvm.aarch64.sme.cntsd()
   call void @bar(i64 %rdsvl, i64 %x0)
@@ -1111,34 +1118,37 @@ declare void @bar(i64, i64) "aarch64_pstate_sm_enabled"
 define void @vg_unwind_noasync() #5 {
 ; CHECK-LABEL: vg_unwind_noasync:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    .cfi_offset b8, -24
-; CHECK-NEXT:    .cfi_offset b9, -32
-; CHECK-NEXT:    .cfi_offset b10, -40
-; CHECK-NEXT:    .cfi_offset b11, -48
-; CHECK-NEXT:    .cfi_offset b12, -56
-; CHECK-NEXT:    .cfi_offset b13, -64
-; CHECK-NEXT:    .cfi_offset b14, -72
-; CHECK-NEXT:    .cfi_offset b15, -80
-; CHECK-NEXT:    .cfi_offset vg, -8
+; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset vg, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    .cfi_offset b8, -40
+; CHECK-NEXT:    .cfi_offset b9, -48
+; CHECK-NEXT:    .cfi_offset b10, -56
+; CHECK-NEXT:    .cfi_offset b11, -64
+; CHECK-NEXT:    .cfi_offset b12, -72
+; CHECK-NEXT:    .cfi_offset b13, -80
+; CHECK-NEXT:    .cfi_offset b14, -88
+; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
 ; CHECK-NEXT:    .cfi_restore b9
 ; CHECK-NEXT:    .cfi_restore b10
@@ -1161,6 +1171,7 @@ define void @vg_unwind_noasync() #5 {
 ; FP-CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; FP-CHECK-NEXT:    add x29, sp, #64
 ; FP-CHECK-NEXT:    .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    .cfi_offset w30, -24
 ; FP-CHECK-NEXT:    .cfi_offset w29, -32
 ; FP-CHECK-NEXT:    .cfi_offset b8, -40
@@ -1171,11 +1182,9 @@ define void @vg_unwind_noasync() #5 {
 ; FP-CHECK-NEXT:    .cfi_offset b13, -80
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
-; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    smstop sm
 ; FP-CHECK-NEXT:    bl callee
 ; FP-CHECK-NEXT:    smstart sm
-; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_def_cfa wsp, 96
 ; FP-CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
@@ -1183,6 +1192,7 @@ define void @vg_unwind_noasync() #5 {
 ; FP-CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
 ; FP-CHECK-NEXT:    .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT:    .cfi_restore vg
 ; FP-CHECK-NEXT:    .cfi_restore w30
 ; FP-CHECK-NEXT:    .cfi_restore w29
 ; FP-CHECK-NEXT:    .cfi_restore b8
@@ -1194,6 +1204,7 @@ define void @vg_unwind_noasync() #5 {
 ; FP-CHECK-NEXT:    .cfi_restore b14
 ; FP-CHECK-NEXT:    .cfi_restore b15
 ; FP-CHECK-NEXT:    ret
+;
 ; OUTLINER-CHECK-LABEL: vg_unwind_noasync:
 ; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
 ;

diff  --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
index c67d91952c618..1de8d0a080b70 100644
--- a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll
@@ -72,12 +72,12 @@ entry:
 ; mitigated with the -aarch64-enable-zpr-predicate-spills option.
 
 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 {
-; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale]
-; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
-; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale]
+; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-64-258 * vscale] is too close to FPR stack object at [SP-64-256 * vscale]
+; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64]
+; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1088-258 * vscale] is too close to FPR stack object at [SP-1088-256 * vscale]
 ; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_call':
 ; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
-; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
+; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64]
 ; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
 ; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_call': FPR stack object at {{.*}} is too close to GPR stack object
 entry:
@@ -87,12 +87,12 @@ entry:
 }
 
 define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) #2 {
-; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale]
-; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
-; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale]
+; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-64-258 * vscale] is too close to FPR stack object at [SP-64-256 * vscale]
+; CHECK: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64]
+; CHECK-PADDING: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1088-258 * vscale] is too close to FPR stack object at [SP-1088-256 * vscale]
 ; CHECK-PADDING-NOT: remark: <unknown>:0:0: stack hazard in 'svecc_alloca_call':
 ; CHECK-ZPR-PRED-SPILLS-NOT: <unknown>:0:0: stack hazard in 'svecc_call': PPR stack object at {{.*}} is too close to FPR stack object
-; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48]
+; CHECK-ZPR-PRED-SPILLS: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-64-16 * vscale] is too close to GPR stack object at [SP-64]
 ; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_alloca_call': PPR stack object at {{.*}} is too close to FPR stack object
 ; CHECK-ZPR-PRED-SPILLS-WITH-PADDING-NOT: <unknown>:0:0: stack hazard in 'svecc_alloca_call': FPR stack object at {{.*}} is too close to GPR stack object
 entry:

diff  --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index e5849f32f67d2..c878d888b5f03 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -616,16 +616,13 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK0:       // %bb.0: // %entry
 ; CHECK0-NEXT:    sub sp, sp, #176
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 176
-; CHECK0-NEXT:    rdsvl x9, #1
-; CHECK0-NEXT:    stp d15, d14, [sp, #48] // 16-byte Folded Spill
-; CHECK0-NEXT:    lsr x9, x9, #3
-; CHECK0-NEXT:    stp d13, d12, [sp, #64] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp d11, d10, [sp, #80] // 16-byte Folded Spill
-; CHECK0-NEXT:    str x9, [sp, #32] // 8-byte Folded Spill
 ; CHECK0-NEXT:    cntd x9
-; CHECK0-NEXT:    str x9, [sp, #40] // 8-byte Folded Spill
-; CHECK0-NEXT:    stp d9, d8, [sp, #96] // 16-byte Folded Spill
-; CHECK0-NEXT:    str x25, [sp, #112] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x29, x30, [sp, #96] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x9, x25, [sp, #112] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp x24, x23, [sp, #128] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
@@ -635,16 +632,18 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK0-NEXT:    .cfi_offset w22, -32
 ; CHECK0-NEXT:    .cfi_offset w23, -40
 ; CHECK0-NEXT:    .cfi_offset w24, -48
-; CHECK0-NEXT:    .cfi_offset w25, -64
-; CHECK0-NEXT:    .cfi_offset b8, -72
-; CHECK0-NEXT:    .cfi_offset b9, -80
-; CHECK0-NEXT:    .cfi_offset b10, -88
-; CHECK0-NEXT:    .cfi_offset b11, -96
-; CHECK0-NEXT:    .cfi_offset b12, -104
-; CHECK0-NEXT:    .cfi_offset b13, -112
-; CHECK0-NEXT:    .cfi_offset b14, -120
-; CHECK0-NEXT:    .cfi_offset b15, -128
-; CHECK0-NEXT:    .cfi_offset vg, -136
+; CHECK0-NEXT:    .cfi_offset w25, -56
+; CHECK0-NEXT:    .cfi_offset vg, -64
+; CHECK0-NEXT:    .cfi_offset w30, -72
+; CHECK0-NEXT:    .cfi_offset w29, -80
+; CHECK0-NEXT:    .cfi_offset b8, -88
+; CHECK0-NEXT:    .cfi_offset b9, -96
+; CHECK0-NEXT:    .cfi_offset b10, -104
+; CHECK0-NEXT:    .cfi_offset b11, -112
+; CHECK0-NEXT:    .cfi_offset b12, -120
+; CHECK0-NEXT:    .cfi_offset b13, -128
+; CHECK0-NEXT:    .cfi_offset b14, -136
+; CHECK0-NEXT:    .cfi_offset b15, -144
 ; CHECK0-NEXT:    str d0, [sp, #8] // 8-byte Folded Spill
 ; CHECK0-NEXT:    smstart sm
 ; CHECK0-NEXT:    //APP
@@ -658,12 +657,13 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK0-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
 ; CHECK0-NEXT:    mov w0, wzr
 ; CHECK0-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldr x25, [sp, #112] // 8-byte Folded Reload
+; CHECK0-NEXT:    ldr x25, [sp, #120] // 8-byte Folded Reload
 ; CHECK0-NEXT:    ldp x24, x23, [sp, #128] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d9, d8, [sp, #96] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d11, d10, [sp, #80] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d13, d12, [sp, #64] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d15, d14, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x29, x30, [sp, #96] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp d15, d14, [sp, #32] // 16-byte Folded Reload
 ; CHECK0-NEXT:    add sp, sp, #176
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
@@ -673,6 +673,9 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK0-NEXT:    .cfi_restore w23
 ; CHECK0-NEXT:    .cfi_restore w24
 ; CHECK0-NEXT:    .cfi_restore w25
+; CHECK0-NEXT:    .cfi_restore vg
+; CHECK0-NEXT:    .cfi_restore w30
+; CHECK0-NEXT:    .cfi_restore w29
 ; CHECK0-NEXT:    .cfi_restore b8
 ; CHECK0-NEXT:    .cfi_restore b9
 ; CHECK0-NEXT:    .cfi_restore b10
@@ -687,16 +690,13 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK64:       // %bb.0: // %entry
 ; CHECK64-NEXT:    sub sp, sp, #304
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 304
-; CHECK64-NEXT:    rdsvl x9, #1
-; CHECK64-NEXT:    stp d15, d14, [sp, #112] // 16-byte Folded Spill
-; CHECK64-NEXT:    lsr x9, x9, #3
-; CHECK64-NEXT:    stp d13, d12, [sp, #128] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp d11, d10, [sp, #144] // 16-byte Folded Spill
-; CHECK64-NEXT:    str x9, [sp, #96] // 8-byte Folded Spill
 ; CHECK64-NEXT:    cntd x9
-; CHECK64-NEXT:    str x9, [sp, #104] // 8-byte Folded Spill
-; CHECK64-NEXT:    stp d9, d8, [sp, #160] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x29, x25, [sp, #240] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp d15, d14, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp d13, d12, [sp, #112] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp d11, d10, [sp, #128] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp d9, d8, [sp, #144] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x29, x30, [sp, #224] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x9, x25, [sp, #240] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x24, x23, [sp, #256] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x22, x21, [sp, #272] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x20, x19, [sp, #288] // 16-byte Folded Spill
@@ -707,16 +707,17 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK64-NEXT:    .cfi_offset w23, -40
 ; CHECK64-NEXT:    .cfi_offset w24, -48
 ; CHECK64-NEXT:    .cfi_offset w25, -56
-; CHECK64-NEXT:    .cfi_offset w29, -64
-; CHECK64-NEXT:    .cfi_offset b8, -136
-; CHECK64-NEXT:    .cfi_offset b9, -144
-; CHECK64-NEXT:    .cfi_offset b10, -152
-; CHECK64-NEXT:    .cfi_offset b11, -160
-; CHECK64-NEXT:    .cfi_offset b12, -168
-; CHECK64-NEXT:    .cfi_offset b13, -176
-; CHECK64-NEXT:    .cfi_offset b14, -184
-; CHECK64-NEXT:    .cfi_offset b15, -192
-; CHECK64-NEXT:    .cfi_offset vg, -200
+; CHECK64-NEXT:    .cfi_offset vg, -64
+; CHECK64-NEXT:    .cfi_offset w30, -72
+; CHECK64-NEXT:    .cfi_offset w29, -80
+; CHECK64-NEXT:    .cfi_offset b8, -152
+; CHECK64-NEXT:    .cfi_offset b9, -160
+; CHECK64-NEXT:    .cfi_offset b10, -168
+; CHECK64-NEXT:    .cfi_offset b11, -176
+; CHECK64-NEXT:    .cfi_offset b12, -184
+; CHECK64-NEXT:    .cfi_offset b13, -192
+; CHECK64-NEXT:    .cfi_offset b14, -200
+; CHECK64-NEXT:    .cfi_offset b15, -208
 ; CHECK64-NEXT:    str d0, [sp, #80] // 8-byte Folded Spill
 ; CHECK64-NEXT:    smstart sm
 ; CHECK64-NEXT:    //APP
@@ -730,12 +731,13 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK64-NEXT:    ldp x20, x19, [sp, #288] // 16-byte Folded Reload
 ; CHECK64-NEXT:    mov w0, wzr
 ; CHECK64-NEXT:    ldp x22, x21, [sp, #272] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr x25, [sp, #248] // 8-byte Folded Reload
 ; CHECK64-NEXT:    ldp x24, x23, [sp, #256] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp x29, x25, [sp, #240] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d9, d8, [sp, #160] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d11, d10, [sp, #144] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d13, d12, [sp, #128] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d15, d14, [sp, #112] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x29, x30, [sp, #224] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp d9, d8, [sp, #144] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp d11, d10, [sp, #128] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp d13, d12, [sp, #112] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp d15, d14, [sp, #96] // 16-byte Folded Reload
 ; CHECK64-NEXT:    add sp, sp, #304
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK64-NEXT:    .cfi_restore w19
@@ -745,6 +747,8 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK64-NEXT:    .cfi_restore w23
 ; CHECK64-NEXT:    .cfi_restore w24
 ; CHECK64-NEXT:    .cfi_restore w25
+; CHECK64-NEXT:    .cfi_restore vg
+; CHECK64-NEXT:    .cfi_restore w30
 ; CHECK64-NEXT:    .cfi_restore w29
 ; CHECK64-NEXT:    .cfi_restore b8
 ; CHECK64-NEXT:    .cfi_restore b9
@@ -758,18 +762,16 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ;
 ; CHECK1024-LABEL: csr_x18_25_d8_15_allocdi64_locallystreaming:
 ; CHECK1024:       // %bb.0: // %entry
-; CHECK1024-NEXT:    rdsvl x9, #1
-; CHECK1024-NEXT:    lsr x9, x9, #3
 ; CHECK1024-NEXT:    sub sp, sp, #1168
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1168
-; CHECK1024-NEXT:    str x9, [sp] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    cntd x9
-; CHECK1024-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; CHECK1024-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK1024-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK1024-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK1024-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK1024-NEXT:    str x29, [sp, #1104] // 8-byte Folded Spill
+; CHECK1024-NEXT:    stp d15, d14, [sp] // 16-byte Folded Spill
+; CHECK1024-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK1024-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-NEXT:    str x29, [sp, #1088] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x30, [sp, #1096] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x9, [sp, #1104] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x25, [sp, #1112] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x24, [sp, #1120] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x23, [sp, #1128] // 8-byte Folded Spill
@@ -784,16 +786,17 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK1024-NEXT:    .cfi_offset w23, -40
 ; CHECK1024-NEXT:    .cfi_offset w24, -48
 ; CHECK1024-NEXT:    .cfi_offset w25, -56
-; CHECK1024-NEXT:    .cfi_offset w29, -64
-; CHECK1024-NEXT:    .cfi_offset b8, -1096
-; CHECK1024-NEXT:    .cfi_offset b9, -1104
-; CHECK1024-NEXT:    .cfi_offset b10, -1112
-; CHECK1024-NEXT:    .cfi_offset b11, -1120
-; CHECK1024-NEXT:    .cfi_offset b12, -1128
-; CHECK1024-NEXT:    .cfi_offset b13, -1136
-; CHECK1024-NEXT:    .cfi_offset b14, -1144
-; CHECK1024-NEXT:    .cfi_offset b15, -1152
-; CHECK1024-NEXT:    .cfi_offset vg, -1160
+; CHECK1024-NEXT:    .cfi_offset vg, -64
+; CHECK1024-NEXT:    .cfi_offset w30, -72
+; CHECK1024-NEXT:    .cfi_offset w29, -80
+; CHECK1024-NEXT:    .cfi_offset b8, -1112
+; CHECK1024-NEXT:    .cfi_offset b9, -1120
+; CHECK1024-NEXT:    .cfi_offset b10, -1128
+; CHECK1024-NEXT:    .cfi_offset b11, -1136
+; CHECK1024-NEXT:    .cfi_offset b12, -1144
+; CHECK1024-NEXT:    .cfi_offset b13, -1152
+; CHECK1024-NEXT:    .cfi_offset b14, -1160
+; CHECK1024-NEXT:    .cfi_offset b15, -1168
 ; CHECK1024-NEXT:    sub sp, sp, #1056
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 2224
 ; CHECK1024-NEXT:    str d0, [sp, #1040] // 8-byte Folded Spill
@@ -809,18 +812,19 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK1024-NEXT:    mov w0, wzr
 ; CHECK1024-NEXT:    add sp, sp, #1056
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1168
-; CHECK1024-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x19, [sp, #1160] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x20, [sp, #1152] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x21, [sp, #1144] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x22, [sp, #1136] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x23, [sp, #1128] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x24, [sp, #1120] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x25, [sp, #1112] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x29, [sp, #1104] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldr x30, [sp, #1096] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK1024-NEXT:    ldp d15, d14, [sp] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    add sp, sp, #1168
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK1024-NEXT:    .cfi_restore w19
@@ -830,6 +834,8 @@ define i32 @csr_x18_25_d8_15_allocdi64_locallystreaming(i64 %d, double %e) "aarc
 ; CHECK1024-NEXT:    .cfi_restore w23
 ; CHECK1024-NEXT:    .cfi_restore w24
 ; CHECK1024-NEXT:    .cfi_restore w25
+; CHECK1024-NEXT:    .cfi_restore vg
+; CHECK1024-NEXT:    .cfi_restore w30
 ; CHECK1024-NEXT:    .cfi_restore w29
 ; CHECK1024-NEXT:    .cfi_restore b8
 ; CHECK1024-NEXT:    .cfi_restore b9
@@ -1570,36 +1576,38 @@ define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x
 define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32 %b) "aarch64_pstate_sm_compatible" {
 ; CHECK0-LABEL: f128_libcall:
 ; CHECK0:       // %bb.0:
-; CHECK0-NEXT:    sub sp, sp, #176
-; CHECK0-NEXT:    .cfi_def_cfa_offset 176
+; CHECK0-NEXT:    sub sp, sp, #192
+; CHECK0-NEXT:    .cfi_def_cfa_offset 192
 ; CHECK0-NEXT:    cntd x9
 ; CHECK0-NEXT:    stp d15, d14, [sp, #64] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp d13, d12, [sp, #80] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp d11, d10, [sp, #96] // 16-byte Folded Spill
 ; CHECK0-NEXT:    stp d9, d8, [sp, #112] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x30, x9, [sp, #128] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x22, x21, [sp, #144] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x29, x30, [sp, #128] // 16-byte Folded Spill
+; CHECK0-NEXT:    str x9, [sp, #144] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp x22, x21, [sp, #160] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x20, x19, [sp, #176] // 16-byte Folded Spill
 ; CHECK0-NEXT:    .cfi_offset w19, -8
 ; CHECK0-NEXT:    .cfi_offset w20, -16
 ; CHECK0-NEXT:    .cfi_offset w21, -24
 ; CHECK0-NEXT:    .cfi_offset w22, -32
-; CHECK0-NEXT:    .cfi_offset w30, -48
-; CHECK0-NEXT:    .cfi_offset b8, -56
-; CHECK0-NEXT:    .cfi_offset b9, -64
-; CHECK0-NEXT:    .cfi_offset b10, -72
-; CHECK0-NEXT:    .cfi_offset b11, -80
-; CHECK0-NEXT:    .cfi_offset b12, -88
-; CHECK0-NEXT:    .cfi_offset b13, -96
-; CHECK0-NEXT:    .cfi_offset b14, -104
-; CHECK0-NEXT:    .cfi_offset b15, -112
+; CHECK0-NEXT:    .cfi_offset vg, -48
+; CHECK0-NEXT:    .cfi_offset w30, -56
+; CHECK0-NEXT:    .cfi_offset w29, -64
+; CHECK0-NEXT:    .cfi_offset b8, -72
+; CHECK0-NEXT:    .cfi_offset b9, -80
+; CHECK0-NEXT:    .cfi_offset b10, -88
+; CHECK0-NEXT:    .cfi_offset b11, -96
+; CHECK0-NEXT:    .cfi_offset b12, -104
+; CHECK0-NEXT:    .cfi_offset b13, -112
+; CHECK0-NEXT:    .cfi_offset b14, -120
+; CHECK0-NEXT:    .cfi_offset b15, -128
 ; CHECK0-NEXT:    mov w19, w1
 ; CHECK0-NEXT:    mov w20, w0
 ; CHECK0-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
 ; CHECK0-NEXT:    stp q2, q3, [sp, #32] // 32-byte Folded Spill
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    mov x21, x0
-; CHECK0-NEXT:    .cfi_offset vg, -40
 ; CHECK0-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK0-NEXT:  // %bb.1:
 ; CHECK0-NEXT:    smstop sm
@@ -1611,9 +1619,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK0-NEXT:    smstart sm
 ; CHECK0-NEXT:  .LBB27_4:
 ; CHECK0-NEXT:    cmp w0, #0
-; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    cset w22, mi
-; CHECK0-NEXT:    .cfi_offset vg, -40
 ; CHECK0-NEXT:    tbz w21, #0, .LBB27_6
 ; CHECK0-NEXT:  // %bb.5:
 ; CHECK0-NEXT:    smstop sm
@@ -1625,24 +1631,25 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK0-NEXT:    smstart sm
 ; CHECK0-NEXT:  .LBB27_8:
 ; CHECK0-NEXT:    cmp w0, #0
+; CHECK0-NEXT:    ldp x29, x30, [sp, #128] // 16-byte Folded Reload
 ; CHECK0-NEXT:    cset w8, pl
+; CHECK0-NEXT:    ldp d9, d8, [sp, #112] // 16-byte Folded Reload
 ; CHECK0-NEXT:    tst w8, w22
+; CHECK0-NEXT:    ldp x22, x21, [sp, #160] // 16-byte Folded Reload
 ; CHECK0-NEXT:    csel w0, w20, w19, ne
-; CHECK0-NEXT:    .cfi_restore vg
-; CHECK0-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldr x30, [sp, #128] // 8-byte Folded Reload
-; CHECK0-NEXT:    ldp x22, x21, [sp, #144] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldp d9, d8, [sp, #112] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x20, x19, [sp, #176] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp d11, d10, [sp, #96] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp d13, d12, [sp, #80] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldp d15, d14, [sp, #64] // 16-byte Folded Reload
-; CHECK0-NEXT:    add sp, sp, #176
+; CHECK0-NEXT:    add sp, sp, #192
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
 ; CHECK0-NEXT:    .cfi_restore w20
 ; CHECK0-NEXT:    .cfi_restore w21
 ; CHECK0-NEXT:    .cfi_restore w22
+; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    .cfi_restore w30
+; CHECK0-NEXT:    .cfi_restore w29
 ; CHECK0-NEXT:    .cfi_restore b8
 ; CHECK0-NEXT:    .cfi_restore b9
 ; CHECK0-NEXT:    .cfi_restore b10
@@ -1663,13 +1670,15 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    stp d11, d10, [sp, #160] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp d9, d8, [sp, #176] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x29, x30, [sp, #256] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x9, x22, [sp, #272] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x21, x20, [sp, #288] // 16-byte Folded Spill
-; CHECK64-NEXT:    str x19, [sp, #304] // 8-byte Folded Spill
-; CHECK64-NEXT:    .cfi_offset w19, -16
-; CHECK64-NEXT:    .cfi_offset w20, -24
-; CHECK64-NEXT:    .cfi_offset w21, -32
-; CHECK64-NEXT:    .cfi_offset w22, -40
+; CHECK64-NEXT:    stp x9, x28, [sp, #272] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x22, x21, [sp, #288] // 16-byte Folded Spill
+; CHECK64-NEXT:    stp x20, x19, [sp, #304] // 16-byte Folded Spill
+; CHECK64-NEXT:    .cfi_offset w19, -8
+; CHECK64-NEXT:    .cfi_offset w20, -16
+; CHECK64-NEXT:    .cfi_offset w21, -24
+; CHECK64-NEXT:    .cfi_offset w22, -32
+; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    .cfi_offset w30, -56
 ; CHECK64-NEXT:    .cfi_offset w29, -64
 ; CHECK64-NEXT:    .cfi_offset b8, -136
@@ -1686,7 +1695,6 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    stp q2, q3, [sp, #96] // 32-byte Folded Spill
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    mov x21, x0
-; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK64-NEXT:  // %bb.1:
 ; CHECK64-NEXT:    smstop sm
@@ -1698,9 +1706,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    smstart sm
 ; CHECK64-NEXT:  .LBB27_4:
 ; CHECK64-NEXT:    cmp w0, #0
-; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    cset w22, mi
-; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    tbz w21, #0, .LBB27_6
 ; CHECK64-NEXT:  // %bb.5:
 ; CHECK64-NEXT:    smstop sm
@@ -1712,15 +1718,15 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    smstart sm
 ; CHECK64-NEXT:  .LBB27_8:
 ; CHECK64-NEXT:    cmp w0, #0
+; CHECK64-NEXT:    ldp x29, x30, [sp, #256] // 16-byte Folded Reload
 ; CHECK64-NEXT:    cset w8, pl
+; CHECK64-NEXT:    ldp d9, d8, [sp, #176] // 16-byte Folded Reload
 ; CHECK64-NEXT:    tst w8, w22
+; CHECK64-NEXT:    ldp x22, x21, [sp, #288] // 16-byte Folded Reload
 ; CHECK64-NEXT:    csel w0, w20, w19, ne
-; CHECK64-NEXT:    .cfi_restore vg
-; CHECK64-NEXT:    ldp x20, x19, [sp, #296] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp x22, x21, [sp, #280] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp x29, x30, [sp, #256] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldp d9, d8, [sp, #176] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x20, x19, [sp, #304] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp d11, d10, [sp, #160] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldr x28, [sp, #280] // 8-byte Folded Reload
 ; CHECK64-NEXT:    ldp d13, d12, [sp, #144] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp d15, d14, [sp, #128] // 16-byte Folded Reload
 ; CHECK64-NEXT:    add sp, sp, #320
@@ -1729,6 +1735,8 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    .cfi_restore w20
 ; CHECK64-NEXT:    .cfi_restore w21
 ; CHECK64-NEXT:    .cfi_restore w22
+; CHECK64-NEXT:    .cfi_restore w28
+; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    .cfi_restore w30
 ; CHECK64-NEXT:    .cfi_restore w29
 ; CHECK64-NEXT:    .cfi_restore b8
@@ -1753,14 +1761,17 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    str x29, [sp, #1088] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1096] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x9, [sp, #1104] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x22, [sp, #1112] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x21, [sp, #1120] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x20, [sp, #1128] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1136] // 8-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_offset w19, -16
-; CHECK1024-NEXT:    .cfi_offset w20, -24
-; CHECK1024-NEXT:    .cfi_offset w21, -32
-; CHECK1024-NEXT:    .cfi_offset w22, -40
+; CHECK1024-NEXT:    str x28, [sp, #1112] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x22, [sp, #1120] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x21, [sp, #1128] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x20, [sp, #1136] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1144] // 8-byte Folded Spill
+; CHECK1024-NEXT:    .cfi_offset w19, -8
+; CHECK1024-NEXT:    .cfi_offset w20, -16
+; CHECK1024-NEXT:    .cfi_offset w21, -24
+; CHECK1024-NEXT:    .cfi_offset w22, -32
+; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    .cfi_offset w30, -56
 ; CHECK1024-NEXT:    .cfi_offset w29, -64
 ; CHECK1024-NEXT:    .cfi_offset b8, -1096
@@ -1781,7 +1792,6 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    str q0, [sp, #1024] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    mov x21, x0
-; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK1024-NEXT:  // %bb.1:
 ; CHECK1024-NEXT:    smstop sm
@@ -1794,9 +1804,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    smstart sm
 ; CHECK1024-NEXT:  .LBB27_4:
 ; CHECK1024-NEXT:    cmp w0, #0
-; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    cset w22, mi
-; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    tbz w21, #0, .LBB27_6
 ; CHECK1024-NEXT:  // %bb.5:
 ; CHECK1024-NEXT:    smstop sm
@@ -1812,15 +1820,15 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    cset w8, pl
 ; CHECK1024-NEXT:    tst w8, w22
 ; CHECK1024-NEXT:    csel w0, w20, w19, ne
-; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    add sp, sp, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 1152
 ; CHECK1024-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldr x19, [sp, #1136] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x19, [sp, #1144] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK1024-NEXT:    ldr x20, [sp, #1128] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x21, [sp, #1120] // 8-byte Folded Reload
-; CHECK1024-NEXT:    ldr x22, [sp, #1112] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x20, [sp, #1136] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x21, [sp, #1128] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x22, [sp, #1120] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x28, [sp, #1112] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1096] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1088] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -1831,6 +1839,8 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    .cfi_restore w20
 ; CHECK1024-NEXT:    .cfi_restore w21
 ; CHECK1024-NEXT:    .cfi_restore w22
+; CHECK1024-NEXT:    .cfi_restore w28
+; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    .cfi_restore w30
 ; CHECK1024-NEXT:    .cfi_restore w29
 ; CHECK1024-NEXT:    .cfi_restore b8
@@ -1842,6 +1852,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    .cfi_restore b14
 ; CHECK1024-NEXT:    .cfi_restore b15
 ; CHECK1024-NEXT:    ret
+;el w0, w20, w19, ne
   %c0 = fcmp olt fp128 %v0, %v1
   %c1 = fcmp oge fp128 %v2, %v3
   %cr = and i1 %c1, %c0
@@ -1852,18 +1863,22 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
 ; CHECK0-LABEL: svecc_call:
 ; CHECK0:       // %bb.0: // %entry
-; CHECK0-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK0-NEXT:    stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK0-NEXT:    cntd x9
-; CHECK0-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x27, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    mov x29, sp
+; CHECK0-NEXT:    .cfi_def_cfa w29, 64
 ; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w27, -16
-; CHECK0-NEXT:    .cfi_offset w28, -24
-; CHECK0-NEXT:    .cfi_offset w30, -40
-; CHECK0-NEXT:    .cfi_offset w29, -48
+; CHECK0-NEXT:    .cfi_offset w26, -16
+; CHECK0-NEXT:    .cfi_offset w27, -24
+; CHECK0-NEXT:    .cfi_offset w28, -32
+; CHECK0-NEXT:    .cfi_offset vg, -48
+; CHECK0-NEXT:    .cfi_offset w30, -56
+; CHECK0-NEXT:    .cfi_offset w29, -64
 ; CHECK0-NEXT:    addvl sp, sp, #-18
-; CHECK0-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x30, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 48 + 144 * VG
 ; CHECK0-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK0-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK0-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -1892,20 +1907,19 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK0-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK0-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK0-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64
 ; CHECK0-NEXT:    mov x8, x0
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    mov x19, x0
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
-; CHECK0-NEXT:    .cfi_offset vg, -32
 ; CHECK0-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
 ; CHECK0-NEXT:    smstop sm
@@ -1918,13 +1932,12 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK0-NEXT:  // %bb.3: // %entry
 ; CHECK0-NEXT:    smstart sm
 ; CHECK0-NEXT:  .LBB28_4: // %entry
-; CHECK0-NEXT:    mov w0, #22647 // =0x5877
-; CHECK0-NEXT:    movk w0, #59491, lsl #16
-; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK0-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK0-NEXT:    movk w0, #59491, lsl #16
 ; CHECK0-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
@@ -1950,7 +1963,6 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK0-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK0-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK0-NEXT:    addvl sp, sp, #18
-; CHECK0-NEXT:    .cfi_def_cfa wsp, 48
 ; CHECK0-NEXT:    .cfi_restore z8
 ; CHECK0-NEXT:    .cfi_restore z9
 ; CHECK0-NEXT:    .cfi_restore z10
@@ -1959,32 +1971,39 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK0-NEXT:    .cfi_restore z13
 ; CHECK0-NEXT:    .cfi_restore z14
 ; CHECK0-NEXT:    .cfi_restore z15
-; CHECK0-NEXT:    ldp x27, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK0-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK0-NEXT:    .cfi_def_cfa wsp, 64
+; CHECK0-NEXT:    ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x29, x30, [sp], #64 // 16-byte Folded Reload
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
+; CHECK0-NEXT:    .cfi_restore w26
 ; CHECK0-NEXT:    .cfi_restore w27
 ; CHECK0-NEXT:    .cfi_restore w28
+; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    .cfi_restore w30
 ; CHECK0-NEXT:    .cfi_restore w29
 ; CHECK0-NEXT:    ret
 ;
 ; CHECK64-LABEL: svecc_call:
 ; CHECK64:       // %bb.0: // %entry
-; CHECK64-NEXT:    sub sp, sp, #112
-; CHECK64-NEXT:    .cfi_def_cfa_offset 112
+; CHECK64-NEXT:    sub sp, sp, #128
+; CHECK64-NEXT:    .cfi_def_cfa_offset 128
 ; CHECK64-NEXT:    cntd x9
 ; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x27, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_offset w19, -8
-; CHECK64-NEXT:    .cfi_offset w27, -16
-; CHECK64-NEXT:    .cfi_offset w28, -24
-; CHECK64-NEXT:    .cfi_offset w30, -40
-; CHECK64-NEXT:    .cfi_offset w29, -48
+; CHECK64-NEXT:    stp x27, x26, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT:    str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK64-NEXT:    add x29, sp, #64
+; CHECK64-NEXT:    .cfi_def_cfa w29, 64
+; CHECK64-NEXT:    .cfi_offset w19, -16
+; CHECK64-NEXT:    .cfi_offset w26, -24
+; CHECK64-NEXT:    .cfi_offset w27, -32
+; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset vg, -48
+; CHECK64-NEXT:    .cfi_offset w30, -56
+; CHECK64-NEXT:    .cfi_offset w29, -64
 ; CHECK64-NEXT:    addvl sp, sp, #-18
-; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xf0, 0x00, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 112 + 144 * VG
 ; CHECK64-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -2013,22 +2032,20 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK64-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 112
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
 ; CHECK64-NEXT:    sub sp, sp, #64
-; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x01, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 176 + 144 * VG
 ; CHECK64-NEXT:    mov x8, x0
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    mov x19, x0
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    .cfi_offset vg, -32
 ; CHECK64-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
 ; CHECK64-NEXT:    smstop sm
@@ -2043,9 +2060,7 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK64-NEXT:  .LBB28_4: // %entry
 ; CHECK64-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK64-NEXT:    movk w0, #59491, lsl #16
-; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    add sp, sp, #64
-; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xf0, 0x00, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 112 + 144 * VG
 ; CHECK64-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -2075,7 +2090,6 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK64-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK64-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK64-NEXT:    addvl sp, sp, #18
-; CHECK64-NEXT:    .cfi_def_cfa wsp, 112
 ; CHECK64-NEXT:    .cfi_restore z8
 ; CHECK64-NEXT:    .cfi_restore z9
 ; CHECK64-NEXT:    .cfi_restore z10
@@ -2084,36 +2098,43 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK64-NEXT:    .cfi_restore z13
 ; CHECK64-NEXT:    .cfi_restore z14
 ; CHECK64-NEXT:    .cfi_restore z15
-; CHECK64-NEXT:    ldp x27, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldr x28, [sp, #88] // 8-byte Folded Reload
+; CHECK64-NEXT:    .cfi_def_cfa wsp, 128
+; CHECK64-NEXT:    ldp x26, x19, [sp, #104] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x28, x27, [sp, #88] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT:    add sp, sp, #112
+; CHECK64-NEXT:    add sp, sp, #128
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK64-NEXT:    .cfi_restore w19
+; CHECK64-NEXT:    .cfi_restore w26
 ; CHECK64-NEXT:    .cfi_restore w27
 ; CHECK64-NEXT:    .cfi_restore w28
+; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    .cfi_restore w30
 ; CHECK64-NEXT:    .cfi_restore w29
 ; CHECK64-NEXT:    ret
 ;
 ; CHECK1024-LABEL: svecc_call:
 ; CHECK1024:       // %bb.0: // %entry
-; CHECK1024-NEXT:    sub sp, sp, #1072
-; CHECK1024-NEXT:    .cfi_def_cfa_offset 1072
+; CHECK1024-NEXT:    sub sp, sp, #1088
+; CHECK1024-NEXT:    .cfi_def_cfa_offset 1088
 ; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x9, [sp, #1040] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x28, [sp, #1048] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_offset w19, -8
-; CHECK1024-NEXT:    .cfi_offset w27, -16
-; CHECK1024-NEXT:    .cfi_offset w28, -24
-; CHECK1024-NEXT:    .cfi_offset w30, -40
-; CHECK1024-NEXT:    .cfi_offset w29, -48
+; CHECK1024-NEXT:    str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NEXT:    add x29, sp, #1024
+; CHECK1024-NEXT:    .cfi_def_cfa w29, 64
+; CHECK1024-NEXT:    .cfi_offset w19, -16
+; CHECK1024-NEXT:    .cfi_offset w26, -24
+; CHECK1024-NEXT:    .cfi_offset w27, -32
+; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset vg, -48
+; CHECK1024-NEXT:    .cfi_offset w30, -56
+; CHECK1024-NEXT:    .cfi_offset w29, -64
 ; CHECK1024-NEXT:    addvl sp, sp, #-18
-; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 1072 + 144 * VG
 ; CHECK1024-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK1024-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK1024-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -2142,22 +2163,20 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK1024-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1072
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
 ; CHECK1024-NEXT:    sub sp, sp, #1024
-; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 2096 + 144 * VG
 ; CHECK1024-NEXT:    mov x8, x0
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    mov x19, x0
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    .cfi_offset vg, -32
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
 ; CHECK1024-NEXT:    smstop sm
@@ -2172,9 +2191,7 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK1024-NEXT:  .LBB28_4: // %entry
 ; CHECK1024-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK1024-NEXT:    movk w0, #59491, lsl #16
-; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    add sp, sp, #1024
-; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 1072 + 144 * VG
 ; CHECK1024-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -2204,7 +2221,6 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK1024-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK1024-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK1024-NEXT:    addvl sp, sp, #18
-; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1072
 ; CHECK1024-NEXT:    .cfi_restore z8
 ; CHECK1024-NEXT:    .cfi_restore z9
 ; CHECK1024-NEXT:    .cfi_restore z10
@@ -2213,16 +2229,20 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK1024-NEXT:    .cfi_restore z13
 ; CHECK1024-NEXT:    .cfi_restore z14
 ; CHECK1024-NEXT:    .cfi_restore z15
-; CHECK1024-NEXT:    ldr x19, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1088
+; CHECK1024-NEXT:    ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x26, [sp, #1064] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x27, [sp, #1056] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x28, [sp, #1048] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT:    add sp, sp, #1072
+; CHECK1024-NEXT:    add sp, sp, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK1024-NEXT:    .cfi_restore w19
+; CHECK1024-NEXT:    .cfi_restore w26
 ; CHECK1024-NEXT:    .cfi_restore w27
 ; CHECK1024-NEXT:    .cfi_restore w28
+; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    .cfi_restore w30
 ; CHECK1024-NEXT:    .cfi_restore w29
 ; CHECK1024-NEXT:    ret
@@ -2235,18 +2255,22 @@ entry:
 define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
 ; CHECK0-LABEL: svecc_alloca_call:
 ; CHECK0:       // %bb.0: // %entry
-; CHECK0-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_def_cfa_offset 48
+; CHECK0-NEXT:    stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK0-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK0-NEXT:    cntd x9
-; CHECK0-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK0-NEXT:    stp x27, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK0-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK0-NEXT:    stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK0-NEXT:    mov x29, sp
+; CHECK0-NEXT:    .cfi_def_cfa w29, 64
 ; CHECK0-NEXT:    .cfi_offset w19, -8
-; CHECK0-NEXT:    .cfi_offset w27, -16
-; CHECK0-NEXT:    .cfi_offset w28, -24
-; CHECK0-NEXT:    .cfi_offset w30, -40
-; CHECK0-NEXT:    .cfi_offset w29, -48
+; CHECK0-NEXT:    .cfi_offset w26, -16
+; CHECK0-NEXT:    .cfi_offset w27, -24
+; CHECK0-NEXT:    .cfi_offset w28, -32
+; CHECK0-NEXT:    .cfi_offset vg, -48
+; CHECK0-NEXT:    .cfi_offset w30, -56
+; CHECK0-NEXT:    .cfi_offset w29, -64
 ; CHECK0-NEXT:    addvl sp, sp, #-18
-; CHECK0-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x30, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 48 + 144 * VG
 ; CHECK0-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK0-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK0-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -2275,21 +2299,19 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK0-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK0-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK0-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64
 ; CHECK0-NEXT:    sub sp, sp, #48
-; CHECK0-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 96 + 144 * VG
 ; CHECK0-NEXT:    bl __arm_sme_state
 ; CHECK0-NEXT:    mov x19, x0
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
-; CHECK0-NEXT:    .cfi_offset vg, -32
 ; CHECK0-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
 ; CHECK0-NEXT:    smstop sm
@@ -2304,9 +2326,7 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK0-NEXT:  .LBB29_4: // %entry
 ; CHECK0-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK0-NEXT:    movk w0, #59491, lsl #16
-; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    add sp, sp, #48
-; CHECK0-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x30, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 48 + 144 * VG
 ; CHECK0-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -2336,7 +2356,6 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK0-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK0-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK0-NEXT:    addvl sp, sp, #18
-; CHECK0-NEXT:    .cfi_def_cfa wsp, 48
 ; CHECK0-NEXT:    .cfi_restore z8
 ; CHECK0-NEXT:    .cfi_restore z9
 ; CHECK0-NEXT:    .cfi_restore z10
@@ -2345,32 +2364,39 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK0-NEXT:    .cfi_restore z13
 ; CHECK0-NEXT:    .cfi_restore z14
 ; CHECK0-NEXT:    .cfi_restore z15
-; CHECK0-NEXT:    ldp x27, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK0-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK0-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK0-NEXT:    .cfi_def_cfa wsp, 64
+; CHECK0-NEXT:    ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK0-NEXT:    ldp x29, x30, [sp], #64 // 16-byte Folded Reload
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
+; CHECK0-NEXT:    .cfi_restore w26
 ; CHECK0-NEXT:    .cfi_restore w27
 ; CHECK0-NEXT:    .cfi_restore w28
+; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    .cfi_restore w30
 ; CHECK0-NEXT:    .cfi_restore w29
 ; CHECK0-NEXT:    ret
 ;
 ; CHECK64-LABEL: svecc_alloca_call:
 ; CHECK64:       // %bb.0: // %entry
-; CHECK64-NEXT:    sub sp, sp, #112
-; CHECK64-NEXT:    .cfi_def_cfa_offset 112
+; CHECK64-NEXT:    sub sp, sp, #128
+; CHECK64-NEXT:    .cfi_def_cfa_offset 128
 ; CHECK64-NEXT:    cntd x9
 ; CHECK64-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK64-NEXT:    stp x9, x28, [sp, #80] // 16-byte Folded Spill
-; CHECK64-NEXT:    stp x27, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_offset w19, -8
-; CHECK64-NEXT:    .cfi_offset w27, -16
-; CHECK64-NEXT:    .cfi_offset w28, -24
-; CHECK64-NEXT:    .cfi_offset w30, -40
-; CHECK64-NEXT:    .cfi_offset w29, -48
+; CHECK64-NEXT:    stp x27, x26, [sp, #96] // 16-byte Folded Spill
+; CHECK64-NEXT:    str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK64-NEXT:    add x29, sp, #64
+; CHECK64-NEXT:    .cfi_def_cfa w29, 64
+; CHECK64-NEXT:    .cfi_offset w19, -16
+; CHECK64-NEXT:    .cfi_offset w26, -24
+; CHECK64-NEXT:    .cfi_offset w27, -32
+; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset vg, -48
+; CHECK64-NEXT:    .cfi_offset w30, -56
+; CHECK64-NEXT:    .cfi_offset w29, -64
 ; CHECK64-NEXT:    addvl sp, sp, #-18
-; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xf0, 0x00, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 112 + 144 * VG
 ; CHECK64-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK64-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -2399,21 +2425,19 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK64-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 112
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 112
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
 ; CHECK64-NEXT:    sub sp, sp, #112
-; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x01, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 224 + 144 * VG
 ; CHECK64-NEXT:    bl __arm_sme_state
 ; CHECK64-NEXT:    mov x19, x0
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    .cfi_offset vg, -32
 ; CHECK64-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
 ; CHECK64-NEXT:    smstop sm
@@ -2428,9 +2452,7 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK64-NEXT:  .LBB29_4: // %entry
 ; CHECK64-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK64-NEXT:    movk w0, #59491, lsl #16
-; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    add sp, sp, #112
-; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xf0, 0x00, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 112 + 144 * VG
 ; CHECK64-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -2460,7 +2482,6 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK64-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK64-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK64-NEXT:    addvl sp, sp, #18
-; CHECK64-NEXT:    .cfi_def_cfa wsp, 112
 ; CHECK64-NEXT:    .cfi_restore z8
 ; CHECK64-NEXT:    .cfi_restore z9
 ; CHECK64-NEXT:    .cfi_restore z10
@@ -2469,36 +2490,43 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK64-NEXT:    .cfi_restore z13
 ; CHECK64-NEXT:    .cfi_restore z14
 ; CHECK64-NEXT:    .cfi_restore z15
-; CHECK64-NEXT:    ldp x27, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK64-NEXT:    ldr x28, [sp, #88] // 8-byte Folded Reload
+; CHECK64-NEXT:    .cfi_def_cfa wsp, 128
+; CHECK64-NEXT:    ldp x26, x19, [sp, #104] // 16-byte Folded Reload
+; CHECK64-NEXT:    ldp x28, x27, [sp, #88] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT:    add sp, sp, #112
+; CHECK64-NEXT:    add sp, sp, #128
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK64-NEXT:    .cfi_restore w19
+; CHECK64-NEXT:    .cfi_restore w26
 ; CHECK64-NEXT:    .cfi_restore w27
 ; CHECK64-NEXT:    .cfi_restore w28
+; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    .cfi_restore w30
 ; CHECK64-NEXT:    .cfi_restore w29
 ; CHECK64-NEXT:    ret
 ;
 ; CHECK1024-LABEL: svecc_alloca_call:
 ; CHECK1024:       // %bb.0: // %entry
-; CHECK1024-NEXT:    sub sp, sp, #1072
-; CHECK1024-NEXT:    .cfi_def_cfa_offset 1072
+; CHECK1024-NEXT:    sub sp, sp, #1088
+; CHECK1024-NEXT:    .cfi_def_cfa_offset 1088
 ; CHECK1024-NEXT:    cntd x9
 ; CHECK1024-NEXT:    str x29, [sp, #1024] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x30, [sp, #1032] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x9, [sp, #1040] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x28, [sp, #1048] // 8-byte Folded Spill
 ; CHECK1024-NEXT:    str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT:    str x19, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_offset w19, -8
-; CHECK1024-NEXT:    .cfi_offset w27, -16
-; CHECK1024-NEXT:    .cfi_offset w28, -24
-; CHECK1024-NEXT:    .cfi_offset w30, -40
-; CHECK1024-NEXT:    .cfi_offset w29, -48
+; CHECK1024-NEXT:    str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NEXT:    str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NEXT:    add x29, sp, #1024
+; CHECK1024-NEXT:    .cfi_def_cfa w29, 64
+; CHECK1024-NEXT:    .cfi_offset w19, -16
+; CHECK1024-NEXT:    .cfi_offset w26, -24
+; CHECK1024-NEXT:    .cfi_offset w27, -32
+; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset vg, -48
+; CHECK1024-NEXT:    .cfi_offset w30, -56
+; CHECK1024-NEXT:    .cfi_offset w29, -64
 ; CHECK1024-NEXT:    addvl sp, sp, #-18
-; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 1072 + 144 * VG
 ; CHECK1024-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK1024-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK1024-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -2527,21 +2555,19 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1072
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1072
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
 ; CHECK1024-NEXT:    sub sp, sp, #1072
-; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 2144 + 144 * VG
 ; CHECK1024-NEXT:    bl __arm_sme_state
 ; CHECK1024-NEXT:    mov x19, x0
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    .cfi_offset vg, -32
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
 ; CHECK1024-NEXT:    smstop sm
@@ -2556,9 +2582,7 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024-NEXT:  .LBB29_4: // %entry
 ; CHECK1024-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK1024-NEXT:    movk w0, #59491, lsl #16
-; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    add sp, sp, #1072
-; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 1072 + 144 * VG
 ; CHECK1024-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -2588,7 +2612,6 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK1024-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK1024-NEXT:    addvl sp, sp, #18
-; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1072
 ; CHECK1024-NEXT:    .cfi_restore z8
 ; CHECK1024-NEXT:    .cfi_restore z9
 ; CHECK1024-NEXT:    .cfi_restore z10
@@ -2597,16 +2620,20 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024-NEXT:    .cfi_restore z13
 ; CHECK1024-NEXT:    .cfi_restore z14
 ; CHECK1024-NEXT:    .cfi_restore z15
-; CHECK1024-NEXT:    ldr x19, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NEXT:    .cfi_def_cfa wsp, 1088
+; CHECK1024-NEXT:    ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NEXT:    ldr x26, [sp, #1064] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x27, [sp, #1056] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x28, [sp, #1048] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x30, [sp, #1032] // 8-byte Folded Reload
 ; CHECK1024-NEXT:    ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT:    add sp, sp, #1072
+; CHECK1024-NEXT:    add sp, sp, #1088
 ; CHECK1024-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK1024-NEXT:    .cfi_restore w19
+; CHECK1024-NEXT:    .cfi_restore w26
 ; CHECK1024-NEXT:    .cfi_restore w27
 ; CHECK1024-NEXT:    .cfi_restore w28
+; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    .cfi_restore w30
 ; CHECK1024-NEXT:    .cfi_restore w29
 ; CHECK1024-NEXT:    ret
@@ -2811,6 +2838,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK0-NEXT:    .cfi_def_cfa w29, 48
 ; CHECK0-NEXT:    .cfi_offset w19, -8
 ; CHECK0-NEXT:    .cfi_offset w20, -16
+; CHECK0-NEXT:    .cfi_offset vg, -32
 ; CHECK0-NEXT:    .cfi_offset w30, -40
 ; CHECK0-NEXT:    .cfi_offset w29, -48
 ; CHECK0-NEXT:    .cfi_offset b8, -56
@@ -2833,11 +2861,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK0-NEXT:    stur wzr, [x29, #-68]
 ; CHECK0-NEXT:    sturh w8, [x29, #-72]
 ; CHECK0-NEXT:    msr TPIDR2_EL0, x9
-; CHECK0-NEXT:    .cfi_offset vg, -32
 ; CHECK0-NEXT:    smstop sm
 ; CHECK0-NEXT:    bl other
 ; CHECK0-NEXT:    smstart sm
-; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    smstart za
 ; CHECK0-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK0-NEXT:    sub x0, x29, #80
@@ -2858,6 +2884,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK0-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK0-NEXT:    .cfi_restore w19
 ; CHECK0-NEXT:    .cfi_restore w20
+; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    .cfi_restore w30
 ; CHECK0-NEXT:    .cfi_restore w29
 ; CHECK0-NEXT:    .cfi_restore b8
@@ -2885,6 +2912,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK64-NEXT:    .cfi_def_cfa w29, 48
 ; CHECK64-NEXT:    .cfi_offset w19, -16
 ; CHECK64-NEXT:    .cfi_offset w20, -24
+; CHECK64-NEXT:    .cfi_offset vg, -32
 ; CHECK64-NEXT:    .cfi_offset w30, -40
 ; CHECK64-NEXT:    .cfi_offset w29, -48
 ; CHECK64-NEXT:    .cfi_offset b8, -120
@@ -2908,11 +2936,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK64-NEXT:    str wzr, [x19, #12]
 ; CHECK64-NEXT:    strh w8, [x19, #8]
 ; CHECK64-NEXT:    msr TPIDR2_EL0, x9
-; CHECK64-NEXT:    .cfi_offset vg, -32
 ; CHECK64-NEXT:    smstop sm
 ; CHECK64-NEXT:    bl other
 ; CHECK64-NEXT:    smstart sm
-; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    smstart za
 ; CHECK64-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK64-NEXT:    add x0, x19, #0
@@ -2934,6 +2960,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK64-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK64-NEXT:    .cfi_restore w19
 ; CHECK64-NEXT:    .cfi_restore w20
+; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    .cfi_restore w30
 ; CHECK64-NEXT:    .cfi_restore w29
 ; CHECK64-NEXT:    .cfi_restore b8
@@ -2966,6 +2993,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK1024-NEXT:    .cfi_offset w19, -8
 ; CHECK1024-NEXT:    .cfi_offset w20, -16
 ; CHECK1024-NEXT:    .cfi_offset w28, -24
+; CHECK1024-NEXT:    .cfi_offset vg, -32
 ; CHECK1024-NEXT:    .cfi_offset w30, -40
 ; CHECK1024-NEXT:    .cfi_offset w29, -48
 ; CHECK1024-NEXT:    .cfi_offset b8, -1080
@@ -2989,11 +3017,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK1024-NEXT:    str wzr, [x19, #12]
 ; CHECK1024-NEXT:    strh w8, [x19, #8]
 ; CHECK1024-NEXT:    msr TPIDR2_EL0, x9
-; CHECK1024-NEXT:    .cfi_offset vg, -32
 ; CHECK1024-NEXT:    smstop sm
 ; CHECK1024-NEXT:    bl other
 ; CHECK1024-NEXT:    smstart sm
-; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    smstart za
 ; CHECK1024-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK1024-NEXT:    add x0, x19, #0
@@ -3019,6 +3045,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK1024-NEXT:    .cfi_restore w19
 ; CHECK1024-NEXT:    .cfi_restore w20
 ; CHECK1024-NEXT:    .cfi_restore w28
+; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    .cfi_restore w30
 ; CHECK1024-NEXT:    .cfi_restore w29
 ; CHECK1024-NEXT:    .cfi_restore b8
@@ -3156,6 +3183,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0-NEXT:    .cfi_offset w26, -24
 ; CHECK0-NEXT:    .cfi_offset w27, -32
 ; CHECK0-NEXT:    .cfi_offset w28, -40
+; CHECK0-NEXT:    .cfi_offset vg, -48
 ; CHECK0-NEXT:    .cfi_offset w30, -56
 ; CHECK0-NEXT:    .cfi_offset w29, -64
 ; CHECK0-NEXT:    addvl sp, sp, #-18
@@ -3187,14 +3215,14 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK0-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK0-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * VG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64
 ; CHECK0-NEXT:    mov x19, sp
 ; CHECK0-NEXT:    mov w2, w1
 ; CHECK0-NEXT:    mov w8, w0
@@ -3208,7 +3236,6 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0-NEXT:    mov sp, x8
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
-; CHECK0-NEXT:    .cfi_offset vg, -48
 ; CHECK0-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
 ; CHECK0-NEXT:    smstop sm
@@ -3222,7 +3249,6 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0-NEXT:  .LBB35_4: // %entry
 ; CHECK0-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK0-NEXT:    movk w0, #59491, lsl #16
-; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    addvl sp, x29, #-18
 ; CHECK0-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -3272,6 +3298,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0-NEXT:    .cfi_restore w26
 ; CHECK0-NEXT:    .cfi_restore w27
 ; CHECK0-NEXT:    .cfi_restore w28
+; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    .cfi_restore w30
 ; CHECK0-NEXT:    .cfi_restore w29
 ; CHECK0-NEXT:    ret
@@ -3292,6 +3319,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64-NEXT:    .cfi_offset w26, -24
 ; CHECK64-NEXT:    .cfi_offset w27, -32
 ; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    .cfi_offset w30, -56
 ; CHECK64-NEXT:    .cfi_offset w29, -64
 ; CHECK64-NEXT:    addvl sp, sp, #-18
@@ -3323,14 +3351,14 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
 ; CHECK64-NEXT:    sub sp, sp, #64
 ; CHECK64-NEXT:    mov x19, sp
 ; CHECK64-NEXT:    mov w2, w1
@@ -3345,7 +3373,6 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64-NEXT:    mov sp, x8
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
 ; CHECK64-NEXT:    smstop sm
@@ -3358,9 +3385,8 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64-NEXT:    smstart sm
 ; CHECK64-NEXT:  .LBB35_4: // %entry
 ; CHECK64-NEXT:    mov w0, #22647 // =0x5877
-; CHECK64-NEXT:    movk w0, #59491, lsl #16
-; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    sub x8, x29, #64
+; CHECK64-NEXT:    movk w0, #59491, lsl #16
 ; CHECK64-NEXT:    addvl sp, x8, #-18
 ; CHECK64-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -3411,6 +3437,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64-NEXT:    .cfi_restore w26
 ; CHECK64-NEXT:    .cfi_restore w27
 ; CHECK64-NEXT:    .cfi_restore w28
+; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    .cfi_restore w30
 ; CHECK64-NEXT:    .cfi_restore w29
 ; CHECK64-NEXT:    ret
@@ -3435,6 +3462,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024-NEXT:    .cfi_offset w26, -24
 ; CHECK1024-NEXT:    .cfi_offset w27, -32
 ; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    .cfi_offset w30, -56
 ; CHECK1024-NEXT:    .cfi_offset w29, -64
 ; CHECK1024-NEXT:    addvl sp, sp, #-18
@@ -3466,14 +3494,14 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
 ; CHECK1024-NEXT:    sub sp, sp, #1024
 ; CHECK1024-NEXT:    mov x19, sp
 ; CHECK1024-NEXT:    mov w2, w1
@@ -3488,7 +3516,6 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024-NEXT:    mov sp, x8
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
 ; CHECK1024-NEXT:    smstop sm
@@ -3501,9 +3528,8 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024-NEXT:    smstart sm
 ; CHECK1024-NEXT:  .LBB35_4: // %entry
 ; CHECK1024-NEXT:    mov w0, #22647 // =0x5877
-; CHECK1024-NEXT:    movk w0, #59491, lsl #16
-; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    sub x8, x29, #1024
+; CHECK1024-NEXT:    movk w0, #59491, lsl #16
 ; CHECK1024-NEXT:    addvl sp, x8, #-18
 ; CHECK1024-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -3557,6 +3583,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024-NEXT:    .cfi_restore w26
 ; CHECK1024-NEXT:    .cfi_restore w27
 ; CHECK1024-NEXT:    .cfi_restore w28
+; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    .cfi_restore w30
 ; CHECK1024-NEXT:    .cfi_restore w29
 ; CHECK1024-NEXT:    ret
@@ -3583,6 +3610,7 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0-NEXT:    .cfi_offset w26, -16
 ; CHECK0-NEXT:    .cfi_offset w27, -24
 ; CHECK0-NEXT:    .cfi_offset w28, -32
+; CHECK0-NEXT:    .cfi_offset vg, -48
 ; CHECK0-NEXT:    .cfi_offset w30, -56
 ; CHECK0-NEXT:    .cfi_offset w29, -64
 ; CHECK0-NEXT:    addvl sp, sp, #-18
@@ -3614,14 +3642,14 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK0-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK0-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * VG - 64
-; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * VG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64
+; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64
 ; CHECK0-NEXT:    sub x9, sp, #1024
 ; CHECK0-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK0-NEXT:    mov w2, w1
@@ -3629,7 +3657,6 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0-NEXT:    mov x19, x0
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
-; CHECK0-NEXT:    .cfi_offset vg, -48
 ; CHECK0-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
 ; CHECK0-NEXT:    smstop sm
@@ -3643,7 +3670,6 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0-NEXT:  .LBB36_4: // %entry
 ; CHECK0-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK0-NEXT:    movk w0, #59491, lsl #16
-; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    addvl sp, x29, #-18
 ; CHECK0-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK0-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -3691,6 +3717,7 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0-NEXT:    .cfi_restore w26
 ; CHECK0-NEXT:    .cfi_restore w27
 ; CHECK0-NEXT:    .cfi_restore w28
+; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    .cfi_restore w30
 ; CHECK0-NEXT:    .cfi_restore w29
 ; CHECK0-NEXT:    ret
@@ -3710,6 +3737,7 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT:    .cfi_offset w26, -24
 ; CHECK64-NEXT:    .cfi_offset w27, -32
 ; CHECK64-NEXT:    .cfi_offset w28, -40
+; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    .cfi_offset w30, -56
 ; CHECK64-NEXT:    .cfi_offset w29, -64
 ; CHECK64-NEXT:    addvl sp, sp, #-18
@@ -3741,14 +3769,14 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK64-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128
-; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128
+; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128
 ; CHECK64-NEXT:    sub x9, sp, #1088
 ; CHECK64-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK64-NEXT:    mov w2, w1
@@ -3756,7 +3784,6 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT:    mov x19, x0
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
 ; CHECK64-NEXT:    smstop sm
@@ -3769,9 +3796,8 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT:    smstart sm
 ; CHECK64-NEXT:  .LBB36_4: // %entry
 ; CHECK64-NEXT:    mov w0, #22647 // =0x5877
-; CHECK64-NEXT:    movk w0, #59491, lsl #16
-; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    sub x8, x29, #64
+; CHECK64-NEXT:    movk w0, #59491, lsl #16
 ; CHECK64-NEXT:    addvl sp, x8, #-18
 ; CHECK64-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK64-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -3820,6 +3846,7 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT:    .cfi_restore w26
 ; CHECK64-NEXT:    .cfi_restore w27
 ; CHECK64-NEXT:    .cfi_restore w28
+; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    .cfi_restore w30
 ; CHECK64-NEXT:    .cfi_restore w29
 ; CHECK64-NEXT:    ret
@@ -3842,6 +3869,7 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024-NEXT:    .cfi_offset w26, -24
 ; CHECK1024-NEXT:    .cfi_offset w27, -32
 ; CHECK1024-NEXT:    .cfi_offset w28, -40
+; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    .cfi_offset w30, -56
 ; CHECK1024-NEXT:    .cfi_offset w29, -64
 ; CHECK1024-NEXT:    addvl sp, sp, #-18
@@ -3873,14 +3901,14 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088
-; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
 ; CHECK1024-NEXT:    sub x9, sp, #2048
 ; CHECK1024-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK1024-NEXT:    mov w2, w1
@@ -3888,7 +3916,6 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024-NEXT:    mov x19, x0
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
 ; CHECK1024-NEXT:    smstop sm
@@ -3901,9 +3928,8 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024-NEXT:    smstart sm
 ; CHECK1024-NEXT:  .LBB36_4: // %entry
 ; CHECK1024-NEXT:    mov w0, #22647 // =0x5877
-; CHECK1024-NEXT:    movk w0, #59491, lsl #16
-; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    sub x8, x29, #1024
+; CHECK1024-NEXT:    movk w0, #59491, lsl #16
 ; CHECK1024-NEXT:    addvl sp, x8, #-18
 ; CHECK1024-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -3955,6 +3981,7 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024-NEXT:    .cfi_restore w26
 ; CHECK1024-NEXT:    .cfi_restore w27
 ; CHECK1024-NEXT:    .cfi_restore w28
+; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    .cfi_restore w30
 ; CHECK1024-NEXT:    .cfi_restore w29
 ; CHECK1024-NEXT:    ret

diff  --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index 0650b8271d374..7ad95429949a0 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -22,14 +22,13 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
@@ -72,13 +71,12 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NO-SME-ROUTINES-LABEL: se_memset:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
-; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov w1, #2 // =0x2
@@ -121,14 +119,13 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
 ; CHECK-NO-SME-ROUTINES-LABEL: se_memmove:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
@@ -171,14 +168,12 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ;
 ; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    bl __arm_sme_state
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x19, x0
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
@@ -194,12 +189,11 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
 ; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_4: // %entry
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ret
 ;
 ; CHECK-MOPS-LABEL: sc_memcpy:
@@ -220,16 +214,12 @@ entry:
 define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-LABEL: sb_memcpy:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NEXT:    rdsvl x9, #1
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    lsr x9, x9, #3
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    cntd x9
-; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:    adrp x0, :got:dst
 ; CHECK-NEXT:    adrp x1, :got:src
@@ -241,21 +231,17 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy:
 ; CHECK-NO-SME-ROUTINES:       // %bb.0: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    rdsvl x9, #1
+; CHECK-NO-SME-ROUTINES-NEXT:    stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT:    lsr x9, x9, #3
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT:    cntd x9
-; CHECK-NO-SME-ROUTINES-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstart sm
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
@@ -267,20 +253,15 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT:    ldp d15, d14, [sp], #80 // 16-byte Folded Reload
 ; CHECK-NO-SME-ROUTINES-NEXT:    ret
 ;
 ; CHECK-MOPS-LABEL: sb_memcpy:
 ; CHECK-MOPS:       // %bb.0: // %entry
-; CHECK-MOPS-NEXT:    rdsvl x9, #1
-; CHECK-MOPS-NEXT:    lsr x9, x9, #3
-; CHECK-MOPS-NEXT:    str x9, [sp, #-80]! // 8-byte Folded Spill
-; CHECK-MOPS-NEXT:    cntd x9
-; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    str x9, [sp, #8] // 8-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-MOPS-NEXT:    smstart sm
 ; CHECK-MOPS-NEXT:    adrp x8, :got:src
 ; CHECK-MOPS-NEXT:    adrp x9, :got:dst
@@ -290,11 +271,10 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
 ; CHECK-MOPS-NEXT:    cpyfm [x9]!, [x8]!, x0!
 ; CHECK-MOPS-NEXT:    cpyfe [x9]!, [x8]!, x0!
 ; CHECK-MOPS-NEXT:    smstop sm
-; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT:    add sp, sp, #80
+; CHECK-MOPS-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-MOPS-NEXT:    ret
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)

diff  --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index e8f16e64ce4b0..1f1ca7e3b9ee9 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -340,52 +340,57 @@ entry:
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 8, Size: 8
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-258 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-260 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-262 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-264 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-266 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-268 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-270 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-272 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-274 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-276 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-278 x vscale], Type: Spill, Align: 2, Size: vscale x 2
-; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-280 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-16 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-258 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-260 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-262 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-264 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-266 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-268 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-270 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-272 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-274 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-276 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-278 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-280 x vscale], Type: Spill, Align: 2, Size: vscale x 2
 
 define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
 ; CHECK-COMMON-LABEL: svecc_call:
 ; CHECK-COMMON:       // %bb.0: // %entry
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-COMMON-NEXT:    cntd x9
-; CHECK-COMMON-NEXT:    stp x9, x28, [sp, #16] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    stp x27, x19, [sp, #32] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    mov x29, sp
+; CHECK-COMMON-NEXT:    .cfi_def_cfa w29, 64
 ; CHECK-COMMON-NEXT:    .cfi_offset w19, -8
-; CHECK-COMMON-NEXT:    .cfi_offset w27, -16
-; CHECK-COMMON-NEXT:    .cfi_offset w28, -24
-; CHECK-COMMON-NEXT:    .cfi_offset w30, -40
-; CHECK-COMMON-NEXT:    .cfi_offset w29, -48
+; CHECK-COMMON-NEXT:    .cfi_offset w26, -16
+; CHECK-COMMON-NEXT:    .cfi_offset w27, -24
+; CHECK-COMMON-NEXT:    .cfi_offset w28, -32
+; CHECK-COMMON-NEXT:    .cfi_offset vg, -48
+; CHECK-COMMON-NEXT:    .cfi_offset w30, -56
+; CHECK-COMMON-NEXT:    .cfi_offset w29, -64
 ; CHECK-COMMON-NEXT:    addvl sp, sp, #-18
-; CHECK-COMMON-NEXT:    .cfi_escape 0x0f, 0x0a, 0x8f, 0x30, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 48 + 144 * VG
 ; CHECK-COMMON-NEXT:    str p15, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str p14, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str p13, [sp, #6, mul vl] // 2-byte Folded Spill
@@ -414,20 +419,19 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK-COMMON-NEXT:    str z10, [sp, #15, mul vl] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str z9, [sp, #16, mul vl] // 16-byte Folded Spill
 ; CHECK-COMMON-NEXT:    str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d8 @ cfa - 8 * VG - 48
-; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d9 @ cfa - 16 * VG - 48
-; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d10 @ cfa - 24 * VG - 48
-; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d11 @ cfa - 32 * VG - 48
-; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d12 @ cfa - 40 * VG - 48
-; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d13 @ cfa - 48 * VG - 48
-; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48
-; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
+; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64
+; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64
+; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64
+; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64
+; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64
+; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64
+; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64
+; CHECK-COMMON-NEXT:    .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64
 ; CHECK-COMMON-NEXT:    mov x8, x0
 ; CHECK-COMMON-NEXT:    bl __arm_sme_state
 ; CHECK-COMMON-NEXT:    mov x19, x0
 ; CHECK-COMMON-NEXT:    //APP
 ; CHECK-COMMON-NEXT:    //NO_APP
-; CHECK-COMMON-NEXT:    .cfi_offset vg, -32
 ; CHECK-COMMON-NEXT:    tbz w19, #0, .LBB7_2
 ; CHECK-COMMON-NEXT:  // %bb.1: // %entry
 ; CHECK-COMMON-NEXT:    smstop sm
@@ -440,13 +444,12 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK-COMMON-NEXT:  // %bb.3: // %entry
 ; CHECK-COMMON-NEXT:    smstart sm
 ; CHECK-COMMON-NEXT:  .LBB7_4: // %entry
-; CHECK-COMMON-NEXT:    mov w0, #22647 // =0x5877
-; CHECK-COMMON-NEXT:    movk w0, #59491, lsl #16
-; CHECK-COMMON-NEXT:    .cfi_restore vg
 ; CHECK-COMMON-NEXT:    ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    mov w0, #22647 // =0x5877
 ; CHECK-COMMON-NEXT:    ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    movk w0, #59491, lsl #16
 ; CHECK-COMMON-NEXT:    ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
@@ -472,7 +475,6 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK-COMMON-NEXT:    ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
 ; CHECK-COMMON-NEXT:    addvl sp, sp, #18
-; CHECK-COMMON-NEXT:    .cfi_def_cfa wsp, 48
 ; CHECK-COMMON-NEXT:    .cfi_restore z8
 ; CHECK-COMMON-NEXT:    .cfi_restore z9
 ; CHECK-COMMON-NEXT:    .cfi_restore z10
@@ -481,13 +483,16 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK-COMMON-NEXT:    .cfi_restore z13
 ; CHECK-COMMON-NEXT:    .cfi_restore z14
 ; CHECK-COMMON-NEXT:    .cfi_restore z15
-; CHECK-COMMON-NEXT:    ldp x27, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x28, [sp, #24] // 8-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    .cfi_def_cfa wsp, 64
+; CHECK-COMMON-NEXT:    ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #64 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-COMMON-NEXT:    .cfi_restore w19
+; CHECK-COMMON-NEXT:    .cfi_restore w26
 ; CHECK-COMMON-NEXT:    .cfi_restore w27
 ; CHECK-COMMON-NEXT:    .cfi_restore w28
+; CHECK-COMMON-NEXT:    .cfi_restore vg
 ; CHECK-COMMON-NEXT:    .cfi_restore w30
 ; CHECK-COMMON-NEXT:    .cfi_restore w29
 ; CHECK-COMMON-NEXT:    ret
@@ -534,6 +539,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEXT:    .cfi_def_cfa w29, 48
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
+; CHECK-NEXT:    .cfi_offset vg, -32
 ; CHECK-NEXT:    .cfi_offset w30, -40
 ; CHECK-NEXT:    .cfi_offset w29, -48
 ; CHECK-NEXT:    .cfi_offset b8, -56
@@ -556,11 +562,9 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEXT:    stur wzr, [x29, #-68]
 ; CHECK-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
-; CHECK-NEXT:    .cfi_offset vg, -32
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl other
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #80
@@ -581,6 +585,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
 ; CHECK-NEXT:    .cfi_restore w20
+; CHECK-NEXT:    .cfi_restore vg
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    .cfi_restore w29
 ; CHECK-NEXT:    .cfi_restore b8
@@ -608,6 +613,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEWLOWERING-NEXT:    .cfi_def_cfa w29, 48
 ; CHECK-NEWLOWERING-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEWLOWERING-NEXT:    .cfi_offset w20, -16
+; CHECK-NEWLOWERING-NEXT:    .cfi_offset vg, -32
 ; CHECK-NEWLOWERING-NEXT:    .cfi_offset w30, -40
 ; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -48
 ; CHECK-NEWLOWERING-NEXT:    .cfi_offset b8, -56
@@ -627,13 +633,11 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEWLOWERING-NEXT:    mov w20, w0
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-80]
 ; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset vg, -32
 ; CHECK-NEWLOWERING-NEXT:    smstop sm
 ; CHECK-NEWLOWERING-NEXT:    bl other
 ; CHECK-NEWLOWERING-NEXT:    smstart sm
 ; CHECK-NEWLOWERING-NEXT:    mov w0, w20
 ; CHECK-NEWLOWERING-NEXT:    mov w8, w0
-; CHECK-NEWLOWERING-NEXT:    .cfi_restore vg
 ; CHECK-NEWLOWERING-NEXT:    smstart za
 ; CHECK-NEWLOWERING-NEXT:    mrs x9, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #80
@@ -654,6 +658,7 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
 ; CHECK-NEWLOWERING-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEWLOWERING-NEXT:    .cfi_restore w19
 ; CHECK-NEWLOWERING-NEXT:    .cfi_restore w20
+; CHECK-NEWLOWERING-NEXT:    .cfi_restore vg
 ; CHECK-NEWLOWERING-NEXT:    .cfi_restore w30
 ; CHECK-NEWLOWERING-NEXT:    .cfi_restore w29
 ; CHECK-NEWLOWERING-NEXT:    .cfi_restore b8


        


More information about the llvm-commits mailing list