[llvm] [AArch64][SME] Use entry pstate.sm for conditional streaming-mode changes (PR #152169)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 6 07:31:22 PDT 2025


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/152169

>From c5ef6d529be5b64d566ab3217dfae5eecd002e76 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 5 Aug 2025 15:54:29 +0000
Subject: [PATCH 1/2] [AArch64][SME] Use entry pstate.sm for conditional
 streaming-mode changes

We only do conditional streaming mode changes in two cases:

 - Around calls in streaming-compatible functions that don't have a streaming body
 - At the entry/exit of streaming-compatible functions with a streaming body

In both cases, the condition depends on the entry pstate.sm value. Given
this, we don't need to emit calls to __arm_sme_state at every mode change.

This patch handles this by placing an "AArch64ISD::CALLER_IS_STREAMING"
node in the entry block and copying the result to a register. The register
is then used whenever we need to emit a conditional streaming mode change.
The "CALLER_IS_STREAMING" node expands to a call to "__arm_sme_state"
only if (after SelectionDAG) the function is determined to have
streaming-mode changes.

This has two main advantages:

1. It allows back-to-back conditional smstart/stop pairs to be folded
2. It has the correct behaviour for EH landing pads
  - These are entered with pstate.sm = 0, and should switch mode based on
    the entry pstate.sm
  - Note: This is not fully implemented yet
---
 .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp |   4 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  91 ++++++++-----
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   6 +-
 .../AArch64/AArch64MachineFunctionInfo.h      |   6 +
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |   9 ++
 llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp    |  12 +-
 llvm/test/CodeGen/AArch64/sme-agnostic-za.ll  |  28 ++--
 ...compatible-to-normal-fn-wihout-sme-attr.ll |   4 +-
 .../AArch64/sme-callee-save-restore-pairs.ll  |   4 +-
 .../AArch64/sme-disable-gisel-fisel.ll        |   2 +-
 .../CodeGen/AArch64/sme-lazy-save-call.ll     |   4 +-
 .../test/CodeGen/AArch64/sme-peephole-opts.ll |  21 +--
 ...ing-body-streaming-compatible-interface.ll |   9 +-
 .../sme-streaming-compatible-interface.ll     |  27 ++--
 llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll  |  11 +-
 .../CodeGen/AArch64/spill-reload-remarks.ll   |   3 +-
 llvm/test/CodeGen/AArch64/stack-hazard.ll     | 124 +++++++++---------
 .../streaming-compatible-memory-ops.ll        |   7 +-
 .../CodeGen/AArch64/sve-stack-frame-layout.ll |   4 +-
 19 files changed, 200 insertions(+), 176 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 8c8daef6dccd4..763b3868a99ca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -1178,7 +1178,9 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
   if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
     for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
       if (F->getOpcode() == ISD::CopyFromReg) {
-        UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+        Register Reg = cast<RegisterSDNode>(F->getOperand(1))->getReg();
+        if (Reg.isPhysical())
+          UsedRegs.push_back(Reg);
         continue;
       } else if (F->getOpcode() == ISD::CopyToReg) {
         // Skip CopyToReg nodes that are internal to the glue chain.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 43b9743639ab2..2fe3278bb1846 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3101,6 +3101,32 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitCallerIsStreaming(MachineInstr &MI,
+                                             MachineBasicBlock *BB) const {
+  MachineFunction *MF = BB->getParent();
+  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  if (FuncInfo->IsPStateSMRegUsed()) {
+    const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
+        .addExternalSymbol("__arm_sme_state")
+        .addReg(AArch64::X0, RegState::ImplicitDefine)
+        .addRegMask(TRI->getCallPreservedMask(
+            *MF, CallingConv::
+                     AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2));
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+            MI.getOperand(0).getReg())
+        .addReg(AArch64::X0);
+  } else {
+    BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+            MI.getOperand(0).getReg())
+        .addReg(AArch64::XZR);
+  }
+  BB->remove_instr(&MI);
+  return BB;
+}
+
 // Helper function to find the instruction that defined a virtual register.
 // If unable to find such instruction, returns nullptr.
 static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI,
@@ -3216,6 +3242,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     return EmitAllocateSMESaveBuffer(MI, BB);
   case AArch64::GetSMESaveSize:
     return EmitGetSMESaveSize(MI, BB);
+  case AArch64::CallerIsStreaming:
+    return EmitCallerIsStreaming(MI, BB);
   case AArch64::F128CSEL:
     return EmitF128CSEL(MI, BB);
   case TargetOpcode::STATEPOINT:
@@ -8132,19 +8160,26 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   }
   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
+  if (Attrs.hasStreamingCompatibleInterface()) {
+    SDValue CallerIsStreaming =
+        DAG.getNode(AArch64ISD::CALLER_IS_STREAMING, DL,
+                    DAG.getVTList(MVT::i64, MVT::Other), {Chain});
+
+    // Copy the value to a virtual register, and save that in FuncInfo.
+    Register CallerIsStreamingReg =
+        MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+    Chain = DAG.getCopyToReg(CallerIsStreaming.getValue(1), DL,
+                             CallerIsStreamingReg, CallerIsStreaming);
+    FuncInfo->setPStateSMReg(CallerIsStreamingReg);
+  }
+
   // Insert the SMSTART if this is a locally streaming function and
   // make sure it is Glued to the last CopyFromReg value.
   if (IsLocallyStreaming) {
-    SDValue PStateSM;
-    if (Attrs.hasStreamingCompatibleInterface()) {
-      PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
-      Register Reg = MF.getRegInfo().createVirtualRegister(
-          getRegClassFor(PStateSM.getValueType().getSimpleVT()));
-      FuncInfo->setPStateSMReg(Reg);
-      Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
+    if (Attrs.hasStreamingCompatibleInterface())
       Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
-                                  AArch64SME::IfCallerIsNonStreaming, PStateSM);
-    } else
+                                  AArch64SME::IfCallerIsNonStreaming);
+    else
       Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
                                   AArch64SME::Always);
 
@@ -8834,8 +8869,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
                                                    bool Enable, SDValue Chain,
                                                    SDValue InGlue,
-                                                   unsigned Condition,
-                                                   SDValue PStateSM) const {
+                                                   unsigned Condition) const {
   MachineFunction &MF = DAG.getMachineFunction();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   FuncInfo->setHasStreamingModeChanges(true);
@@ -8847,9 +8881,16 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
   SmallVector<SDValue> Ops = {Chain, MSROp};
   unsigned Opcode;
   if (Condition != AArch64SME::Always) {
+    FuncInfo->setPStateSMRegUsed(true);
+    Register PStateReg = FuncInfo->getPStateSMReg();
+    assert(PStateReg.isValid() && "PStateSM Register is invalid");
+    SDValue PStateSM =
+        DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
+    // Use chain and glue from the CopyFromReg.
+    Ops[0] = PStateSM.getValue(1);
+    InGlue = PStateSM.getValue(2);
     SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
     Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
-    assert(PStateSM && "PStateSM should be defined");
     Ops.push_back(ConditionOp);
     Ops.push_back(PStateSM);
   } else {
@@ -9124,15 +9165,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     /*IsSave=*/true);
   }
 
-  SDValue PStateSM;
   bool RequiresSMChange = CallAttrs.requiresSMChange();
   if (RequiresSMChange) {
-    if (CallAttrs.caller().hasStreamingInterfaceOrBody())
-      PStateSM = DAG.getConstant(1, DL, MVT::i64);
-    else if (CallAttrs.caller().hasNonStreamingInterface())
-      PStateSM = DAG.getConstant(0, DL, MVT::i64);
-    else
-      PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
     OptimizationRemarkEmitter ORE(&MF.getFunction());
     ORE.emit([&]() {
       auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
@@ -9447,9 +9481,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       InGlue = Chain.getValue(1);
     }
 
-    SDValue NewChain = changeStreamingMode(
-        DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
-        getSMToggleCondition(CallAttrs), PStateSM);
+    SDValue NewChain =
+        changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
+                            Chain, InGlue, getSMToggleCondition(CallAttrs));
     Chain = NewChain.getValue(0);
     InGlue = NewChain.getValue(1);
   }
@@ -9633,10 +9667,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     InGlue = Result.getValue(Result->getNumValues() - 1);
 
   if (RequiresSMChange) {
-    assert(PStateSM && "Expected a PStateSM to be set");
     Result = changeStreamingMode(
         DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
-        getSMToggleCondition(CallAttrs), PStateSM);
+        getSMToggleCondition(CallAttrs));
 
     if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
       InGlue = Result.getValue(1);
@@ -9802,14 +9835,11 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   // Emit SMSTOP before returning from a locally streaming function
   SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
   if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
-    if (FuncAttrs.hasStreamingCompatibleInterface()) {
-      Register Reg = FuncInfo->getPStateSMReg();
-      assert(Reg.isValid() && "PStateSM Register is invalid");
-      SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
+    if (FuncAttrs.hasStreamingCompatibleInterface())
       Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
                                   /*Glue*/ SDValue(),
-                                  AArch64SME::IfCallerIsNonStreaming, PStateSM);
-    } else
+                                  AArch64SME::IfCallerIsNonStreaming);
+    else
       Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
                                   /*Glue*/ SDValue(), AArch64SME::Always);
     Glue = Chain.getValue(1);
@@ -28171,6 +28201,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
     case Intrinsic::aarch64_sme_in_streaming_mode: {
       SDLoc DL(N);
       SDValue Chain = DAG.getEntryNode();
+
       SDValue RuntimePStateSM =
           getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
       Results.push_back(
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 88876570ac811..cc23f7e0bdfcd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -181,6 +181,8 @@ class AArch64TargetLowering : public TargetLowering {
                                                MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitCallerIsStreaming(MachineInstr &MI,
+                                           MachineBasicBlock *BB) const;
 
   /// Replace (0, vreg) discriminator components with the operands of blend
   /// or with (immediate, NoRegister) when possible.
@@ -523,8 +525,8 @@ class AArch64TargetLowering : public TargetLowering {
   /// node. \p Condition should be one of the enum values from
   /// AArch64SME::ToggleCondition.
   SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
-                              SDValue Chain, SDValue InGlue, unsigned Condition,
-                              SDValue PStateSM = SDValue()) const;
+                              SDValue Chain, SDValue InGlue,
+                              unsigned Condition) const;
 
   bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
 
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 800787cc0b4f5..cb9fdb7606329 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -231,6 +231,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   // on function entry to record the initial pstate of a function.
   Register PStateSMReg = MCRegister::NoRegister;
 
+  // true if PStateSMReg is used.
+  bool PStateSMRegUsed = false;
+
   // Holds a pointer to a buffer that is large enough to represent
   // all SME ZA state and any additional state required by the
   // __arm_sme_save/restore support routines.
@@ -274,6 +277,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   Register getPStateSMReg() const { return PStateSMReg; };
   void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
 
+  unsigned IsPStateSMRegUsed() const { return PStateSMRegUsed; };
+  void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; };
+
   int64_t getVGIdx() const { return VGIdx; };
   void setVGIdx(unsigned Idx) { VGIdx = Idx; };
 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index db27ca978980f..7b5f45e96a942 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -39,6 +39,15 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
 def AArch64CoalescerBarrier
     : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
 
+def AArch64CallerIsStreaming
+  : SDNode<"AArch64ISD::CALLER_IS_STREAMING", SDTypeProfile<1, 0,
+    [SDTCisInt<0>]>, [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in {
+  def CallerIsStreaming : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {}
+}
+def : Pat<(i64 (AArch64CallerIsStreaming)), (CallerIsStreaming)>;
+
 def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
                            [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
 
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index bd28716118880..564af6708e1ed 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -80,16 +80,10 @@ static bool isMatchingStartStopPair(const MachineInstr *MI1,
   if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask())
     return false;
 
-  // This optimisation is unlikely to happen in practice for conditional
-  // smstart/smstop pairs as the virtual registers for pstate.sm will always
-  // be different.
-  // TODO: For this optimisation to apply to conditional smstart/smstop,
-  // this pass will need to do more work to remove redundant calls to
-  // __arm_sme_state.
-
   // Only consider conditional start/stop pairs which read the same register
-  // holding the original value of pstate.sm, as some conditional start/stops
-  // require the state on entry to the function.
+  // holding the original value of pstate.sm. This is somewhat over conservative
+  // as all conditional streaming mode changes only look at the state on entry
+  // to the function.
   if (MI1->getOperand(3).isReg() && MI2->getOperand(3).isReg()) {
     Register Reg1 = MI1->getOperand(3).getReg();
     Register Reg2 = MI2->getOperand(3).getReg();
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 1f68815411097..ba40ccd1c7406 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -150,42 +150,40 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK-NEXT:    add x29, sp, #64
 ; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    bl __arm_sme_state_size
 ; CHECK-NEXT:    sub sp, sp, x0
-; CHECK-NEXT:    mov x19, sp
-; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x20, sp
+; CHECK-NEXT:    mov x0, x20
 ; CHECK-NEXT:    bl __arm_sme_save
-; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x20, x0, #0x1
-; CHECK-NEXT:    tbz w20, #0, .LBB5_2
+; CHECK-NEXT:    tbz w19, #0, .LBB5_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB5_2:
 ; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    bl private_za_decl
-; CHECK-NEXT:    mov x2, x0
-; CHECK-NEXT:    tbz w20, #0, .LBB5_4
+; CHECK-NEXT:    mov x1, x0
+; CHECK-NEXT:    tbz w19, #0, .LBB5_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB5_4:
-; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x0, x20
 ; CHECK-NEXT:    bl __arm_sme_restore
-; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x0, x20
 ; CHECK-NEXT:    bl __arm_sme_save
-; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x20, x0, #0x1
-; CHECK-NEXT:    tbz w20, #0, .LBB5_6
+; CHECK-NEXT:    tbz w19, #0, .LBB5_6
 ; CHECK-NEXT:  // %bb.5:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB5_6:
-; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    mov x0, x1
 ; CHECK-NEXT:    bl private_za_decl
 ; CHECK-NEXT:    mov x1, x0
-; CHECK-NEXT:    tbz w20, #0, .LBB5_8
+; CHECK-NEXT:    tbz w19, #0, .LBB5_8
 ; CHECK-NEXT:  // %bb.7:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB5_8:
-; CHECK-NEXT:    mov x0, x19
+; CHECK-NEXT:    mov x0, x20
 ; CHECK-NEXT:    bl __arm_sme_restore
 ; CHECK-NEXT:    mov x0, x1
 ; CHECK-NEXT:    sub sp, x29, #64
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index c4440e7bcc3ff..1567ca258cccb 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -18,7 +18,7 @@ define void @streaming_compatible() #0 {
 ; CHECK-NEXT:    bl __arm_get_current_vg
 ; CHECK-NEXT:    stp x0, x19, [sp, #72] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB0_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
@@ -57,7 +57,7 @@ define void @streaming_compatible_arg(float %f) #0 {
 ; CHECK-NEXT:    stp x0, x19, [sp, #88] // 16-byte Folded Spill
 ; CHECK-NEXT:    str s0, [sp, #12] // 4-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB1_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index 980144d6ca584..1933eb85b77f2 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -44,7 +44,7 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; NOPAIR-NEXT:    addvl sp, sp, #-1
 ; NOPAIR-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; NOPAIR-NEXT:    bl __arm_sme_state
-; NOPAIR-NEXT:    and x19, x0, #0x1
+; NOPAIR-NEXT:    mov x19, x0
 ; NOPAIR-NEXT:    tbz w19, #0, .LBB0_2
 ; NOPAIR-NEXT:  // %bb.1:
 ; NOPAIR-NEXT:    smstop sm
@@ -126,7 +126,7 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
 ; PAIR-NEXT:    addvl sp, sp, #-1
 ; PAIR-NEXT:    str z0, [sp] // 16-byte Folded Spill
 ; PAIR-NEXT:    bl __arm_sme_state
-; PAIR-NEXT:    and x19, x0, #0x1
+; PAIR-NEXT:    mov x19, x0
 ; PAIR-NEXT:    tbz w19, #0, .LBB0_2
 ; PAIR-NEXT:  // %bb.1:
 ; PAIR-NEXT:    smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 4a52bf27a7591..759f3ee609e58 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -441,7 +441,7 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
 ; CHECK-COMMON-NEXT:    str x19, [sp, #96] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    bl __arm_sme_state
-; CHECK-COMMON-NEXT:    and x19, x0, #0x1
+; CHECK-COMMON-NEXT:    mov x19, x0
 ; CHECK-COMMON-NEXT:    tbz w19, #0, .LBB12_2
 ; CHECK-COMMON-NEXT:  // %bb.1:
 ; CHECK-COMMON-NEXT:    smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index e463e833bdbde..3f5e7e9f32a47 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -137,8 +137,10 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    mov x20, x0
 ; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stur x9, [x29, #-80]
@@ -147,8 +149,6 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    stur wzr, [x29, #-68]
 ; CHECK-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x9
-; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x20, x0, #0x1
 ; CHECK-NEXT:    tbz w20, #0, .LBB3_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 130a316bcc2ba..ff5b7c047eaf5 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -57,7 +57,6 @@ define void @test1() nounwind "aarch64_pstate_sm_enabled" {
 }
 
 ; streaming-compatible caller -> normal callees
-; these conditional smstart/smstop are not yet optimized away.
 define void @test2() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-LABEL: test2:
 ; CHECK:       // %bb.0:
@@ -69,27 +68,17 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB2_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    tbz w19, #0, .LBB2_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB2_4:
-; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
-; CHECK-NEXT:    tbz w19, #0, .LBB2_6
-; CHECK-NEXT:  // %bb.5:
-; CHECK-NEXT:    smstop sm
-; CHECK-NEXT:  .LBB2_6:
-; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    tbz w19, #0, .LBB2_8
-; CHECK-NEXT:  // %bb.7:
-; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:  .LBB2_8:
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
@@ -114,7 +103,7 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB3_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstart sm
@@ -124,8 +113,6 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB3_4:
-; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB3_6
 ; CHECK-NEXT:  // %bb.5:
 ; CHECK-NEXT:    smstop sm
@@ -135,8 +122,6 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
 ; CHECK-NEXT:  // %bb.7:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB3_8:
-; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbnz w19, #0, .LBB3_10
 ; CHECK-NEXT:  // %bb.9:
 ; CHECK-NEXT:    smstart sm
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index 1a49da84c00ce..52078941aa745 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -18,12 +18,11 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    str x9, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x8, x0, #0x1
-; CHECK-NEXT:    tbnz w8, #0, .LBB0_2
+; CHECK-NEXT:    tbnz w0, #0, .LBB0_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB0_2:
-; CHECK-NEXT:    tbnz w8, #0, .LBB0_4
+; CHECK-NEXT:    tbnz w0, #0, .LBB0_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB0_4:
@@ -50,7 +49,7 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB1_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstart sm
@@ -88,7 +87,7 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp x9, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB2_2
 ; CHECK-NEXT:  // %bb.1: // %entry
 ; CHECK-NEXT:    smstart sm
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index e967f3b7be5e8..636c3ece9d411 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -44,7 +44,7 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
 ; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB1_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
@@ -83,7 +83,7 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
 ; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB2_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstart sm
@@ -143,10 +143,10 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
 ; CHECK-NEXT:    str z0, [x8] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
 ; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    ldr z0, [x8] // 16-byte Folded Reload
 ; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    tbz w19, #0, .LBB4_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
@@ -218,7 +218,7 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
 ; CHECK-NEXT:    addvl sp, sp, #-2
 ; CHECK-NEXT:    str z0, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB5_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
@@ -311,7 +311,7 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
 ; CHECK-NEXT:    addvl sp, sp, #-1
 ; CHECK-NEXT:    str p0, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB6_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
@@ -377,7 +377,7 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
 ; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbnz w19, #0, .LBB7_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstart sm
@@ -394,33 +394,34 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
 define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-LABEL: conditional_smstart_no_successor_block:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    tbz w0, #0, .LBB8_6
-; CHECK-NEXT:  // %bb.1: // %if.then
 ; CHECK-NEXT:    stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
 ; CHECK-NEXT:    cntd x9
 ; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w8, w0
 ; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
-; CHECK-NEXT:    tbnz w19, #0, .LBB8_3
+; CHECK-NEXT:    tbz w8, #0, .LBB8_6
+; CHECK-NEXT:  // %bb.1: // %if.then
+; CHECK-NEXT:    tbnz w0, #0, .LBB8_3
 ; CHECK-NEXT:  // %bb.2: // %if.then
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB8_3: // %if.then
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    bl streaming_callee
 ; CHECK-NEXT:    tbnz w19, #0, .LBB8_5
 ; CHECK-NEXT:  // %bb.4: // %if.then
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB8_5: // %if.then
+; CHECK-NEXT:  .LBB8_6: // %exit
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-NEXT:  .LBB8_6: // %exit
 ; CHECK-NEXT:    ret
   br i1 %p, label %if.then, label %exit
 
@@ -443,7 +444,7 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    tbz w19, #0, .LBB9_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
@@ -492,7 +493,7 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
 ; CHECK-NEXT:    mov x9, x0
 ; CHECK-NEXT:    stp s0, s1, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    tbz w19, #0, .LBB10_2
 ; CHECK-NEXT:  // %bb.1: // %entry
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 6fcfc5b242c11..60d8987334c89 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=NO-SVE-CHECK
@@ -780,7 +781,7 @@ define void @streaming_compatible_to_streaming() #4 {
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    tbnz w19, #0, .LBB6_2
 ; CHECK-NEXT:  // %bb.1:
@@ -835,7 +836,7 @@ define void @streaming_compatible_to_streaming() #4 {
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
 ; FP-CHECK-NEXT:    bl __arm_sme_state
-; FP-CHECK-NEXT:    and x19, x0, #0x1
+; FP-CHECK-NEXT:    mov x19, x0
 ; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    tbnz w19, #0, .LBB6_2
 ; FP-CHECK-NEXT:  // %bb.1:
@@ -897,7 +898,7 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; CHECK-NEXT:    .cfi_offset b14, -88
 ; CHECK-NEXT:    .cfi_offset b15, -96
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    .cfi_offset vg, -24
 ; CHECK-NEXT:    tbz w19, #0, .LBB7_2
 ; CHECK-NEXT:  // %bb.1:
@@ -952,7 +953,7 @@ define void @streaming_compatible_to_non_streaming() #4 {
 ; FP-CHECK-NEXT:    .cfi_offset b14, -88
 ; FP-CHECK-NEXT:    .cfi_offset b15, -96
 ; FP-CHECK-NEXT:    bl __arm_sme_state
-; FP-CHECK-NEXT:    and x19, x0, #0x1
+; FP-CHECK-NEXT:    mov x19, x0
 ; FP-CHECK-NEXT:    .cfi_offset vg, -16
 ; FP-CHECK-NEXT:    tbz w19, #0, .LBB7_2
 ; FP-CHECK-NEXT:  // %bb.1:
@@ -1025,7 +1026,7 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
 ; NO-SVE-CHECK-NEXT:    .cfi_offset b15, -96
 ; NO-SVE-CHECK-NEXT:    mov w8, w0
 ; NO-SVE-CHECK-NEXT:    bl __arm_sme_state
-; NO-SVE-CHECK-NEXT:    and x19, x0, #0x1
+; NO-SVE-CHECK-NEXT:    mov x19, x0
 ; NO-SVE-CHECK-NEXT:    .cfi_offset vg, -16
 ; NO-SVE-CHECK-NEXT:    tbnz w19, #0, .LBB8_2
 ; NO-SVE-CHECK-NEXT:  // %bb.1:
diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
index 47906252382f4..6c248048e682f 100644
--- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
@@ -2,8 +2,7 @@
 
 ; We should have both spill and reload for %arg.
 
-; CHECK: remark: <unknown>:0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost generated in function
-
+; CHECK: remark: <unknown>:0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost 1 virtual registers copies 1.000000e+00 total copies cost generated in function
 define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind #0 {
   %res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
   %and = and <vscale x 2 x i1> %res, %arg
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 4615b1a6a9b2e..0a7d42d5d3911 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -1598,7 +1598,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK0-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
 ; CHECK0-NEXT:    stp q2, q3, [sp, #32] // 32-byte Folded Spill
 ; CHECK0-NEXT:    bl __arm_sme_state
-; CHECK0-NEXT:    and x21, x0, #0x1
+; CHECK0-NEXT:    mov x21, x0
 ; CHECK0-NEXT:    .cfi_offset vg, -40
 ; CHECK0-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK0-NEXT:  // %bb.1:
@@ -1612,23 +1612,21 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK0-NEXT:  .LBB27_4:
 ; CHECK0-NEXT:    cmp w0, #0
 ; CHECK0-NEXT:    .cfi_restore vg
-; CHECK0-NEXT:    cset w21, lt
-; CHECK0-NEXT:    bl __arm_sme_state
-; CHECK0-NEXT:    and x22, x0, #0x1
+; CHECK0-NEXT:    cset w22, lt
 ; CHECK0-NEXT:    .cfi_offset vg, -40
-; CHECK0-NEXT:    tbz w22, #0, .LBB27_6
+; CHECK0-NEXT:    tbz w21, #0, .LBB27_6
 ; CHECK0-NEXT:  // %bb.5:
 ; CHECK0-NEXT:    smstop sm
 ; CHECK0-NEXT:  .LBB27_6:
 ; CHECK0-NEXT:    ldp q0, q1, [sp, #32] // 32-byte Folded Reload
 ; CHECK0-NEXT:    bl __getf2
-; CHECK0-NEXT:    tbz w22, #0, .LBB27_8
+; CHECK0-NEXT:    tbz w21, #0, .LBB27_8
 ; CHECK0-NEXT:  // %bb.7:
 ; CHECK0-NEXT:    smstart sm
 ; CHECK0-NEXT:  .LBB27_8:
 ; CHECK0-NEXT:    cmp w0, #0
 ; CHECK0-NEXT:    cset w8, ge
-; CHECK0-NEXT:    tst w8, w21
+; CHECK0-NEXT:    tst w8, w22
 ; CHECK0-NEXT:    csel w0, w20, w19, ne
 ; CHECK0-NEXT:    .cfi_restore vg
 ; CHECK0-NEXT:    ldp x20, x19, [sp, #160] // 16-byte Folded Reload
@@ -1687,7 +1685,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:    stp q0, q1, [sp, #64] // 32-byte Folded Spill
 ; CHECK64-NEXT:    stp q2, q3, [sp, #96] // 32-byte Folded Spill
 ; CHECK64-NEXT:    bl __arm_sme_state
-; CHECK64-NEXT:    and x21, x0, #0x1
+; CHECK64-NEXT:    mov x21, x0
 ; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK64-NEXT:  // %bb.1:
@@ -1701,23 +1699,21 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK64-NEXT:  .LBB27_4:
 ; CHECK64-NEXT:    cmp w0, #0
 ; CHECK64-NEXT:    .cfi_restore vg
-; CHECK64-NEXT:    cset w21, lt
-; CHECK64-NEXT:    bl __arm_sme_state
-; CHECK64-NEXT:    and x22, x0, #0x1
+; CHECK64-NEXT:    cset w22, lt
 ; CHECK64-NEXT:    .cfi_offset vg, -48
-; CHECK64-NEXT:    tbz w22, #0, .LBB27_6
+; CHECK64-NEXT:    tbz w21, #0, .LBB27_6
 ; CHECK64-NEXT:  // %bb.5:
 ; CHECK64-NEXT:    smstop sm
 ; CHECK64-NEXT:  .LBB27_6:
 ; CHECK64-NEXT:    ldp q0, q1, [sp, #96] // 32-byte Folded Reload
 ; CHECK64-NEXT:    bl __getf2
-; CHECK64-NEXT:    tbz w22, #0, .LBB27_8
+; CHECK64-NEXT:    tbz w21, #0, .LBB27_8
 ; CHECK64-NEXT:  // %bb.7:
 ; CHECK64-NEXT:    smstart sm
 ; CHECK64-NEXT:  .LBB27_8:
 ; CHECK64-NEXT:    cmp w0, #0
 ; CHECK64-NEXT:    cset w8, ge
-; CHECK64-NEXT:    tst w8, w21
+; CHECK64-NEXT:    tst w8, w22
 ; CHECK64-NEXT:    csel w0, w20, w19, ne
 ; CHECK64-NEXT:    .cfi_restore vg
 ; CHECK64-NEXT:    ldp x20, x19, [sp, #296] // 16-byte Folded Reload
@@ -1784,7 +1780,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:    str q1, [sp, #1040] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    str q0, [sp, #1024] // 16-byte Folded Spill
 ; CHECK1024-NEXT:    bl __arm_sme_state
-; CHECK1024-NEXT:    and x21, x0, #0x1
+; CHECK1024-NEXT:    mov x21, x0
 ; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    tbz w21, #0, .LBB27_2
 ; CHECK1024-NEXT:  // %bb.1:
@@ -1799,24 +1795,22 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
 ; CHECK1024-NEXT:  .LBB27_4:
 ; CHECK1024-NEXT:    cmp w0, #0
 ; CHECK1024-NEXT:    .cfi_restore vg
-; CHECK1024-NEXT:    cset w21, lt
-; CHECK1024-NEXT:    bl __arm_sme_state
-; CHECK1024-NEXT:    and x22, x0, #0x1
+; CHECK1024-NEXT:    cset w22, lt
 ; CHECK1024-NEXT:    .cfi_offset vg, -48
-; CHECK1024-NEXT:    tbz w22, #0, .LBB27_6
+; CHECK1024-NEXT:    tbz w21, #0, .LBB27_6
 ; CHECK1024-NEXT:  // %bb.5:
 ; CHECK1024-NEXT:    smstop sm
 ; CHECK1024-NEXT:  .LBB27_6:
 ; CHECK1024-NEXT:    ldr q0, [sp, #1056] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    ldr q1, [sp, #1072] // 16-byte Folded Reload
 ; CHECK1024-NEXT:    bl __getf2
-; CHECK1024-NEXT:    tbz w22, #0, .LBB27_8
+; CHECK1024-NEXT:    tbz w21, #0, .LBB27_8
 ; CHECK1024-NEXT:  // %bb.7:
 ; CHECK1024-NEXT:    smstart sm
 ; CHECK1024-NEXT:  .LBB27_8:
 ; CHECK1024-NEXT:    cmp w0, #0
 ; CHECK1024-NEXT:    cset w8, ge
-; CHECK1024-NEXT:    tst w8, w21
+; CHECK1024-NEXT:    tst w8, w22
 ; CHECK1024-NEXT:    csel w0, w20, w19, ne
 ; CHECK1024-NEXT:    .cfi_restore vg
 ; CHECK1024-NEXT:    add sp, sp, #1088
@@ -1907,10 +1901,10 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
 ; CHECK0-NEXT:    mov x8, x0
+; CHECK0-NEXT:    bl __arm_sme_state
+; CHECK0-NEXT:    mov x19, x0
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
-; CHECK0-NEXT:    bl __arm_sme_state
-; CHECK0-NEXT:    and x19, x0, #0x1
 ; CHECK0-NEXT:    .cfi_offset vg, -32
 ; CHECK0-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
@@ -2030,10 +2024,10 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK64-NEXT:    sub sp, sp, #64
 ; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x01, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 176 + 144 * VG
 ; CHECK64-NEXT:    mov x8, x0
+; CHECK64-NEXT:    bl __arm_sme_state
+; CHECK64-NEXT:    mov x19, x0
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    bl __arm_sme_state
-; CHECK64-NEXT:    and x19, x0, #0x1
 ; CHECK64-NEXT:    .cfi_offset vg, -32
 ; CHECK64-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
@@ -2159,10 +2153,10 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK1024-NEXT:    sub sp, sp, #1024
 ; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 2096 + 144 * VG
 ; CHECK1024-NEXT:    mov x8, x0
+; CHECK1024-NEXT:    bl __arm_sme_state
+; CHECK1024-NEXT:    mov x19, x0
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    bl __arm_sme_state
-; CHECK1024-NEXT:    and x19, x0, #0x1
 ; CHECK1024-NEXT:    .cfi_offset vg, -32
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB28_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
@@ -2291,10 +2285,10 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
 ; CHECK0-NEXT:    sub sp, sp, #48
 ; CHECK0-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 96 + 144 * VG
+; CHECK0-NEXT:    bl __arm_sme_state
+; CHECK0-NEXT:    mov x19, x0
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
-; CHECK0-NEXT:    bl __arm_sme_state
-; CHECK0-NEXT:    and x19, x0, #0x1
 ; CHECK0-NEXT:    .cfi_offset vg, -32
 ; CHECK0-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
@@ -2415,10 +2409,10 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 112
 ; CHECK64-NEXT:    sub sp, sp, #112
 ; CHECK64-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x01, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 224 + 144 * VG
+; CHECK64-NEXT:    bl __arm_sme_state
+; CHECK64-NEXT:    mov x19, x0
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    bl __arm_sme_state
-; CHECK64-NEXT:    and x19, x0, #0x1
 ; CHECK64-NEXT:    .cfi_offset vg, -32
 ; CHECK64-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
@@ -2543,10 +2537,10 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1072
 ; CHECK1024-NEXT:    sub sp, sp, #1072
 ; CHECK1024-NEXT:    .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 2144 + 144 * VG
+; CHECK1024-NEXT:    bl __arm_sme_state
+; CHECK1024-NEXT:    mov x19, x0
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    bl __arm_sme_state
-; CHECK1024-NEXT:    and x19, x0, #0x1
 ; CHECK1024-NEXT:    .cfi_offset vg, -32
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB29_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
@@ -2616,6 +2610,7 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
 ; CHECK1024-NEXT:    .cfi_restore w30
 ; CHECK1024-NEXT:    .cfi_restore w29
 ; CHECK1024-NEXT:    ret
+
 entry:
   tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
   %0 = alloca [37 x i8], align 16
@@ -3200,18 +3195,19 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * VG - 64
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * VG - 64
 ; CHECK0-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * VG - 64
-; CHECK0-NEXT:    mov w9, w0
-; CHECK0-NEXT:    mov x8, sp
-; CHECK0-NEXT:    mov w2, w1
-; CHECK0-NEXT:    add x9, x9, #15
 ; CHECK0-NEXT:    mov x19, sp
-; CHECK0-NEXT:    and x9, x9, #0x1fffffff0
-; CHECK0-NEXT:    sub x8, x8, x9
+; CHECK0-NEXT:    mov w2, w1
+; CHECK0-NEXT:    mov w8, w0
+; CHECK0-NEXT:    bl __arm_sme_state
+; CHECK0-NEXT:    mov w8, w8
+; CHECK0-NEXT:    mov x9, sp
+; CHECK0-NEXT:    mov x20, x0
+; CHECK0-NEXT:    add x8, x8, #15
+; CHECK0-NEXT:    and x8, x8, #0x1fffffff0
+; CHECK0-NEXT:    sub x8, x9, x8
 ; CHECK0-NEXT:    mov sp, x8
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
-; CHECK0-NEXT:    bl __arm_sme_state
-; CHECK0-NEXT:    and x20, x0, #0x1
 ; CHECK0-NEXT:    .cfi_offset vg, -48
 ; CHECK0-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
@@ -3336,18 +3332,19 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK64-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128
 ; CHECK64-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128
 ; CHECK64-NEXT:    sub sp, sp, #64
-; CHECK64-NEXT:    mov w9, w0
-; CHECK64-NEXT:    mov x8, sp
-; CHECK64-NEXT:    mov w2, w1
-; CHECK64-NEXT:    add x9, x9, #15
 ; CHECK64-NEXT:    mov x19, sp
-; CHECK64-NEXT:    and x9, x9, #0x1fffffff0
-; CHECK64-NEXT:    sub x8, x8, x9
+; CHECK64-NEXT:    mov w2, w1
+; CHECK64-NEXT:    mov w8, w0
+; CHECK64-NEXT:    bl __arm_sme_state
+; CHECK64-NEXT:    mov w8, w8
+; CHECK64-NEXT:    mov x9, sp
+; CHECK64-NEXT:    mov x20, x0
+; CHECK64-NEXT:    add x8, x8, #15
+; CHECK64-NEXT:    and x8, x8, #0x1fffffff0
+; CHECK64-NEXT:    sub x8, x9, x8
 ; CHECK64-NEXT:    mov sp, x8
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    bl __arm_sme_state
-; CHECK64-NEXT:    and x20, x0, #0x1
 ; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
@@ -3478,18 +3475,19 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
 ; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088
 ; CHECK1024-NEXT:    .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088
 ; CHECK1024-NEXT:    sub sp, sp, #1024
-; CHECK1024-NEXT:    mov w9, w0
-; CHECK1024-NEXT:    mov x8, sp
-; CHECK1024-NEXT:    mov w2, w1
-; CHECK1024-NEXT:    add x9, x9, #15
 ; CHECK1024-NEXT:    mov x19, sp
-; CHECK1024-NEXT:    and x9, x9, #0x1fffffff0
-; CHECK1024-NEXT:    sub x8, x8, x9
+; CHECK1024-NEXT:    mov w2, w1
+; CHECK1024-NEXT:    mov w8, w0
+; CHECK1024-NEXT:    bl __arm_sme_state
+; CHECK1024-NEXT:    mov w8, w8
+; CHECK1024-NEXT:    mov x9, sp
+; CHECK1024-NEXT:    mov x20, x0
+; CHECK1024-NEXT:    add x8, x8, #15
+; CHECK1024-NEXT:    and x8, x8, #0x1fffffff0
+; CHECK1024-NEXT:    sub x8, x9, x8
 ; CHECK1024-NEXT:    mov sp, x8
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    bl __arm_sme_state
-; CHECK1024-NEXT:    and x20, x0, #0x1
 ; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    tbz w20, #0, .LBB35_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
@@ -3627,10 +3625,10 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK0-NEXT:    sub x9, sp, #1024
 ; CHECK0-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK0-NEXT:    mov w2, w1
+; CHECK0-NEXT:    bl __arm_sme_state
+; CHECK0-NEXT:    mov x19, x0
 ; CHECK0-NEXT:    //APP
 ; CHECK0-NEXT:    //NO_APP
-; CHECK0-NEXT:    bl __arm_sme_state
-; CHECK0-NEXT:    and x19, x0, #0x1
 ; CHECK0-NEXT:    .cfi_offset vg, -48
 ; CHECK0-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK0-NEXT:  // %bb.1: // %entry
@@ -3754,10 +3752,10 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK64-NEXT:    sub x9, sp, #1088
 ; CHECK64-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK64-NEXT:    mov w2, w1
+; CHECK64-NEXT:    bl __arm_sme_state
+; CHECK64-NEXT:    mov x19, x0
 ; CHECK64-NEXT:    //APP
 ; CHECK64-NEXT:    //NO_APP
-; CHECK64-NEXT:    bl __arm_sme_state
-; CHECK64-NEXT:    and x19, x0, #0x1
 ; CHECK64-NEXT:    .cfi_offset vg, -48
 ; CHECK64-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK64-NEXT:  // %bb.1: // %entry
@@ -3886,10 +3884,10 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
 ; CHECK1024-NEXT:    sub x9, sp, #2048
 ; CHECK1024-NEXT:    and sp, x9, #0xffffffffffffffe0
 ; CHECK1024-NEXT:    mov w2, w1
+; CHECK1024-NEXT:    bl __arm_sme_state
+; CHECK1024-NEXT:    mov x19, x0
 ; CHECK1024-NEXT:    //APP
 ; CHECK1024-NEXT:    //NO_APP
-; CHECK1024-NEXT:    bl __arm_sme_state
-; CHECK1024-NEXT:    and x19, x0, #0x1
 ; CHECK1024-NEXT:    .cfi_offset vg, -48
 ; CHECK1024-NEXT:    tbz w19, #0, .LBB36_2
 ; CHECK1024-NEXT:  // %bb.1: // %entry
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index f1e684c86e896..0650b8271d374 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -180,16 +180,15 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
 ; CHECK-NO-SME-ROUTINES-NEXT:    stp x30, x9, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
 ; CHECK-NO-SME-ROUTINES-NEXT:    bl __arm_sme_state
-; CHECK-NO-SME-ROUTINES-NEXT:    adrp x8, :got:dst
-; CHECK-NO-SME-ROUTINES-NEXT:    and x19, x0, #0x1
+; CHECK-NO-SME-ROUTINES-NEXT:    mov x19, x0
+; CHECK-NO-SME-ROUTINES-NEXT:    adrp x0, :got:dst
 ; CHECK-NO-SME-ROUTINES-NEXT:    adrp x1, :got:src
-; CHECK-NO-SME-ROUTINES-NEXT:    ldr x8, [x8, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT:    ldr x0, [x0, :got_lo12:dst]
 ; CHECK-NO-SME-ROUTINES-NEXT:    ldr x1, [x1, :got_lo12:src]
 ; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_2
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.1: // %entry
 ; CHECK-NO-SME-ROUTINES-NEXT:    smstop sm
 ; CHECK-NO-SME-ROUTINES-NEXT:  .LBB3_2: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT:    mov x0, x8
 ; CHECK-NO-SME-ROUTINES-NEXT:    bl memcpy
 ; CHECK-NO-SME-ROUTINES-NEXT:    tbz w19, #0, .LBB3_4
 ; CHECK-NO-SME-ROUTINES-NEXT:  // %bb.3: // %entry
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index e0da9b57c6556..d440535f022c4 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -421,10 +421,10 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48
 ; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
 ; CHECK-NEXT:    mov x8, x0
+; CHECK-NEXT:    bl __arm_sme_state
+; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
-; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
 ; CHECK-NEXT:    .cfi_offset vg, -32
 ; CHECK-NEXT:    tbz w19, #0, .LBB7_2
 ; CHECK-NEXT:  // %bb.1: // %entry

>From 86bc038f37d7d3b16ef61e33619e87f373774505 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 6 Aug 2025 14:30:02 +0000
Subject: [PATCH 2/2] Rename CALLER_IS_STREAMING -> ENTRY_PSTATE_SM

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 20 +++++++++----------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  4 ++--
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |  8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2fe3278bb1846..3b19956f7650e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3102,8 +3102,8 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
 }
 
 MachineBasicBlock *
-AArch64TargetLowering::EmitCallerIsStreaming(MachineInstr &MI,
-                                             MachineBasicBlock *BB) const {
+AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI,
+                                         MachineBasicBlock *BB) const {
   MachineFunction *MF = BB->getParent();
   AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
@@ -3242,8 +3242,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     return EmitAllocateSMESaveBuffer(MI, BB);
   case AArch64::GetSMESaveSize:
     return EmitGetSMESaveSize(MI, BB);
-  case AArch64::CallerIsStreaming:
-    return EmitCallerIsStreaming(MI, BB);
+  case AArch64::EntryPStateSM:
+    return EmitEntryPStateSM(MI, BB);
   case AArch64::F128CSEL:
     return EmitF128CSEL(MI, BB);
   case TargetOpcode::STATEPOINT:
@@ -8161,16 +8161,16 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
   if (Attrs.hasStreamingCompatibleInterface()) {
-    SDValue CallerIsStreaming =
-        DAG.getNode(AArch64ISD::CALLER_IS_STREAMING, DL,
+    SDValue EntryPStateSM =
+        DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
                     DAG.getVTList(MVT::i64, MVT::Other), {Chain});
 
     // Copy the value to a virtual register, and save that in FuncInfo.
-    Register CallerIsStreamingReg =
+    Register EntryPStateSMReg =
         MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
-    Chain = DAG.getCopyToReg(CallerIsStreaming.getValue(1), DL,
-                             CallerIsStreamingReg, CallerIsStreaming);
-    FuncInfo->setPStateSMReg(CallerIsStreamingReg);
+    Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
+                             EntryPStateSM);
+    FuncInfo->setPStateSMReg(EntryPStateSMReg);
   }
 
   // Insert the SMSTART if this is a locally streaming function and
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index cc23f7e0bdfcd..02dfd9aa7b067 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -181,8 +181,8 @@ class AArch64TargetLowering : public TargetLowering {
                                                MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI,
                                         MachineBasicBlock *BB) const;
-  MachineBasicBlock *EmitCallerIsStreaming(MachineInstr &MI,
-                                           MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitEntryPStateSM(MachineInstr &MI,
+                                       MachineBasicBlock *BB) const;
 
   /// Replace (0, vreg) discriminator components with the operands of blend
   /// or with (immediate, NoRegister) when possible.
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 7b5f45e96a942..9c20087159d17 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -39,14 +39,14 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
 def AArch64CoalescerBarrier
     : SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
 
-def AArch64CallerIsStreaming
-  : SDNode<"AArch64ISD::CALLER_IS_STREAMING", SDTypeProfile<1, 0,
+def AArch64EntryPStateSM
+  : SDNode<"AArch64ISD::ENTRY_PSTATE_SM", SDTypeProfile<1, 0,
     [SDTCisInt<0>]>, [SDNPHasChain, SDNPSideEffect]>;
 
 let usesCustomInserter = 1 in {
-  def CallerIsStreaming : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {}
+  def EntryPStateSM : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {}
 }
-def : Pat<(i64 (AArch64CallerIsStreaming)), (CallerIsStreaming)>;
+def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>;
 
 def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
                            [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;



More information about the llvm-commits mailing list