[llvm] [AArch64][SME] Use entry pstate.sm for conditional streaming-mode changes (PR #152169)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 6 07:31:22 PDT 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/152169
From c5ef6d529be5b64d566ab3217dfae5eecd002e76 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 5 Aug 2025 15:54:29 +0000
Subject: [PATCH 1/2] [AArch64][SME] Use entry pstate.sm for conditional
streaming-mode changes
We only emit conditional streaming-mode changes in two cases:
- Around calls in streaming-compatible functions that don't have a streaming body
- At the entry/exit of streaming-compatible functions with a streaming body
In both cases, the condition depends on the entry pstate.sm value. Given
this, we don't need to emit calls to __arm_sme_state at every mode change.
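As a minimal standalone illustration (a model written for this note, not the
backend code itself), both cases reduce to testing bit 0 of the pstate.sm
value observed on entry, which is why a single __arm_sme_state result can
feed every conditional smstart/smstop in the function:

#include <cstdint>
#include <cstdio>

// Models the AArch64SME::ToggleCondition values used by the backend.
enum class ToggleCondition { Always, IfCallerIsStreaming, IfCallerIsNonStreaming };

// Bit 0 of __arm_sme_state's result (returned in X0) is the caller's
// pstate.sm; the "tbz wN, #0" sequences in the tests below test exactly this.
static bool needsModeChange(ToggleCondition Cond, uint64_t EntryPStateSM) {
  bool CallerIsStreaming = (EntryPStateSM & 1) != 0;
  switch (Cond) {
  case ToggleCondition::Always:
    return true;
  case ToggleCondition::IfCallerIsStreaming:
    return CallerIsStreaming;
  case ToggleCondition::IfCallerIsNonStreaming:
    return !CallerIsStreaming;
  }
  return false;
}

int main() {
  // Streaming-compatible caller -> non-streaming callee: smstop before the
  // call and smstart after it, both keyed off the same entry value.
  for (uint64_t EntrySM : {0u, 1u})
    std::printf("entry pstate.sm=%u -> %s\n", unsigned(EntrySM),
                needsModeChange(ToggleCondition::IfCallerIsStreaming, EntrySM)
                    ? "smstop before call, smstart after"
                    : "no mode change");
}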
This patch therefore places an "AArch64ISD::CALLER_IS_STREAMING" node in
the entry block and copies its result to a register. That register is then
used whenever we need to emit a conditional streaming-mode change. The
"CALLER_IS_STREAMING" node expands to a call to "__arm_sme_state" only if
(after SelectionDAG) the function is determined to have streaming-mode
changes.
This has two main advantages:
1. It allows back-to-back conditional smstart/stop pairs to be folded
2. It has the correct behaviour for EH landing pads
- These are entered with pstate.sm = 0, and should switch mode based on
the entry pstate.sm
- Note: This is not fully implemented yet
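As a rough sketch of the expansion rule (the function and stub names here
are illustrative, not LLVM interfaces): the entry pseudo only pays for the
ABI call when its result was actually consumed by a conditional mode
change, and otherwise folds to a copy of XZR.

#include <cstdint>

// Stub standing in for the SME ABI routine; the real backend emits
// "bl __arm_sme_state", which returns the pstate flags in X0.
static uint64_t armSMEStateStub() { return 1; } // pretend the caller is streaming

// Mirrors the expansion in EmitCallerIsStreaming below: make the call only
// if some conditional streaming-mode change read the saved register (i.e.
// setPStateSMRegUsed(true) was reached); otherwise the pseudo becomes a
// copy of XZR and has no runtime cost.
static uint64_t materializeEntryPStateSM(bool PStateSMRegUsed) {
  return PStateSMRegUsed ? armSMEStateStub() : 0;
}

int main() { return materializeEntryPStateSM(false) == 0 ? 0 : 1; }

In the test diffs this shows up as the single "bl __arm_sme_state" moving
into the entry block, with later conditional branches testing a saved copy
("mov x19, x0") instead of re-querying the routine.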
---
.../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 4 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 91 ++++++++-----
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 +-
.../AArch64/AArch64MachineFunctionInfo.h | 6 +
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 9 ++
llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp | 12 +-
llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 28 ++--
...compatible-to-normal-fn-wihout-sme-attr.ll | 4 +-
.../AArch64/sme-callee-save-restore-pairs.ll | 4 +-
.../AArch64/sme-disable-gisel-fisel.ll | 2 +-
.../CodeGen/AArch64/sme-lazy-save-call.ll | 4 +-
.../test/CodeGen/AArch64/sme-peephole-opts.ll | 21 +--
...ing-body-streaming-compatible-interface.ll | 9 +-
.../sme-streaming-compatible-interface.ll | 27 ++--
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 11 +-
.../CodeGen/AArch64/spill-reload-remarks.ll | 3 +-
llvm/test/CodeGen/AArch64/stack-hazard.ll | 124 +++++++++---------
.../streaming-compatible-memory-ops.ll | 7 +-
.../CodeGen/AArch64/sve-stack-frame-layout.ll | 4 +-
19 files changed, 200 insertions(+), 176 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 8c8daef6dccd4..763b3868a99ca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -1178,7 +1178,9 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
if (F->getOpcode() == ISD::CopyFromReg) {
- UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+ Register Reg = cast<RegisterSDNode>(F->getOperand(1))->getReg();
+ if (Reg.isPhysical())
+ UsedRegs.push_back(Reg);
continue;
} else if (F->getOpcode() == ISD::CopyToReg) {
// Skip CopyToReg nodes that are internal to the glue chain.
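For context on the guard above, a small self-contained model (not LLVM's
actual Register class, whose encoding this only approximates): virtual
register ids carry a high-bit tag, and only physical registers belong in
UsedRegs, so glued CopyFromReg nodes of virtual registers, such as the
saved pstate.sm copy introduced by this patch, must now be skipped here.

#include <cassert>
#include <cstdint>
#include <vector>

struct Reg {
  uint32_t Id;
  // Approximates llvm::Register: virtual register ids have the top bit set,
  // so "physical" means a non-zero id without that tag.
  bool isPhysical() const { return Id != 0 && (Id & 0x80000000u) == 0; }
};

int main() {
  std::vector<Reg> UsedRegs;
  const Reg X0{32};                // some physical register id
  const Reg VReg{0x80000000u | 5}; // a virtual register, e.g. the pstate.sm copy
  for (Reg R : {X0, VReg})
    if (R.isPhysical()) // the guard added in the hunk above
      UsedRegs.push_back(R);
  assert(UsedRegs.size() == 1 && UsedRegs[0].Id == 32);
  return 0;
}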
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 43b9743639ab2..2fe3278bb1846 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3101,6 +3101,32 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitCallerIsStreaming(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ if (FuncInfo->IsPStateSMRegUsed()) {
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
+ .addExternalSymbol("__arm_sme_state")
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .addRegMask(TRI->getCallPreservedMask(
+ *MF, CallingConv::
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2));
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(AArch64::X0);
+ } else {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
+ MI.getOperand(0).getReg())
+ .addReg(AArch64::XZR);
+ }
+ BB->remove_instr(&MI);
+ return BB;
+}
+
// Helper function to find the instruction that defined a virtual register.
// If unable to find such instruction, returns nullptr.
static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI,
@@ -3216,6 +3242,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
return EmitAllocateSMESaveBuffer(MI, BB);
case AArch64::GetSMESaveSize:
return EmitGetSMESaveSize(MI, BB);
+ case AArch64::CallerIsStreaming:
+ return EmitCallerIsStreaming(MI, BB);
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
case TargetOpcode::STATEPOINT:
@@ -8132,19 +8160,26 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
}
assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
+ if (Attrs.hasStreamingCompatibleInterface()) {
+ SDValue CallerIsStreaming =
+ DAG.getNode(AArch64ISD::CALLER_IS_STREAMING, DL,
+ DAG.getVTList(MVT::i64, MVT::Other), {Chain});
+
+ // Copy the value to a virtual register, and save that in FuncInfo.
+ Register CallerIsStreamingReg =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Chain = DAG.getCopyToReg(CallerIsStreaming.getValue(1), DL,
+ CallerIsStreamingReg, CallerIsStreaming);
+ FuncInfo->setPStateSMReg(CallerIsStreamingReg);
+ }
+
// Insert the SMSTART if this is a locally streaming function and
// make sure it is Glued to the last CopyFromReg value.
if (IsLocallyStreaming) {
- SDValue PStateSM;
- if (Attrs.hasStreamingCompatibleInterface()) {
- PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
- Register Reg = MF.getRegInfo().createVirtualRegister(
- getRegClassFor(PStateSM.getValueType().getSimpleVT()));
- FuncInfo->setPStateSMReg(Reg);
- Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
+ if (Attrs.hasStreamingCompatibleInterface())
Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
- AArch64SME::IfCallerIsNonStreaming, PStateSM);
- } else
+ AArch64SME::IfCallerIsNonStreaming);
+ else
Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
AArch64SME::Always);
@@ -8834,8 +8869,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
bool Enable, SDValue Chain,
SDValue InGlue,
- unsigned Condition,
- SDValue PStateSM) const {
+ unsigned Condition) const {
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setHasStreamingModeChanges(true);
@@ -8847,9 +8881,16 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
SmallVector<SDValue> Ops = {Chain, MSROp};
unsigned Opcode;
if (Condition != AArch64SME::Always) {
+ FuncInfo->setPStateSMRegUsed(true);
+ Register PStateReg = FuncInfo->getPStateSMReg();
+ assert(PStateReg.isValid() && "PStateSM Register is invalid");
+ SDValue PStateSM =
+ DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
+ // Use chain and glue from the CopyFromReg.
+ Ops[0] = PStateSM.getValue(1);
+ InGlue = PStateSM.getValue(2);
SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
- assert(PStateSM && "PStateSM should be defined");
Ops.push_back(ConditionOp);
Ops.push_back(PStateSM);
} else {
@@ -9124,15 +9165,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
/*IsSave=*/true);
}
- SDValue PStateSM;
bool RequiresSMChange = CallAttrs.requiresSMChange();
if (RequiresSMChange) {
- if (CallAttrs.caller().hasStreamingInterfaceOrBody())
- PStateSM = DAG.getConstant(1, DL, MVT::i64);
- else if (CallAttrs.caller().hasNonStreamingInterface())
- PStateSM = DAG.getConstant(0, DL, MVT::i64);
- else
- PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
OptimizationRemarkEmitter ORE(&MF.getFunction());
ORE.emit([&]() {
auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
@@ -9447,9 +9481,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
InGlue = Chain.getValue(1);
}
- SDValue NewChain = changeStreamingMode(
- DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
- getSMToggleCondition(CallAttrs), PStateSM);
+ SDValue NewChain =
+ changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
+ Chain, InGlue, getSMToggleCondition(CallAttrs));
Chain = NewChain.getValue(0);
InGlue = NewChain.getValue(1);
}
@@ -9633,10 +9667,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
InGlue = Result.getValue(Result->getNumValues() - 1);
if (RequiresSMChange) {
- assert(PStateSM && "Expected a PStateSM to be set");
Result = changeStreamingMode(
DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
- getSMToggleCondition(CallAttrs), PStateSM);
+ getSMToggleCondition(CallAttrs));
if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
InGlue = Result.getValue(1);
@@ -9802,14 +9835,11 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Emit SMSTOP before returning from a locally streaming function
SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
- if (FuncAttrs.hasStreamingCompatibleInterface()) {
- Register Reg = FuncInfo->getPStateSMReg();
- assert(Reg.isValid() && "PStateSM Register is invalid");
- SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
+ if (FuncAttrs.hasStreamingCompatibleInterface())
Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
/*Glue*/ SDValue(),
- AArch64SME::IfCallerIsNonStreaming, PStateSM);
- } else
+ AArch64SME::IfCallerIsNonStreaming);
+ else
Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
/*Glue*/ SDValue(), AArch64SME::Always);
Glue = Chain.getValue(1);
@@ -28171,6 +28201,7 @@ void AArch64TargetLowering::ReplaceNodeResults(
case Intrinsic::aarch64_sme_in_streaming_mode: {
SDLoc DL(N);
SDValue Chain = DAG.getEntryNode();
+
SDValue RuntimePStateSM =
getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
Results.push_back(
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 88876570ac811..cc23f7e0bdfcd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -181,6 +181,8 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitCallerIsStreaming(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
/// Replace (0, vreg) discriminator components with the operands of blend
/// or with (immediate, NoRegister) when possible.
@@ -523,8 +525,8 @@ class AArch64TargetLowering : public TargetLowering {
/// node. \p Condition should be one of the enum values from
/// AArch64SME::ToggleCondition.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
- SDValue Chain, SDValue InGlue, unsigned Condition,
- SDValue PStateSM = SDValue()) const;
+ SDValue Chain, SDValue InGlue,
+ unsigned Condition) const;
bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 800787cc0b4f5..cb9fdb7606329 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -231,6 +231,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// on function entry to record the initial pstate of a function.
Register PStateSMReg = MCRegister::NoRegister;
+ // true if PStateSMReg is used.
+ bool PStateSMRegUsed = false;
+
// Holds a pointer to a buffer that is large enough to represent
// all SME ZA state and any additional state required by the
// __arm_sme_save/restore support routines.
@@ -274,6 +277,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
+ unsigned IsPStateSMRegUsed() const { return PStateSMRegUsed; };
+ void setPStateSMRegUsed(bool Used = true) { PStateSMRegUsed = Used; };
+
int64_t getVGIdx() const { return VGIdx; };
void setVGIdx(unsigned Idx) { VGIdx = Idx; };
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index db27ca978980f..7b5f45e96a942 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -39,6 +39,15 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64CallerIsStreaming
+ : SDNode<"AArch64ISD::CALLER_IS_STREAMING", SDTypeProfile<1, 0,
+ [SDTCisInt<0>]>, [SDNPHasChain, SDNPSideEffect]>;
+
+let usesCustomInserter = 1 in {
+ def CallerIsStreaming : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {}
+}
+def : Pat<(i64 (AArch64CallerIsStreaming)), (CallerIsStreaming)>;
+
def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
[SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
index bd28716118880..564af6708e1ed 100644
--- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp
@@ -80,16 +80,10 @@ static bool isMatchingStartStopPair(const MachineInstr *MI1,
if (MI1->getOperand(4).getRegMask() != MI2->getOperand(4).getRegMask())
return false;
- // This optimisation is unlikely to happen in practice for conditional
- // smstart/smstop pairs as the virtual registers for pstate.sm will always
- // be different.
- // TODO: For this optimisation to apply to conditional smstart/smstop,
- // this pass will need to do more work to remove redundant calls to
- // __arm_sme_state.
-
// Only consider conditional start/stop pairs which read the same register
- // holding the original value of pstate.sm, as some conditional start/stops
- // require the state on entry to the function.
+ // holding the original value of pstate.sm. This is somewhat over-conservative,
+ // as all conditional streaming-mode changes only look at the state on entry
+ // to the function.
if (MI1->getOperand(3).isReg() && MI2->getOperand(3).isReg()) {
Register Reg1 = MI1->getOperand(3).getReg();
Register Reg2 = MI2->getOperand(3).getReg();
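A compact model of the matching rule the updated comment describes
(simplified; the real isMatchingStartStopPair also compares the other MSR
operands and the register masks): a conditional pair is only treated as
redundant when both instructions test the same register, and since that
register is now always the single entry pstate.sm copy, back-to-back pairs,
such as the ones removed from sme-peephole-opts.ll below, can match.

// Simplified model; Condition holds an AArch64SME::ToggleCondition value.
struct CondSMToggle {
  bool Enable;        // smstart (true) or smstop (false)
  unsigned Condition; // e.g. IfCallerIsNonStreaming
  int PStateSMReg;    // register holding entry pstate.sm; -1 if unconditional
};

// A mode change immediately undone by its opposite, under the same condition
// and keyed off the same entry-pstate.sm register, is a no-op pair that the
// peephole can erase.
static bool isRedundantPair(const CondSMToggle &A, const CondSMToggle &B) {
  return A.Enable != B.Enable && A.Condition == B.Condition &&
         A.PStateSMReg != -1 && A.PStateSMReg == B.PStateSMReg;
}

int main() {
  CondSMToggle Stop{false, 1, 7}, Start{true, 1, 7};
  return isRedundantPair(Stop, Start) ? 0 : 1;
}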
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 1f68815411097..ba40ccd1c7406 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -150,42 +150,40 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: bl __arm_sme_state_size
; CHECK-NEXT: sub sp, sp, x0
-; CHECK-NEXT: mov x19, sp
-; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x20, sp
+; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: bl __arm_sme_save
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x20, x0, #0x1
-; CHECK-NEXT: tbz w20, #0, .LBB5_2
+; CHECK-NEXT: tbz w19, #0, .LBB5_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB5_2:
; CHECK-NEXT: mov x0, x8
; CHECK-NEXT: bl private_za_decl
-; CHECK-NEXT: mov x2, x0
-; CHECK-NEXT: tbz w20, #0, .LBB5_4
+; CHECK-NEXT: mov x1, x0
+; CHECK-NEXT: tbz w19, #0, .LBB5_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB5_4:
-; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: bl __arm_sme_restore
-; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: bl __arm_sme_save
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x20, x0, #0x1
-; CHECK-NEXT: tbz w20, #0, .LBB5_6
+; CHECK-NEXT: tbz w19, #0, .LBB5_6
; CHECK-NEXT: // %bb.5:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB5_6:
-; CHECK-NEXT: mov x0, x2
+; CHECK-NEXT: mov x0, x1
; CHECK-NEXT: bl private_za_decl
; CHECK-NEXT: mov x1, x0
-; CHECK-NEXT: tbz w20, #0, .LBB5_8
+; CHECK-NEXT: tbz w19, #0, .LBB5_8
; CHECK-NEXT: // %bb.7:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB5_8:
-; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: mov x0, x20
; CHECK-NEXT: bl __arm_sme_restore
; CHECK-NEXT: mov x0, x1
; CHECK-NEXT: sub sp, x29, #64
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index c4440e7bcc3ff..1567ca258cccb 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -18,7 +18,7 @@ define void @streaming_compatible() #0 {
; CHECK-NEXT: bl __arm_get_current_vg
; CHECK-NEXT: stp x0, x19, [sp, #72] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbz w19, #0, .LBB0_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -57,7 +57,7 @@ define void @streaming_compatible_arg(float %f) #0 {
; CHECK-NEXT: stp x0, x19, [sp, #88] // 16-byte Folded Spill
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbz w19, #0, .LBB1_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index 980144d6ca584..1933eb85b77f2 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -44,7 +44,7 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; NOPAIR-NEXT: addvl sp, sp, #-1
; NOPAIR-NEXT: str z0, [sp] // 16-byte Folded Spill
; NOPAIR-NEXT: bl __arm_sme_state
-; NOPAIR-NEXT: and x19, x0, #0x1
+; NOPAIR-NEXT: mov x19, x0
; NOPAIR-NEXT: tbz w19, #0, .LBB0_2
; NOPAIR-NEXT: // %bb.1:
; NOPAIR-NEXT: smstop sm
@@ -126,7 +126,7 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; PAIR-NEXT: addvl sp, sp, #-1
; PAIR-NEXT: str z0, [sp] // 16-byte Folded Spill
; PAIR-NEXT: bl __arm_sme_state
-; PAIR-NEXT: and x19, x0, #0x1
+; PAIR-NEXT: mov x19, x0
; PAIR-NEXT: tbz w19, #0, .LBB0_2
; PAIR-NEXT: // %bb.1:
; PAIR-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 4a52bf27a7591..759f3ee609e58 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -441,7 +441,7 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
; CHECK-COMMON-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl __arm_sme_state
-; CHECK-COMMON-NEXT: and x19, x0, #0x1
+; CHECK-COMMON-NEXT: mov x19, x0
; CHECK-COMMON-NEXT: tbz w19, #0, .LBB12_2
; CHECK-COMMON-NEXT: // %bb.1:
; CHECK-COMMON-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index e463e833bdbde..3f5e7e9f32a47 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -137,8 +137,10 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: msub x9, x8, x8, x9
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: stur x9, [x29, #-80]
@@ -147,8 +149,6 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: stur wzr, [x29, #-68]
; CHECK-NEXT: sturh w8, [x29, #-72]
; CHECK-NEXT: msr TPIDR2_EL0, x9
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x20, x0, #0x1
; CHECK-NEXT: tbz w20, #0, .LBB3_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 130a316bcc2ba..ff5b7c047eaf5 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -57,7 +57,6 @@ define void @test1() nounwind "aarch64_pstate_sm_enabled" {
}
; streaming-compatible caller -> normal callees
-; these conditional smstart/smstop are not yet optimized away.
define void @test2() nounwind "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test2:
; CHECK: // %bb.0:
@@ -69,27 +68,17 @@ define void @test2() nounwind "aarch64_pstate_sm_compatible" {
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbz w19, #0, .LBB2_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: bl callee
+; CHECK-NEXT: bl callee
; CHECK-NEXT: tbz w19, #0, .LBB2_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB2_4:
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
-; CHECK-NEXT: tbz w19, #0, .LBB2_6
-; CHECK-NEXT: // %bb.5:
-; CHECK-NEXT: smstop sm
-; CHECK-NEXT: .LBB2_6:
-; CHECK-NEXT: bl callee
-; CHECK-NEXT: tbz w19, #0, .LBB2_8
-; CHECK-NEXT: // %bb.7:
-; CHECK-NEXT: smstart sm
-; CHECK-NEXT: .LBB2_8:
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
@@ -114,7 +103,7 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbnz w19, #0, .LBB3_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstart sm
@@ -124,8 +113,6 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB3_4:
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB3_6
; CHECK-NEXT: // %bb.5:
; CHECK-NEXT: smstop sm
@@ -135,8 +122,6 @@ define void @test3() nounwind "aarch64_pstate_sm_compatible" {
; CHECK-NEXT: // %bb.7:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB3_8:
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB3_10
; CHECK-NEXT: // %bb.9:
; CHECK-NEXT: smstart sm
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index 1a49da84c00ce..52078941aa745 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -18,12 +18,11 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
; CHECK-NEXT: cntd x9
; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x8, x0, #0x1
-; CHECK-NEXT: tbnz w8, #0, .LBB0_2
+; CHECK-NEXT: tbnz w0, #0, .LBB0_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB0_2:
-; CHECK-NEXT: tbnz w8, #0, .LBB0_4
+; CHECK-NEXT: tbnz w0, #0, .LBB0_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB0_4:
@@ -50,7 +49,7 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbnz w19, #0, .LBB1_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstart sm
@@ -88,7 +87,7 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: smstart sm
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index e967f3b7be5e8..636c3ece9d411 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -44,7 +44,7 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbz w19, #0, .LBB1_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -83,7 +83,7 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstart sm
@@ -143,10 +143,10 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: add x8, sp, #16
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB4_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -218,7 +218,7 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: str z0, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbz w19, #0, .LBB5_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -311,7 +311,7 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str p0, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbz w19, #0, .LBB6_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -377,7 +377,7 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbnz w19, #0, .LBB7_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstart sm
@@ -394,33 +394,34 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: conditional_smstart_no_successor_block:
; CHECK: // %bb.0:
-; CHECK-NEXT: tbz w0, #0, .LBB8_6
-; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
-; CHECK-NEXT: tbnz w19, #0, .LBB8_3
+; CHECK-NEXT: tbz w8, #0, .LBB8_6
+; CHECK-NEXT: // %bb.1: // %if.then
+; CHECK-NEXT: tbnz w0, #0, .LBB8_3
; CHECK-NEXT: // %bb.2: // %if.then
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB8_3: // %if.then
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: tbnz w19, #0, .LBB8_5
; CHECK-NEXT: // %bb.4: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB8_5: // %if.then
+; CHECK-NEXT: .LBB8_6: // %exit
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
-; CHECK-NEXT: .LBB8_6: // %exit
; CHECK-NEXT: ret
br i1 %p, label %if.then, label %exit
@@ -443,7 +444,7 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: tbz w19, #0, .LBB9_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -492,7 +493,7 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: .cfi_offset vg, -24
; CHECK-NEXT: tbz w19, #0, .LBB10_2
; CHECK-NEXT: // %bb.1: // %entry
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 6fcfc5b242c11..60d8987334c89 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=NO-SVE-CHECK
@@ -780,7 +781,7 @@ define void @streaming_compatible_to_streaming() #4 {
; CHECK-NEXT: .cfi_offset b14, -88
; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: .cfi_offset vg, -24
; CHECK-NEXT: tbnz w19, #0, .LBB6_2
; CHECK-NEXT: // %bb.1:
@@ -835,7 +836,7 @@ define void @streaming_compatible_to_streaming() #4 {
; FP-CHECK-NEXT: .cfi_offset b14, -88
; FP-CHECK-NEXT: .cfi_offset b15, -96
; FP-CHECK-NEXT: bl __arm_sme_state
-; FP-CHECK-NEXT: and x19, x0, #0x1
+; FP-CHECK-NEXT: mov x19, x0
; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_2
; FP-CHECK-NEXT: // %bb.1:
@@ -897,7 +898,7 @@ define void @streaming_compatible_to_non_streaming() #4 {
; CHECK-NEXT: .cfi_offset b14, -88
; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: .cfi_offset vg, -24
; CHECK-NEXT: tbz w19, #0, .LBB7_2
; CHECK-NEXT: // %bb.1:
@@ -952,7 +953,7 @@ define void @streaming_compatible_to_non_streaming() #4 {
; FP-CHECK-NEXT: .cfi_offset b14, -88
; FP-CHECK-NEXT: .cfi_offset b15, -96
; FP-CHECK-NEXT: bl __arm_sme_state
-; FP-CHECK-NEXT: and x19, x0, #0x1
+; FP-CHECK-NEXT: mov x19, x0
; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: tbz w19, #0, .LBB7_2
; FP-CHECK-NEXT: // %bb.1:
@@ -1025,7 +1026,7 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96
; NO-SVE-CHECK-NEXT: mov w8, w0
; NO-SVE-CHECK-NEXT: bl __arm_sme_state
-; NO-SVE-CHECK-NEXT: and x19, x0, #0x1
+; NO-SVE-CHECK-NEXT: mov x19, x0
; NO-SVE-CHECK-NEXT: .cfi_offset vg, -16
; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_2
; NO-SVE-CHECK-NEXT: // %bb.1:
diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
index 47906252382f4..6c248048e682f 100644
--- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
@@ -2,8 +2,7 @@
; We should have both spill and reload for %arg.
-; CHECK: remark: <unknown>:0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost generated in function
-
+; CHECK: remark: <unknown>:0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost 1 virtual registers copies 1.000000e+00 total copies cost generated in function
define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind #0 {
%res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
%and = and <vscale x 2 x i1> %res, %arg
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 4615b1a6a9b2e..0a7d42d5d3911 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -1598,7 +1598,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK0-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
; CHECK0-NEXT: stp q2, q3, [sp, #32] // 32-byte Folded Spill
; CHECK0-NEXT: bl __arm_sme_state
-; CHECK0-NEXT: and x21, x0, #0x1
+; CHECK0-NEXT: mov x21, x0
; CHECK0-NEXT: .cfi_offset vg, -40
; CHECK0-NEXT: tbz w21, #0, .LBB27_2
; CHECK0-NEXT: // %bb.1:
@@ -1612,23 +1612,21 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK0-NEXT: .LBB27_4:
; CHECK0-NEXT: cmp w0, #0
; CHECK0-NEXT: .cfi_restore vg
-; CHECK0-NEXT: cset w21, lt
-; CHECK0-NEXT: bl __arm_sme_state
-; CHECK0-NEXT: and x22, x0, #0x1
+; CHECK0-NEXT: cset w22, lt
; CHECK0-NEXT: .cfi_offset vg, -40
-; CHECK0-NEXT: tbz w22, #0, .LBB27_6
+; CHECK0-NEXT: tbz w21, #0, .LBB27_6
; CHECK0-NEXT: // %bb.5:
; CHECK0-NEXT: smstop sm
; CHECK0-NEXT: .LBB27_6:
; CHECK0-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload
; CHECK0-NEXT: bl __getf2
-; CHECK0-NEXT: tbz w22, #0, .LBB27_8
+; CHECK0-NEXT: tbz w21, #0, .LBB27_8
; CHECK0-NEXT: // %bb.7:
; CHECK0-NEXT: smstart sm
; CHECK0-NEXT: .LBB27_8:
; CHECK0-NEXT: cmp w0, #0
; CHECK0-NEXT: cset w8, ge
-; CHECK0-NEXT: tst w8, w21
+; CHECK0-NEXT: tst w8, w22
; CHECK0-NEXT: csel w0, w20, w19, ne
; CHECK0-NEXT: .cfi_restore vg
; CHECK0-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
@@ -1687,7 +1685,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK64-NEXT: stp q0, q1, [sp, #64] // 32-byte Folded Spill
; CHECK64-NEXT: stp q2, q3, [sp, #96] // 32-byte Folded Spill
; CHECK64-NEXT: bl __arm_sme_state
-; CHECK64-NEXT: and x21, x0, #0x1
+; CHECK64-NEXT: mov x21, x0
; CHECK64-NEXT: .cfi_offset vg, -48
; CHECK64-NEXT: tbz w21, #0, .LBB27_2
; CHECK64-NEXT: // %bb.1:
@@ -1701,23 +1699,21 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK64-NEXT: .LBB27_4:
; CHECK64-NEXT: cmp w0, #0
; CHECK64-NEXT: .cfi_restore vg
-; CHECK64-NEXT: cset w21, lt
-; CHECK64-NEXT: bl __arm_sme_state
-; CHECK64-NEXT: and x22, x0, #0x1
+; CHECK64-NEXT: cset w22, lt
; CHECK64-NEXT: .cfi_offset vg, -48
-; CHECK64-NEXT: tbz w22, #0, .LBB27_6
+; CHECK64-NEXT: tbz w21, #0, .LBB27_6
; CHECK64-NEXT: // %bb.5:
; CHECK64-NEXT: smstop sm
; CHECK64-NEXT: .LBB27_6:
; CHECK64-NEXT: ldp q0, q1, [sp, #96] // 32-byte Folded Reload
; CHECK64-NEXT: bl __getf2
-; CHECK64-NEXT: tbz w22, #0, .LBB27_8
+; CHECK64-NEXT: tbz w21, #0, .LBB27_8
; CHECK64-NEXT: // %bb.7:
; CHECK64-NEXT: smstart sm
; CHECK64-NEXT: .LBB27_8:
; CHECK64-NEXT: cmp w0, #0
; CHECK64-NEXT: cset w8, ge
-; CHECK64-NEXT: tst w8, w21
+; CHECK64-NEXT: tst w8, w22
; CHECK64-NEXT: csel w0, w20, w19, ne
; CHECK64-NEXT: .cfi_restore vg
; CHECK64-NEXT: ldp x20, x19, [sp, #296] // 16-byte Folded Reload
@@ -1784,7 +1780,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK1024-NEXT: str q1, [sp, #1040] // 16-byte Folded Spill
; CHECK1024-NEXT: str q0, [sp, #1024] // 16-byte Folded Spill
; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: and x21, x0, #0x1
+; CHECK1024-NEXT: mov x21, x0
; CHECK1024-NEXT: .cfi_offset vg, -48
; CHECK1024-NEXT: tbz w21, #0, .LBB27_2
; CHECK1024-NEXT: // %bb.1:
@@ -1799,24 +1795,22 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK1024-NEXT: .LBB27_4:
; CHECK1024-NEXT: cmp w0, #0
; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: cset w21, lt
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: and x22, x0, #0x1
+; CHECK1024-NEXT: cset w22, lt
; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: tbz w22, #0, .LBB27_6
+; CHECK1024-NEXT: tbz w21, #0, .LBB27_6
; CHECK1024-NEXT: // %bb.5:
; CHECK1024-NEXT: smstop sm
; CHECK1024-NEXT: .LBB27_6:
; CHECK1024-NEXT: ldr q0, [sp, #1056] // 16-byte Folded Reload
; CHECK1024-NEXT: ldr q1, [sp, #1072] // 16-byte Folded Reload
; CHECK1024-NEXT: bl __getf2
-; CHECK1024-NEXT: tbz w22, #0, .LBB27_8
+; CHECK1024-NEXT: tbz w21, #0, .LBB27_8
; CHECK1024-NEXT: // %bb.7:
; CHECK1024-NEXT: smstart sm
; CHECK1024-NEXT: .LBB27_8:
; CHECK1024-NEXT: cmp w0, #0
; CHECK1024-NEXT: cset w8, ge
-; CHECK1024-NEXT: tst w8, w21
+; CHECK1024-NEXT: tst w8, w22
; CHECK1024-NEXT: csel w0, w20, w19, ne
; CHECK1024-NEXT: .cfi_restore vg
; CHECK1024-NEXT: add sp, sp, #1088
@@ -1907,10 +1901,10 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48
; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
; CHECK0-NEXT: mov x8, x0
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: mov x19, x0
; CHECK0-NEXT: //APP
; CHECK0-NEXT: //NO_APP
-; CHECK0-NEXT: bl __arm_sme_state
-; CHECK0-NEXT: and x19, x0, #0x1
; CHECK0-NEXT: .cfi_offset vg, -32
; CHECK0-NEXT: tbz w19, #0, .LBB28_2
; CHECK0-NEXT: // %bb.1: // %entry
@@ -2030,10 +2024,10 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK64-NEXT: sub sp, sp, #64
; CHECK64-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x01, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 176 + 144 * VG
; CHECK64-NEXT: mov x8, x0
+; CHECK64-NEXT: bl __arm_sme_state
+; CHECK64-NEXT: mov x19, x0
; CHECK64-NEXT: //APP
; CHECK64-NEXT: //NO_APP
-; CHECK64-NEXT: bl __arm_sme_state
-; CHECK64-NEXT: and x19, x0, #0x1
; CHECK64-NEXT: .cfi_offset vg, -32
; CHECK64-NEXT: tbz w19, #0, .LBB28_2
; CHECK64-NEXT: // %bb.1: // %entry
@@ -2159,10 +2153,10 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK1024-NEXT: sub sp, sp, #1024
; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 2096 + 144 * VG
; CHECK1024-NEXT: mov x8, x0
+; CHECK1024-NEXT: bl __arm_sme_state
+; CHECK1024-NEXT: mov x19, x0
; CHECK1024-NEXT: //APP
; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: and x19, x0, #0x1
; CHECK1024-NEXT: .cfi_offset vg, -32
; CHECK1024-NEXT: tbz w19, #0, .LBB28_2
; CHECK1024-NEXT: // %bb.1: // %entry
@@ -2291,10 +2285,10 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
; CHECK0-NEXT: sub sp, sp, #48
; CHECK0-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x00, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 96 + 144 * VG
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: mov x19, x0
; CHECK0-NEXT: //APP
; CHECK0-NEXT: //NO_APP
-; CHECK0-NEXT: bl __arm_sme_state
-; CHECK0-NEXT: and x19, x0, #0x1
; CHECK0-NEXT: .cfi_offset vg, -32
; CHECK0-NEXT: tbz w19, #0, .LBB29_2
; CHECK0-NEXT: // %bb.1: // %entry
@@ -2415,10 +2409,10 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x90, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 112
; CHECK64-NEXT: sub sp, sp, #112
; CHECK64-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x01, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 224 + 144 * VG
+; CHECK64-NEXT: bl __arm_sme_state
+; CHECK64-NEXT: mov x19, x0
; CHECK64-NEXT: //APP
; CHECK64-NEXT: //NO_APP
-; CHECK64-NEXT: bl __arm_sme_state
-; CHECK64-NEXT: and x19, x0, #0x1
; CHECK64-NEXT: .cfi_offset vg, -32
; CHECK64-NEXT: tbz w19, #0, .LBB29_2
; CHECK64-NEXT: // %bb.1: // %entry
@@ -2543,10 +2537,10 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xd0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1072
; CHECK1024-NEXT: sub sp, sp, #1072
; CHECK1024-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xe0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x90, 0x01, 0x1e, 0x22 // sp + 2144 + 144 * VG
+; CHECK1024-NEXT: bl __arm_sme_state
+; CHECK1024-NEXT: mov x19, x0
; CHECK1024-NEXT: //APP
; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: and x19, x0, #0x1
; CHECK1024-NEXT: .cfi_offset vg, -32
; CHECK1024-NEXT: tbz w19, #0, .LBB29_2
; CHECK1024-NEXT: // %bb.1: // %entry
@@ -2616,6 +2610,7 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK1024-NEXT: .cfi_restore w30
; CHECK1024-NEXT: .cfi_restore w29
; CHECK1024-NEXT: ret
+
entry:
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
%0 = alloca [37 x i8], align 16
@@ -3200,18 +3195,19 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * VG - 64
; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * VG - 64
; CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * VG - 64
-; CHECK0-NEXT: mov w9, w0
-; CHECK0-NEXT: mov x8, sp
-; CHECK0-NEXT: mov w2, w1
-; CHECK0-NEXT: add x9, x9, #15
; CHECK0-NEXT: mov x19, sp
-; CHECK0-NEXT: and x9, x9, #0x1fffffff0
-; CHECK0-NEXT: sub x8, x8, x9
+; CHECK0-NEXT: mov w2, w1
+; CHECK0-NEXT: mov w8, w0
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: mov w8, w8
+; CHECK0-NEXT: mov x9, sp
+; CHECK0-NEXT: mov x20, x0
+; CHECK0-NEXT: add x8, x8, #15
+; CHECK0-NEXT: and x8, x8, #0x1fffffff0
+; CHECK0-NEXT: sub x8, x9, x8
; CHECK0-NEXT: mov sp, x8
; CHECK0-NEXT: //APP
; CHECK0-NEXT: //NO_APP
-; CHECK0-NEXT: bl __arm_sme_state
-; CHECK0-NEXT: and x20, x0, #0x1
; CHECK0-NEXT: .cfi_offset vg, -48
; CHECK0-NEXT: tbz w20, #0, .LBB35_2
; CHECK0-NEXT: // %bb.1: // %entry
@@ -3336,18 +3332,19 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128
; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128
; CHECK64-NEXT: sub sp, sp, #64
-; CHECK64-NEXT: mov w9, w0
-; CHECK64-NEXT: mov x8, sp
-; CHECK64-NEXT: mov w2, w1
-; CHECK64-NEXT: add x9, x9, #15
; CHECK64-NEXT: mov x19, sp
-; CHECK64-NEXT: and x9, x9, #0x1fffffff0
-; CHECK64-NEXT: sub x8, x8, x9
+; CHECK64-NEXT: mov w2, w1
+; CHECK64-NEXT: mov w8, w0
+; CHECK64-NEXT: bl __arm_sme_state
+; CHECK64-NEXT: mov w8, w8
+; CHECK64-NEXT: mov x9, sp
+; CHECK64-NEXT: mov x20, x0
+; CHECK64-NEXT: add x8, x8, #15
+; CHECK64-NEXT: and x8, x8, #0x1fffffff0
+; CHECK64-NEXT: sub x8, x9, x8
; CHECK64-NEXT: mov sp, x8
; CHECK64-NEXT: //APP
; CHECK64-NEXT: //NO_APP
-; CHECK64-NEXT: bl __arm_sme_state
-; CHECK64-NEXT: and x20, x0, #0x1
; CHECK64-NEXT: .cfi_offset vg, -48
; CHECK64-NEXT: tbz w20, #0, .LBB35_2
; CHECK64-NEXT: // %bb.1: // %entry
@@ -3478,18 +3475,19 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x
; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088
; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088
; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov w9, w0
-; CHECK1024-NEXT: mov x8, sp
-; CHECK1024-NEXT: mov w2, w1
-; CHECK1024-NEXT: add x9, x9, #15
; CHECK1024-NEXT: mov x19, sp
-; CHECK1024-NEXT: and x9, x9, #0x1fffffff0
-; CHECK1024-NEXT: sub x8, x8, x9
+; CHECK1024-NEXT: mov w2, w1
+; CHECK1024-NEXT: mov w8, w0
+; CHECK1024-NEXT: bl __arm_sme_state
+; CHECK1024-NEXT: mov w8, w8
+; CHECK1024-NEXT: mov x9, sp
+; CHECK1024-NEXT: mov x20, x0
+; CHECK1024-NEXT: add x8, x8, #15
+; CHECK1024-NEXT: and x8, x8, #0x1fffffff0
+; CHECK1024-NEXT: sub x8, x9, x8
; CHECK1024-NEXT: mov sp, x8
; CHECK1024-NEXT: //APP
; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: and x20, x0, #0x1
; CHECK1024-NEXT: .cfi_offset vg, -48
; CHECK1024-NEXT: tbz w20, #0, .LBB35_2
; CHECK1024-NEXT: // %bb.1: // %entry
@@ -3627,10 +3625,10 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
; CHECK0-NEXT: sub x9, sp, #1024
; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK0-NEXT: mov w2, w1
+; CHECK0-NEXT: bl __arm_sme_state
+; CHECK0-NEXT: mov x19, x0
; CHECK0-NEXT: //APP
; CHECK0-NEXT: //NO_APP
-; CHECK0-NEXT: bl __arm_sme_state
-; CHECK0-NEXT: and x19, x0, #0x1
; CHECK0-NEXT: .cfi_offset vg, -48
; CHECK0-NEXT: tbz w19, #0, .LBB36_2
; CHECK0-NEXT: // %bb.1: // %entry
@@ -3754,10 +3752,10 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
; CHECK64-NEXT: sub x9, sp, #1088
; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK64-NEXT: mov w2, w1
+; CHECK64-NEXT: bl __arm_sme_state
+; CHECK64-NEXT: mov x19, x0
; CHECK64-NEXT: //APP
; CHECK64-NEXT: //NO_APP
-; CHECK64-NEXT: bl __arm_sme_state
-; CHECK64-NEXT: and x19, x0, #0x1
; CHECK64-NEXT: .cfi_offset vg, -48
; CHECK64-NEXT: tbz w19, #0, .LBB36_2
; CHECK64-NEXT: // %bb.1: // %entry
@@ -3886,10 +3884,10 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i
; CHECK1024-NEXT: sub x9, sp, #2048
; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0
; CHECK1024-NEXT: mov w2, w1
+; CHECK1024-NEXT: bl __arm_sme_state
+; CHECK1024-NEXT: mov x19, x0
; CHECK1024-NEXT: //APP
; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: and x19, x0, #0x1
; CHECK1024-NEXT: .cfi_offset vg, -48
; CHECK1024-NEXT: tbz w19, #0, .LBB36_2
; CHECK1024-NEXT: // %bb.1: // %entry
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index f1e684c86e896..0650b8271d374 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -180,16 +180,15 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst
-; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1
+; CHECK-NO-SME-ROUTINES-NEXT: mov x19, x0
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
-; CHECK-NO-SME-ROUTINES-NEXT: ldr x8, [x8, :got_lo12:dst]
+; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2
; CHECK-NO-SME-ROUTINES-NEXT: // %bb.1: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_2: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: mov x0, x8
; CHECK-NO-SME-ROUTINES-NEXT: bl memcpy
; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_4
; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index e0da9b57c6556..d440535f022c4 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -421,10 +421,10 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d14 @ cfa - 56 * VG - 48
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x50, 0x22 // $d15 @ cfa - 64 * VG - 48
; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: .cfi_offset vg, -32
; CHECK-NEXT: tbz w19, #0, .LBB7_2
; CHECK-NEXT: // %bb.1: // %entry
From 86bc038f37d7d3b16ef61e33619e87f373774505 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 6 Aug 2025 14:30:02 +0000
Subject: [PATCH 2/2] Rename CALLER_IS_STREAMING -> ENTRY_PSTATE_SM
---
.../Target/AArch64/AArch64ISelLowering.cpp | 20 +++++++++----------
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 ++--
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 8 ++++----
3 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2fe3278bb1846..3b19956f7650e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3102,8 +3102,8 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI,
}
MachineBasicBlock *
-AArch64TargetLowering::EmitCallerIsStreaming(MachineInstr &MI,
- MachineBasicBlock *BB) const {
+AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
@@ -3242,8 +3242,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
return EmitAllocateSMESaveBuffer(MI, BB);
case AArch64::GetSMESaveSize:
return EmitGetSMESaveSize(MI, BB);
- case AArch64::CallerIsStreaming:
- return EmitCallerIsStreaming(MI, BB);
+ case AArch64::EntryPStateSM:
+ return EmitEntryPStateSM(MI, BB);
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
case TargetOpcode::STATEPOINT:
@@ -8161,16 +8161,16 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
if (Attrs.hasStreamingCompatibleInterface()) {
- SDValue CallerIsStreaming =
- DAG.getNode(AArch64ISD::CALLER_IS_STREAMING, DL,
+ SDValue EntryPStateSM =
+ DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
DAG.getVTList(MVT::i64, MVT::Other), {Chain});
// Copy the value to a virtual register, and save that in FuncInfo.
- Register CallerIsStreamingReg =
+ Register EntryPStateSMReg =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
- Chain = DAG.getCopyToReg(CallerIsStreaming.getValue(1), DL,
- CallerIsStreamingReg, CallerIsStreaming);
- FuncInfo->setPStateSMReg(CallerIsStreamingReg);
+ Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
+ EntryPStateSM);
+ FuncInfo->setPStateSMReg(EntryPStateSMReg);
}
// Insert the SMSTART if this is a locally streaming function and
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index cc23f7e0bdfcd..02dfd9aa7b067 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -181,8 +181,8 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitGetSMESaveSize(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitCallerIsStreaming(MachineInstr &MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitEntryPStateSM(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
/// Replace (0, vreg) discriminator components with the operands of blend
/// or with (immediate, NoRegister) when possible.
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 7b5f45e96a942..9c20087159d17 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -39,14 +39,14 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
-def AArch64CallerIsStreaming
- : SDNode<"AArch64ISD::CALLER_IS_STREAMING", SDTypeProfile<1, 0,
+def AArch64EntryPStateSM
+ : SDNode<"AArch64ISD::ENTRY_PSTATE_SM", SDTypeProfile<1, 0,
[SDTCisInt<0>]>, [SDNPHasChain, SDNPSideEffect]>;
let usesCustomInserter = 1 in {
- def CallerIsStreaming : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {}
+ def EntryPStateSM : Pseudo<(outs GPR64:$is_streaming), (ins), []>, Sched<[]> {}
}
-def : Pat<(i64 (AArch64CallerIsStreaming)), (CallerIsStreaming)>;
+def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>;
def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
[SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;