[llvm] [AArch64][SME] Introduce CHECK_MATCHING_VL pseudo for streaming transitions (PR #157510)
Mary Kassayova via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 08:29:12 PDT 2025
https://github.com/marykass-arm updated https://github.com/llvm/llvm-project/pull/157510
>From aae8b5beb4b7b6510d728b4c4f42b9016e08127d Mon Sep 17 00:00:00 2001
From: Mary Kassayova <mary.kassayova at arm.com>
Date: Mon, 8 Sep 2025 16:27:06 +0000
Subject: [PATCH 1/4] [AArch64][SME] Introduce CHECK_MATCHING_VL pseudo for
safe streaming mode transitions
---
.../Target/AArch64/AArch64ISelLowering.cpp | 90 ++++-
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 7 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 +
.../AArch64/sme-callee-save-restore-pairs.ll | 26 +-
.../test/CodeGen/AArch64/sme-peephole-opts.ll | 14 +-
.../CodeGen/AArch64/sme-streaming-checkvl.ll | 335 ++++++++++++++++++
.../sme-streaming-compatible-interface.ll | 26 +-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 14 +
.../CodeGen/AArch64/spill-reload-remarks.ll | 2 +-
9 files changed, 501 insertions(+), 19 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 351235dd5bcdd..9380522d65aae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2940,6 +2940,52 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
return NextInst->getParent();
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitCheckVL(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction::iterator It = ++MBB->getIterator();
+
+ const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ Register RegVL = MRI.createVirtualRegister(RC);
+ Register RegSVL = MRI.createVirtualRegister(RC);
+ Register RegCheck = MRI.createVirtualRegister(RC);
+
+ BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL).addImm(1);
+ BuildMI(*MBB, MI, DL, TII->get(AArch64::RDSVLI_XI), RegSVL).addImm(1);
+
+ BuildMI(*MBB, MI, DL, TII->get(AArch64::SUBXrr), RegCheck)
+ .addReg(RegVL)
+ .addReg(RegSVL);
+
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, TrapBB);
+ MF->insert(It, PassBB);
+
+ BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
+ .addReg(RegCheck)
+ .addMBB(PassBB);
+
+ // Transfer rest of current BB to PassBB
+ PassBB->splice(PassBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ PassBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
+
+ MBB->addSuccessor(TrapBB);
+ MBB->addSuccessor(PassBB);
+
+ MI.eraseFromParent();
+ return PassBB;
+}
+
MachineBasicBlock *
AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
@@ -3343,6 +3389,9 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::PROBED_STACKALLOC_DYN:
return EmitDynamicProbedAlloc(MI, BB);
+ case AArch64::CHECK_MATCHING_VL:
+ return EmitCheckVL(MI, BB);
+
case AArch64::LD1_MXIPXX_H_PSEUDO_B:
return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -9116,7 +9165,8 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
bool Enable, SDValue Chain,
SDValue InGlue,
- unsigned Condition) const {
+ unsigned Condition,
+ bool HasSVECC) const {
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setHasStreamingModeChanges(true);
@@ -9147,7 +9197,40 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
if (InGlue)
Ops.push_back(InGlue);
- return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+ if (!HasSVECC)
+ return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+
+ auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
+ SmallVector<SDValue, 2> Ops = {Chain};
+ if (InGlue)
+ Ops.push_back(InGlue);
+ return SDValue(DAG.getMachineNode(AArch64::CHECK_MATCHING_VL, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue),
+ Ops),
+ 0);
+ };
+
+ // NS -> S
+ if (Enable) {
+ SDValue CheckVL = GetCheckVL(Chain, InGlue);
+
+ // Replace chain
+ Ops[0] = CheckVL.getValue(0);
+
+ // Replace/append glue
+ if (InGlue)
+ Ops.back() = CheckVL.getValue(1);
+ else
+ Ops.push_back(CheckVL.getValue(1));
+
+ return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+ }
+
+ // S -> NS
+ SDValue StreamingModeInstr =
+ DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+ return GetCheckVL(StreamingModeInstr.getValue(0),
+ StreamingModeInstr.getValue(1));
}
// Emit a call to __arm_sme_save or __arm_sme_restore.
@@ -9732,7 +9815,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresSMChange) {
Chain =
changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
- Chain, InGlue, getSMToggleCondition(CallAttrs));
+ Chain, InGlue, getSMToggleCondition(CallAttrs),
+ CallConv == CallingConv::AArch64_SVE_VectorCall);
InGlue = Chain.getValue(1);
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f5d14905cac66..e6385a059b875 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -168,6 +168,9 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ MachineBasicBlock *EmitCheckVL(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
MachineBasicBlock *BB) const;
@@ -532,8 +535,8 @@ class AArch64TargetLowering : public TargetLowering {
/// node. \p Condition should be one of the enum values from
/// AArch64SME::ToggleCondition.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
- SDValue Chain, SDValue InGlue,
- unsigned Condition) const;
+ SDValue Chain, SDValue InGlue, unsigned Condition,
+ bool HasSVECC = false) const;
bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f0020a9a3c91d..7cc3dc9a2171c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1367,6 +1367,12 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs),
} // Defs = [SP, NZCV], Uses = [SP] in
} // hasSideEffects = 1, isCodeGenOnly = 1
+// Pseudo-instruction that compares the current SVE vector length (VL) with the
+// streaming vector length (SVL). If the two lengths do not match, the check
+// lowers to a `brk`, causing a trap.
+let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in
+def CHECK_MATCHING_VL : Pseudo<(outs), (ins), []>, Sched<[]>;
+
let isReMaterializable = 1, isCodeGenOnly = 1 in {
// FIXME: The following pseudo instructions are only needed because remat
// cannot handle multiple instructions. When that changes, they can be
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index cf42db7aa65bd..af6ebb3846738 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -47,12 +47,19 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; NOPAIR-NEXT: // %bb.1:
; NOPAIR-NEXT: smstop sm
; NOPAIR-NEXT: .LBB0_2:
+; NOPAIR-NEXT: rdvl x8, #1
+; NOPAIR-NEXT: rdsvl x9, #1
+; NOPAIR-NEXT: cmp x8, x9
+; NOPAIR-NEXT: b.eq .LBB0_4
+; NOPAIR-NEXT: // %bb.3:
+; NOPAIR-NEXT: brk #0x1
+; NOPAIR-NEXT: .LBB0_4:
; NOPAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; NOPAIR-NEXT: bl my_func2
-; NOPAIR-NEXT: tbz w19, #0, .LBB0_4
-; NOPAIR-NEXT: // %bb.3:
+; NOPAIR-NEXT: tbz w19, #0, .LBB0_6
+; NOPAIR-NEXT: // %bb.5:
; NOPAIR-NEXT: smstart sm
-; NOPAIR-NEXT: .LBB0_4:
+; NOPAIR-NEXT: .LBB0_6:
; NOPAIR-NEXT: addvl sp, sp, #1
; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -127,12 +134,19 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; PAIR-NEXT: // %bb.1:
; PAIR-NEXT: smstop sm
; PAIR-NEXT: .LBB0_2:
+; PAIR-NEXT: rdvl x8, #1
+; PAIR-NEXT: rdsvl x9, #1
+; PAIR-NEXT: cmp x8, x9
+; PAIR-NEXT: b.eq .LBB0_4
+; PAIR-NEXT: // %bb.3:
+; PAIR-NEXT: brk #0x1
+; PAIR-NEXT: .LBB0_4:
; PAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; PAIR-NEXT: bl my_func2
-; PAIR-NEXT: tbz w19, #0, .LBB0_4
-; PAIR-NEXT: // %bb.3:
+; PAIR-NEXT: tbz w19, #0, .LBB0_6
+; PAIR-NEXT: // %bb.5:
; PAIR-NEXT: smstart sm
-; PAIR-NEXT: .LBB0_4:
+; PAIR-NEXT: .LBB0_6:
; PAIR-NEXT: addvl sp, sp, #1
; PAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; PAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 80827c2547780..9ff414d401426 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -527,14 +527,26 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.ne .LBB14_2
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: bl callee_farg_fret
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB14_3
+; CHECK-NEXT: .LBB14_2:
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB14_3:
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl callee_farg_fret
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
new file mode 100644
index 0000000000000..631d120f3c872
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
@@ -0,0 +1,335 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @bar_enabled(<vscale x 4 x i32>) #0
+
+; Non-streaming -> calls streaming callee
+define void @foo_non_streaming_pass_arg(ptr %arg) {
+; CHECK-LABEL: foo_non_streaming_pass_arg:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w28, -8
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB0_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB0_2: // %entry
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: sub x8, x29, #64
+; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: bl bar_enabled
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 96
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+entry:
+ %v = load <vscale x 4 x i32>, ptr %arg, align 16
+ tail call void @bar_enabled(<vscale x 4 x i32> %v) #0
+ ret void
+}
+
+; Streaming-compatible -> calls streaming callee
+define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 {
+; CHECK-LABEL: foo_streaming_compatible_pass_arg:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #1136
+; CHECK-NEXT: .cfi_def_cfa_offset 1136
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1088
+; CHECK-NEXT: .cfi_def_cfa w29, 48
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w28, -24
+; CHECK-NEXT: .cfi_offset vg, -32
+; CHECK-NEXT: .cfi_offset w30, -40
+; CHECK-NEXT: .cfi_offset w29, -48
+; CHECK-NEXT: .cfi_offset b8, -1080
+; CHECK-NEXT: .cfi_offset b9, -1088
+; CHECK-NEXT: .cfi_offset b10, -1096
+; CHECK-NEXT: .cfi_offset b11, -1104
+; CHECK-NEXT: .cfi_offset b12, -1112
+; CHECK-NEXT: .cfi_offset b13, -1120
+; CHECK-NEXT: .cfi_offset b14, -1128
+; CHECK-NEXT: .cfi_offset b15, -1136
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: rdvl x9, #1
+; CHECK-NEXT: rdsvl x10, #1
+; CHECK-NEXT: cmp x9, x10
+; CHECK-NEXT: b.eq .LBB1_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB1_2: // %entry
+; CHECK-NEXT: ldr z0, [x8]
+; CHECK-NEXT: sub x8, x29, #1088
+; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: tbnz w0, #0, .LBB1_4
+; CHECK-NEXT: // %bb.3: // %entry
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB1_4: // %entry
+; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: bl bar_enabled
+; CHECK-NEXT: tbnz w19, #0, .LBB1_6
+; CHECK-NEXT: // %bb.5: // %entry
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB1_6: // %entry
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa wsp, 1136
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1136
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+entry:
+ %v = load <vscale x 4 x i32>, ptr %arg, align 16
+ tail call void @bar_enabled(<vscale x 4 x i32> %v) #0
+ ret void
+}
+
+declare <vscale x 4 x i32> @bar_retv_enabled() #0
+
+; Non-streaming -> returns SVE value from streaming callee
+define void @foo_non_streaming_retval(ptr %ptr) {
+; CHECK-LABEL: foo_non_streaming_retval:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 112
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x28, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: .cfi_def_cfa w29, 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w28, -16
+; CHECK-NEXT: .cfi_offset vg, -32
+; CHECK-NEXT: .cfi_offset w30, -40
+; CHECK-NEXT: .cfi_offset w29, -48
+; CHECK-NEXT: .cfi_offset b8, -56
+; CHECK-NEXT: .cfi_offset b9, -64
+; CHECK-NEXT: .cfi_offset b10, -72
+; CHECK-NEXT: .cfi_offset b11, -80
+; CHECK-NEXT: .cfi_offset b12, -88
+; CHECK-NEXT: .cfi_offset b13, -96
+; CHECK-NEXT: .cfi_offset b14, -104
+; CHECK-NEXT: .cfi_offset b15, -112
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB2_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB2_2: // %entry
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: bl bar_retv_enabled
+; CHECK-NEXT: sub x8, x29, #64
+; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: str z0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 112
+; CHECK-NEXT: ldp x28, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+entry:
+ %v = tail call <vscale x 4 x i32> @bar_retv_enabled() #0
+ store <vscale x 4 x i32> %v, ptr %ptr, align 16
+ ret void
+}
+
+; Streaming-compatible -> returns SVE value from streaming callee
+define void @foo_streaming_compatible_retval(ptr %ptr) #1 {
+; CHECK-LABEL: foo_streaming_compatible_retval:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #1136
+; CHECK-NEXT: .cfi_def_cfa_offset 1136
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
+; CHECK-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1088
+; CHECK-NEXT: .cfi_def_cfa w29, 48
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset w28, -24
+; CHECK-NEXT: .cfi_offset vg, -32
+; CHECK-NEXT: .cfi_offset w30, -40
+; CHECK-NEXT: .cfi_offset w29, -48
+; CHECK-NEXT: .cfi_offset b8, -1080
+; CHECK-NEXT: .cfi_offset b9, -1088
+; CHECK-NEXT: .cfi_offset b10, -1096
+; CHECK-NEXT: .cfi_offset b11, -1104
+; CHECK-NEXT: .cfi_offset b12, -1112
+; CHECK-NEXT: .cfi_offset b13, -1120
+; CHECK-NEXT: .cfi_offset b14, -1128
+; CHECK-NEXT: .cfi_offset b15, -1136
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB3_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB3_2: // %entry
+; CHECK-NEXT: tbnz w0, #0, .LBB3_4
+; CHECK-NEXT: // %bb.3: // %entry
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB3_4: // %entry
+; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: bl bar_retv_enabled
+; CHECK-NEXT: sub x8, x29, #1088
+; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: tbnz w20, #0, .LBB3_6
+; CHECK-NEXT: // %bb.5: // %entry
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB3_6: // %entry
+; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: str z0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa wsp, 1136
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x20, [sp, #1120] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1136
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w20
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+entry:
+ %v = tail call <vscale x 4 x i32> @bar_retv_enabled() #0
+ store <vscale x 4 x i32> %v, ptr %ptr, align 16
+ ret void
+}
+
+attributes #0 = { "aarch64_pstate_sm_enabled" }
+attributes #1 = { "aarch64_pstate_sm_compatible" }
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 9088986ee9b72..95fb68945de44 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -209,13 +209,20 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB5_2:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB5_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB5_4:
; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: bl normal_callee_scalable_vec_arg
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: tbz w19, #0, .LBB5_4
-; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: tbz w19, #0, .LBB5_6
+; CHECK-NEXT: // %bb.5:
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: .LBB5_4:
+; CHECK-NEXT: .LBB5_6:
; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fadd z0.d, z1.d, z0.d
@@ -300,13 +307,20 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB6_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB6_4:
; CHECK-NEXT: ldr p0, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: bl normal_callee_predicate_vec_arg
; CHECK-NEXT: str p0, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: tbz w19, #0, .LBB6_4
-; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: tbz w19, #0, .LBB6_6
+; CHECK-NEXT: // %bb.5:
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: .LBB6_4:
+; CHECK-NEXT: .LBB6_6:
; CHECK-NEXT: ldr p0, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p1, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index c72077bd311b4..6697f1e54b46f 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -376,6 +376,13 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: smstop sm
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB3_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: bl scalable_callee
; CHECK-NEXT: smstart sm
@@ -472,6 +479,13 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: //APP
; FP-CHECK-NEXT: //NO_APP
; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: rdvl x8, #1
+; FP-CHECK-NEXT: rdsvl x9, #1
+; FP-CHECK-NEXT: cmp x8, x9
+; FP-CHECK-NEXT: b.eq .LBB3_2
+; FP-CHECK-NEXT: // %bb.1:
+; FP-CHECK-NEXT: brk #0x1
+; FP-CHECK-NEXT: .LBB3_2:
; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
; FP-CHECK-NEXT: bl scalable_callee
; FP-CHECK-NEXT: smstart sm
diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
index a23854759d688..9e711fe05f462 100644
--- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
@@ -2,7 +2,7 @@
; We should have both spill and reload for %arg.
-; CHECK: remark: <unknown>:0:0: 2 spills 2.000000e+00 total spills cost 3 reloads 3.000000e+00 total reloads cost generated in function
+; CHECK: remark: <unknown>:0:0: 2 spills 1.500000e+00 total spills cost 3 reloads 1.500000e+00 total reloads cost 1 virtual registers copies 1.000000e+00 total copies cost generated in function
define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind #0 {
%res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
%and = and <vscale x 2 x i1> %res, %arg
>From ac52098fe468020a6b72819096367496bdc9c292 Mon Sep 17 00:00:00 2001
From: Mary Kassayova <mary.kassayova at arm.com>
Date: Wed, 10 Sep 2025 13:40:05 +0000
Subject: [PATCH 2/4] Addressed comments
---
.../Target/AArch64/AArch64ISelLowering.cpp | 37 ++--
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 +-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 6 -
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 11 +
.../CodeGen/AArch64/sme-streaming-checkvl.ll | 191 ++++++++++++++++--
.../CodeGen/AArch64/spill-reload-remarks.ll | 2 +-
6 files changed, 202 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9380522d65aae..19ae0e848a9e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2941,8 +2941,8 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
}
MachineBasicBlock *
-AArch64TargetLowering::EmitCheckVL(MachineInstr &MI,
- MachineBasicBlock *MBB) const {
+AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
@@ -3389,8 +3389,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::PROBED_STACKALLOC_DYN:
return EmitDynamicProbedAlloc(MI, BB);
- case AArch64::CHECK_MATCHING_VL:
- return EmitCheckVL(MI, BB);
+ case AArch64::CHECK_MATCHING_VL_PSEUDO:
+ return EmitCheckMatchingVL(MI, BB);
case AArch64::LD1_MXIPXX_H_PSEUDO_B:
return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
@@ -9162,11 +9162,9 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}
-SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
- bool Enable, SDValue Chain,
- SDValue InGlue,
- unsigned Condition,
- bool HasSVECC) const {
+SDValue AArch64TargetLowering::changeStreamingMode(
+ SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
+ unsigned Condition, bool InsertVectorLengthCheck) const {
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setHasStreamingModeChanges(true);
@@ -9197,20 +9195,18 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
if (InGlue)
Ops.push_back(InGlue);
- if (!HasSVECC)
+ if (!InsertVectorLengthCheck)
return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
SmallVector<SDValue, 2> Ops = {Chain};
if (InGlue)
Ops.push_back(InGlue);
- return SDValue(DAG.getMachineNode(AArch64::CHECK_MATCHING_VL, DL,
- DAG.getVTList(MVT::Other, MVT::Glue),
- Ops),
- 0);
+ return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), Ops);
};
- // NS -> S
+ // Non-streaming -> Streaming
if (Enable) {
SDValue CheckVL = GetCheckVL(Chain, InGlue);
@@ -9226,7 +9222,7 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
}
- // S -> NS
+ // Streaming -> Non-streaming
SDValue StreamingModeInstr =
DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
return GetCheckVL(StreamingModeInstr.getValue(0),
@@ -9813,10 +9809,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue InGlue;
if (RequiresSMChange) {
- Chain =
- changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
- Chain, InGlue, getSMToggleCondition(CallAttrs),
- CallConv == CallingConv::AArch64_SVE_VectorCall);
+ bool InsertVectorLengthCheck =
+ (CallConv == CallingConv::AArch64_SVE_VectorCall);
+ Chain = changeStreamingMode(
+ DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
+ getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
InGlue = Chain.getValue(1);
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e6385a059b875..ff073d3eafb1f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -168,8 +168,8 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *EmitCheckVL(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
+ MachineBasicBlock *EmitCheckMatchingVL(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
@@ -536,7 +536,7 @@ class AArch64TargetLowering : public TargetLowering {
/// AArch64SME::ToggleCondition.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
SDValue Chain, SDValue InGlue, unsigned Condition,
- bool HasSVECC = false) const;
+ bool InsertVectorLengthCheck = false) const;
bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 7cc3dc9a2171c..f0020a9a3c91d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1367,12 +1367,6 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs),
} // Defs = [SP, NZCV], Uses = [SP] in
} // hasSideEffects = 1, isCodeGenOnly = 1
-// Pseudo-instruction that compares the current SVE vector length (VL) with the
-// streaming vector length (SVL). If the two lengths do not match, the check
-// lowers to a `brk`, causing a trap.
-let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in
-def CHECK_MATCHING_VL : Pseudo<(outs), (ins), []>, Sched<[]>;
-
let isReMaterializable = 1, isCodeGenOnly = 1 in {
// FIXME: The following pseudo instructions are only needed because remat
// cannot handle multiple instructions. When that changes, they can be
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 601dc34d74b9c..430b7382de216 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -48,6 +48,17 @@ let usesCustomInserter = 1 in {
}
def : Pat<(i64 (AArch64EntryPStateSM)), (EntryPStateSM)>;
+// Pseudo-instruction that compares the current SVE vector length (VL) with the
+// streaming vector length (SVL). If the two lengths do not match, the check
+// lowers to a `brk`, causing a trap.
+let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in
+def CHECK_MATCHING_VL_PSEUDO : Pseudo<(outs), (ins), []>, Sched<[]>;
+
+def AArch64_check_matching_vl
+ : SDNode<"AArch64ISD::CHECK_MATCHING_VL", SDTypeProfile<0, 0,[]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def : Pat<(AArch64_check_matching_vl), (CHECK_MATCHING_VL_PSEUDO)>;
+
//===----------------------------------------------------------------------===//
// Old SME ABI lowering ISD nodes/pseudos (deprecated)
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
index 631d120f3c872..1f36fd13fbaa3 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
@@ -1,9 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
declare void @bar_enabled(<vscale x 4 x i32>) #0
+declare void @bar(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @bar_retv_enabled() #0
+declare <vscale x 4 x i32> @bar_retv()
; Non-streaming -> calls streaming callee
define void @foo_non_streaming_pass_arg(ptr %arg) {
@@ -107,24 +110,22 @@ define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 {
; CHECK-NEXT: .cfi_offset b15, -1136
; CHECK-NEXT: sub sp, sp, #1024
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: rdvl x9, #1
-; CHECK-NEXT: rdsvl x10, #1
-; CHECK-NEXT: cmp x9, x10
+; CHECK-NEXT: mrs x19, SVCR
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: b.eq .LBB1_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB1_2: // %entry
-; CHECK-NEXT: ldr z0, [x8]
+; CHECK-NEXT: ldr z0, [x0]
; CHECK-NEXT: sub x8, x29, #1088
; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: tbnz w0, #0, .LBB1_4
+; CHECK-NEXT: tbnz w19, #0, .LBB1_4
; CHECK-NEXT: // %bb.3: // %entry
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4: // %entry
; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: bl bar_enabled
; CHECK-NEXT: tbnz w19, #0, .LBB1_6
; CHECK-NEXT: // %bb.5: // %entry
@@ -163,7 +164,75 @@ entry:
ret void
}
-declare <vscale x 4 x i32> @bar_retv_enabled() #0
+; Streaming -> calls non-streaming callee
+define void @foo_streaming_pass_arg(ptr %arg) #0 {
+; CHECK-LABEL: foo_streaming_pass_arg:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #1120
+; CHECK-NEXT: .cfi_def_cfa_offset 1120
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w28, -8
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_offset b8, -1064
+; CHECK-NEXT: .cfi_offset b9, -1072
+; CHECK-NEXT: .cfi_offset b10, -1080
+; CHECK-NEXT: .cfi_offset b11, -1088
+; CHECK-NEXT: .cfi_offset b12, -1096
+; CHECK-NEXT: .cfi_offset b13, -1104
+; CHECK-NEXT: .cfi_offset b14, -1112
+; CHECK-NEXT: .cfi_offset b15, -1120
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 2144
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB2_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB2_2: // %entry
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 1120
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1120
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+entry:
+ %v = load <vscale x 4 x i32>, ptr %arg, align 16
+ tail call void @bar(<vscale x 4 x i32> %v)
+ ret void
+}
; Non-streaming -> returns SVE value from streaming callee
define void @foo_non_streaming_retval(ptr %ptr) {
@@ -197,10 +266,10 @@ define void @foo_non_streaming_retval(ptr %ptr) {
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB2_2
+; CHECK-NEXT: b.eq .LBB3_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
-; CHECK-NEXT: .LBB2_2: // %entry
+; CHECK-NEXT: .LBB3_2: // %entry
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl bar_retv_enabled
@@ -273,27 +342,26 @@ define void @foo_streaming_compatible_retval(ptr %ptr) #1 {
; CHECK-NEXT: .cfi_offset b15, -1136
; CHECK-NEXT: sub sp, sp, #1024
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: mrs x20, SVCR
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB3_2
+; CHECK-NEXT: b.eq .LBB4_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
-; CHECK-NEXT: .LBB3_2: // %entry
-; CHECK-NEXT: tbnz w0, #0, .LBB3_4
+; CHECK-NEXT: .LBB4_2: // %entry
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: tbnz w20, #0, .LBB4_4
; CHECK-NEXT: // %bb.3: // %entry
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: .LBB3_4: // %entry
-; CHECK-NEXT: mov x20, x0
+; CHECK-NEXT: .LBB4_4: // %entry
; CHECK-NEXT: bl bar_retv_enabled
; CHECK-NEXT: sub x8, x29, #1088
; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: tbnz w20, #0, .LBB3_6
+; CHECK-NEXT: tbnz w20, #0, .LBB4_6
; CHECK-NEXT: // %bb.5: // %entry
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: .LBB3_6: // %entry
+; CHECK-NEXT: .LBB4_6: // %entry
; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: str z0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
@@ -331,5 +399,86 @@ entry:
ret void
}
+; Streaming -> returns SVE value from non-streaming callee
+define void @foo_streaming_retval(ptr %ptr) #0 {
+; CHECK-LABEL: foo_streaming_retval:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #1136
+; CHECK-NEXT: .cfi_def_cfa_offset 1136
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1088
+; CHECK-NEXT: .cfi_def_cfa w29, 48
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w28, -24
+; CHECK-NEXT: .cfi_offset vg, -32
+; CHECK-NEXT: .cfi_offset w30, -40
+; CHECK-NEXT: .cfi_offset w29, -48
+; CHECK-NEXT: .cfi_offset b8, -1080
+; CHECK-NEXT: .cfi_offset b9, -1088
+; CHECK-NEXT: .cfi_offset b10, -1096
+; CHECK-NEXT: .cfi_offset b11, -1104
+; CHECK-NEXT: .cfi_offset b12, -1112
+; CHECK-NEXT: .cfi_offset b13, -1120
+; CHECK-NEXT: .cfi_offset b14, -1128
+; CHECK-NEXT: .cfi_offset b15, -1136
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB5_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB5_2: // %entry
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: bl bar_retv
+; CHECK-NEXT: sub x8, x29, #1088
+; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: str z0, [x19]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa wsp, 1136
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1136
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+entry:
+ %v = tail call <vscale x 4 x i32> @bar_retv()
+ store <vscale x 4 x i32> %v, ptr %ptr, align 16
+ ret void
+}
+
attributes #0 = { "aarch64_pstate_sm_enabled" }
attributes #1 = { "aarch64_pstate_sm_compatible" }
diff --git a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
index 9e711fe05f462..33a4ecd56e35b 100644
--- a/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
+++ b/llvm/test/CodeGen/AArch64/spill-reload-remarks.ll
@@ -2,7 +2,7 @@
; We should have both spill and reload for %arg.
-; CHECK: remark: <unknown>:0:0: 2 spills 1.500000e+00 total spills cost 3 reloads 1.500000e+00 total reloads cost 1 virtual registers copies 1.000000e+00 total copies cost generated in function
+; CHECK: remark: <unknown>:0:0: 2 spills 1.500000e+00 total spills cost 3 reloads 1.500000e+00 total reloads cost generated in function
define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind #0 {
%res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
%and = and <vscale x 2 x i1> %res, %arg
>From f3f31c9ed12d892e815b5df5253aae26e3b0cf07 Mon Sep 17 00:00:00 2001
From: Mary Kassayova <mary.kassayova at arm.com>
Date: Thu, 11 Sep 2025 16:08:57 +0000
Subject: [PATCH 3/4] Added MIR test & some comments
---
.../Target/AArch64/AArch64ISelLowering.cpp | 4 +
.../AArch64/sme-streaming-checkvl-mir.ll | 209 ++++++++++++++++++
.../CodeGen/AArch64/sme-streaming-checkvl.ll | 2 +-
3 files changed, 214 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 19ae0e848a9e6..54349ae5ceb67 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2956,9 +2956,11 @@ AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
Register RegSVL = MRI.createVirtualRegister(RC);
Register RegCheck = MRI.createVirtualRegister(RC);
+ // Read VL and Streaming VL
BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL).addImm(1);
BuildMI(*MBB, MI, DL, TII->get(AArch64::RDSVLI_XI), RegSVL).addImm(1);
+ // Compare vector lengths
BuildMI(*MBB, MI, DL, TII->get(AArch64::SUBXrr), RegCheck)
.addReg(RegVL)
.addReg(RegSVL);
@@ -2968,6 +2970,7 @@ AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
MF->insert(It, TrapBB);
MF->insert(It, PassBB);
+ // Continue if vector lengths match
BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
.addReg(RegCheck)
.addMBB(PassBB);
@@ -2977,6 +2980,7 @@ AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
std::next(MachineBasicBlock::iterator(MI)), MBB->end());
PassBB->transferSuccessorsAndUpdatePHIs(MBB);
+ // Trap if vector lengths mismatch
BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
MBB->addSuccessor(TrapBB);
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll
new file mode 100644
index 0000000000000..8ea3e1f71c7ad
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -stop-before=finalize-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-BEFORE-ISEL
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-AFTER-ISEL
+
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @bar_enabled(<vscale x 4 x i32>) #0
+declare void @bar(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @bar_retv_enabled() #0
+declare <vscale x 4 x i32> @bar_retv()
+
+; Non-streaming -> calls streaming callee
+define void @foo_non_streaming_pass_arg(ptr %arg) {
+ ; CHECK-BEFORE-ISEL-LABEL: name: foo_non_streaming_pass_arg
+ ; CHECK-BEFORE-ISEL: bb.0.entry:
+ ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0
+ ; CHECK-BEFORE-ISEL-NEXT: {{ $}}
+ ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-BEFORE-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load (<vscale x 1 x s128>) from %ir.arg)
+ ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO
+ ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-BEFORE-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]]
+ ; CHECK-BEFORE-ISEL-NEXT: BL @bar_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
+ ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR
+ ;
+ ; CHECK-AFTER-ISEL-LABEL: name: foo_non_streaming_pass_arg
+ ; CHECK-AFTER-ISEL: bb.0.entry:
+ ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-AFTER-ISEL-NEXT: liveins: $x0
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load (<vscale x 1 x s128>) from %ir.arg)
+ ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]]
+ ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: bb.1.entry:
+ ; CHECK-AFTER-ISEL-NEXT: successors:
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: BRK 1
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: bb.2.entry:
+ ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-AFTER-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]]
+ ; CHECK-AFTER-ISEL-NEXT: BL @bar_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
+ ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR
+entry:
+ %v = load <vscale x 4 x i32>, ptr %arg, align 16
+ tail call void @bar_enabled(<vscale x 4 x i32> %v) #0
+ ret void
+}
+
+; Streaming -> calls non-streaming callee
+define void @foo_streaming_pass_arg(ptr %arg) #0 {
+ ; CHECK-BEFORE-ISEL-LABEL: name: foo_streaming_pass_arg
+ ; CHECK-BEFORE-ISEL: bb.0.entry:
+ ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0
+ ; CHECK-BEFORE-ISEL-NEXT: {{ $}}
+ ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-BEFORE-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load (<vscale x 1 x s128>) from %ir.arg)
+ ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO
+ ; CHECK-BEFORE-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]]
+ ; CHECK-BEFORE-ISEL-NEXT: BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
+ ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR
+ ;
+ ; CHECK-AFTER-ISEL-LABEL: name: foo_streaming_pass_arg
+ ; CHECK-AFTER-ISEL: bb.0.entry:
+ ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-AFTER-ISEL-NEXT: liveins: $x0
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load (<vscale x 1 x s128>) from %ir.arg)
+ ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]]
+ ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: bb.1.entry:
+ ; CHECK-AFTER-ISEL-NEXT: successors:
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: BRK 1
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: bb.2.entry:
+ ; CHECK-AFTER-ISEL-NEXT: $z0 = COPY [[LDR_ZXI]]
+ ; CHECK-AFTER-ISEL-NEXT: BL @bar, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit $z0, implicit-def $sp
+ ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR
+entry:
+ %v = load <vscale x 4 x i32>, ptr %arg, align 16
+ tail call void @bar(<vscale x 4 x i32> %v)
+ ret void
+}
+
+; Non-streaming -> returns SVE value from streaming callee
+define void @foo_non_streaming_retval(ptr %ptr) {
+ ; CHECK-BEFORE-ISEL-LABEL: name: foo_non_streaming_retval
+ ; CHECK-BEFORE-ISEL: bb.0.entry:
+ ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0
+ ; CHECK-BEFORE-ISEL-NEXT: {{ $}}
+ ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO
+ ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-BEFORE-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
+ ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-BEFORE-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-BEFORE-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-BEFORE-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store (<vscale x 1 x s128>) into %ir.ptr)
+ ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR
+ ;
+ ; CHECK-AFTER-ISEL-LABEL: name: foo_non_streaming_retval
+ ; CHECK-AFTER-ISEL: bb.0.entry:
+ ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-AFTER-ISEL-NEXT: liveins: $x0
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]]
+ ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: bb.1.entry:
+ ; CHECK-AFTER-ISEL-NEXT: successors:
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: BRK 1
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: bb.2.entry:
+ ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
+ ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store (<vscale x 1 x s128>) into %ir.ptr)
+ ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR
+entry:
+ %v = tail call <vscale x 4 x i32> @bar_retv_enabled() #0
+ store <vscale x 4 x i32> %v, ptr %ptr, align 16
+ ret void
+}
+
+; Streaming -> returns SVE value from non-streaming callee
+define void @foo_streaming_retval(ptr %ptr) #0 {
+ ; CHECK-BEFORE-ISEL-LABEL: name: foo_streaming_retval
+ ; CHECK-BEFORE-ISEL: bb.0.entry:
+ ; CHECK-BEFORE-ISEL-NEXT: liveins: $x0
+ ; CHECK-BEFORE-ISEL-NEXT: {{ $}}
+ ; CHECK-BEFORE-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-BEFORE-ISEL-NEXT: CHECK_MATCHING_VL_PSEUDO
+ ; CHECK-BEFORE-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
+ ; CHECK-BEFORE-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-BEFORE-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-BEFORE-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-BEFORE-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-BEFORE-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store (<vscale x 1 x s128>) into %ir.ptr)
+ ; CHECK-BEFORE-ISEL-NEXT: RET_ReallyLR
+ ;
+ ; CHECK-AFTER-ISEL-LABEL: name: foo_streaming_retval
+ ; CHECK-AFTER-ISEL: bb.0.entry:
+ ; CHECK-AFTER-ISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-AFTER-ISEL-NEXT: liveins: $x0
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
+ ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]]
+ ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: bb.1.entry:
+ ; CHECK-AFTER-ISEL-NEXT: successors:
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: BRK 1
+ ; CHECK-AFTER-ISEL-NEXT: {{ $}}
+ ; CHECK-AFTER-ISEL-NEXT: bb.2.entry:
+ ; CHECK-AFTER-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
+ ; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
+ ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store (<vscale x 1 x s128>) into %ir.ptr)
+ ; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR
+entry:
+ %v = tail call <vscale x 4 x i32> @bar_retv()
+ store <vscale x 4 x i32> %v, ptr %ptr, align 16
+ ret void
+}
+
+attributes #0 = { "aarch64_pstate_sm_enabled" }
+attributes #1 = { "aarch64_pstate_sm_compatible" }
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
index 1f36fd13fbaa3..8c197ef97b116 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 -verify-machineinstrs < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
>From 3a2c02a6a71737eae4c286ff417404b99d04c31d Mon Sep 17 00:00:00 2001
From: Mary Kassayova <mary.kassayova at arm.com>
Date: Mon, 15 Sep 2025 14:05:39 +0000
Subject: [PATCH 4/4] Replaced RDSVL+SUB with ADDSVL, simplified
changeStreamingMode
---
.../Target/AArch64/AArch64ISelLowering.cpp | 93 ++++++++++---------
.../AArch64/sme-callee-save-restore-pairs.ll | 10 +-
.../test/CodeGen/AArch64/sme-peephole-opts.ll | 10 +-
.../AArch64/sme-streaming-checkvl-mir.ll | 40 ++++----
.../CodeGen/AArch64/sme-streaming-checkvl.ll | 34 +++----
.../sme-streaming-compatible-interface.ll | 10 +-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 10 +-
7 files changed, 99 insertions(+), 108 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 54349ae5ceb67..df50375ea3763 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2944,27 +2944,34 @@ MachineBasicBlock *
AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
MachineFunction *MF = MBB->getParent();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- DebugLoc DL = MI.getDebugLoc();
- MachineFunction::iterator It = ++MBB->getIterator();
-
- const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
MachineRegisterInfo &MRI = MF->getRegInfo();
- Register RegVL = MRI.createVirtualRegister(RC);
- Register RegSVL = MRI.createVirtualRegister(RC);
- Register RegCheck = MRI.createVirtualRegister(RC);
+ const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
+ const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
- // Read VL and Streaming VL
- BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL).addImm(1);
- BuildMI(*MBB, MI, DL, TII->get(AArch64::RDSVLI_XI), RegSVL).addImm(1);
+ Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
+ Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
+ Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
+ Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
- // Compare vector lengths
- BuildMI(*MBB, MI, DL, TII->get(AArch64::SUBXrr), RegCheck)
- .addReg(RegVL)
- .addReg(RegSVL);
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // RDVL requires GPR64, ADDSVL requires GPR64sp
+ // We need to insert COPY instructions, these will later be removed by the
+ // RegisterCoalescer
+ BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
+ .addReg(RegVL_GPR);
+ BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
+ .addReg(RegVL_GPRsp)
+ .addImm(-1);
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
+ .addReg(RegSVL_GPRsp);
+
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+ MachineFunction::iterator It = ++MBB->getIterator();
MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(It, TrapBB);
@@ -2972,7 +2979,7 @@ AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
// Continue if vector lengths match
BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
- .addReg(RegCheck)
+ .addReg(RegSVL_GPR)
.addMBB(PassBB);
// Transfer rest of current BB to PassBB
@@ -9173,6 +9180,22 @@ SDValue AArch64TargetLowering::changeStreamingMode(
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
FuncInfo->setHasStreamingModeChanges(true);
+ auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
+ SmallVector<SDValue, 2> Ops = {Chain};
+ if (InGlue)
+ Ops.push_back(InGlue);
+ return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+ };
+
+ if (InsertVectorLengthCheck && Enable) {
+ // Non-streaming -> Streaming
+ // Insert vector length check before smstart
+ SDValue CheckVL = GetCheckVL(Chain, InGlue);
+ Chain = CheckVL.getValue(0);
+ InGlue = CheckVL.getValue(1);
+ }
+
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
SDValue MSROp =
@@ -9199,38 +9222,16 @@ SDValue AArch64TargetLowering::changeStreamingMode(
if (InGlue)
Ops.push_back(InGlue);
- if (!InsertVectorLengthCheck)
- return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
-
- auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
- SmallVector<SDValue, 2> Ops = {Chain};
- if (InGlue)
- Ops.push_back(InGlue);
- return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
- DAG.getVTList(MVT::Other, MVT::Glue), Ops);
- };
-
- // Non-streaming -> Streaming
- if (Enable) {
- SDValue CheckVL = GetCheckVL(Chain, InGlue);
-
- // Replace chain
- Ops[0] = CheckVL.getValue(0);
-
- // Replace/append glue
- if (InGlue)
- Ops.back() = CheckVL.getValue(1);
- else
- Ops.push_back(CheckVL.getValue(1));
+ SDValue SMChange =
+ DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
- return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
- }
+ if (!InsertVectorLengthCheck || Enable)
+ return SMChange;
// Streaming -> Non-streaming
- SDValue StreamingModeInstr =
- DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
- return GetCheckVL(StreamingModeInstr.getValue(0),
- StreamingModeInstr.getValue(1));
+ // Insert the vector length check after smstop, since the non-streaming VL
+ // cannot be read while in streaming mode.
+ return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
}
// Emit a call to __arm_sme_save or __arm_sme_restore.
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index af6ebb3846738..b58a857f3a3cb 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -48,9 +48,8 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; NOPAIR-NEXT: smstop sm
; NOPAIR-NEXT: .LBB0_2:
; NOPAIR-NEXT: rdvl x8, #1
-; NOPAIR-NEXT: rdsvl x9, #1
-; NOPAIR-NEXT: cmp x8, x9
-; NOPAIR-NEXT: b.eq .LBB0_4
+; NOPAIR-NEXT: addsvl x8, x8, #-1
+; NOPAIR-NEXT: cbz x8, .LBB0_4
; NOPAIR-NEXT: // %bb.3:
; NOPAIR-NEXT: brk #0x1
; NOPAIR-NEXT: .LBB0_4:
@@ -135,9 +134,8 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; PAIR-NEXT: smstop sm
; PAIR-NEXT: .LBB0_2:
; PAIR-NEXT: rdvl x8, #1
-; PAIR-NEXT: rdsvl x9, #1
-; PAIR-NEXT: cmp x8, x9
-; PAIR-NEXT: b.eq .LBB0_4
+; PAIR-NEXT: addsvl x8, x8, #-1
+; PAIR-NEXT: cbz x8, .LBB0_4
; PAIR-NEXT: // %bb.3:
; PAIR-NEXT: brk #0x1
; PAIR-NEXT: .LBB0_4:
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 9ff414d401426..1659b217ce0be 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -530,9 +530,8 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.ne .LBB14_2
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbnz x8, .LBB14_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: mov x19, x0
@@ -541,9 +540,8 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB14_3
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB14_3
; CHECK-NEXT: .LBB14_2:
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB14_3:
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll
index 8ea3e1f71c7ad..0ac46085d683f 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl-mir.ll
@@ -35,9 +35,10 @@ define void @foo_non_streaming_pass_arg(ptr %arg) {
; CHECK-AFTER-ISEL-NEXT: [[LDR_ZXI:%[0-9]+]]:zpr = LDR_ZXI [[COPY]], 0 :: (load (<vscale x 1 x s128>) from %ir.arg)
; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg
- ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
- ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]]
- ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]]
+ ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]]
+ ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2
; CHECK-AFTER-ISEL-NEXT: {{ $}}
; CHECK-AFTER-ISEL-NEXT: bb.1.entry:
; CHECK-AFTER-ISEL-NEXT: successors:
@@ -84,9 +85,10 @@ define void @foo_streaming_pass_arg(ptr %arg) #0 {
; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg
- ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
- ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]]
- ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]]
+ ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]]
+ ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2
; CHECK-AFTER-ISEL-NEXT: {{ $}}
; CHECK-AFTER-ISEL-NEXT: bb.1.entry:
; CHECK-AFTER-ISEL-NEXT: successors:
@@ -131,9 +133,10 @@ define void @foo_non_streaming_retval(ptr %ptr) {
; CHECK-AFTER-ISEL-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x0
; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg
- ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
- ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]]
- ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]]
+ ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]]
+ ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2
; CHECK-AFTER-ISEL-NEXT: {{ $}}
; CHECK-AFTER-ISEL-NEXT: bb.1.entry:
; CHECK-AFTER-ISEL-NEXT: successors:
@@ -144,10 +147,10 @@ define void @foo_non_streaming_retval(ptr %ptr) {
; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
; CHECK-AFTER-ISEL-NEXT: BL @bar_retv_enabled, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY $z0
; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
- ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
- ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store (<vscale x 1 x s128>) into %ir.ptr)
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY3]]
+ ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY4]], [[COPY]], 0 :: (store (<vscale x 1 x s128>) into %ir.ptr)
; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR
entry:
%v = tail call <vscale x 4 x i32> @bar_retv_enabled() #0
@@ -182,9 +185,10 @@ define void @foo_streaming_retval(ptr %ptr) #0 {
; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 0, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit-def $sp, implicit-def $z0, implicit $vg, implicit-def $vg, implicit-def $fpmr
; CHECK-AFTER-ISEL-NEXT: [[RDVLI_XI:%[0-9]+]]:gpr64 = RDVLI_XI 1, implicit $vg
- ; CHECK-AFTER-ISEL-NEXT: [[RDSVLI_XI:%[0-9]+]]:gpr64 = RDSVLI_XI 1, implicit $vg
- ; CHECK-AFTER-ISEL-NEXT: [[SUBXrr:%[0-9]+]]:gpr64 = SUBXrr [[RDVLI_XI]], [[RDSVLI_XI]]
- ; CHECK-AFTER-ISEL-NEXT: CBZX [[SUBXrr]], %bb.2
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:gpr64sp = COPY [[RDVLI_XI]]
+ ; CHECK-AFTER-ISEL-NEXT: [[ADDSVL_XXI:%[0-9]+]]:gpr64sp = ADDSVL_XXI [[COPY1]], -1, implicit $vg
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY [[ADDSVL_XXI]]
+ ; CHECK-AFTER-ISEL-NEXT: CBZX [[COPY2]], %bb.2
; CHECK-AFTER-ISEL-NEXT: {{ $}}
; CHECK-AFTER-ISEL-NEXT: bb.1.entry:
; CHECK-AFTER-ISEL-NEXT: successors:
@@ -194,10 +198,10 @@ define void @foo_streaming_retval(ptr %ptr) #0 {
; CHECK-AFTER-ISEL-NEXT: bb.2.entry:
; CHECK-AFTER-ISEL-NEXT: BL @bar_retv, csr_aarch64_sve_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $z0
; CHECK-AFTER-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
- ; CHECK-AFTER-ISEL-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY $z0
; CHECK-AFTER-ISEL-NEXT: MSRpstatesvcrImm1 1, 1, csr_aarch64_smstartstop, implicit-def dead $nzcv, implicit $vg, implicit-def $vg, implicit-def $fpmr
- ; CHECK-AFTER-ISEL-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]]
- ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY2]], [[COPY]], 0 :: (store (<vscale x 1 x s128>) into %ir.ptr)
+ ; CHECK-AFTER-ISEL-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY3]]
+ ; CHECK-AFTER-ISEL-NEXT: STR_ZXI [[COPY4]], [[COPY]], 0 :: (store (<vscale x 1 x s128>) into %ir.ptr)
; CHECK-AFTER-ISEL-NEXT: RET_ReallyLR
entry:
%v = tail call <vscale x 4 x i32> @bar_retv()
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
index 8c197ef97b116..a1eb1ceeaf19b 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
@@ -36,9 +36,8 @@ define void @foo_non_streaming_pass_arg(ptr %arg) {
; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB0_2
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB0_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB0_2: // %entry
@@ -110,11 +109,10 @@ define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 {
; CHECK-NEXT: .cfi_offset b15, -1136
; CHECK-NEXT: sub sp, sp, #1024
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: mrs x19, SVCR
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB1_2
+; CHECK-NEXT: mrs x19, SVCR
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB1_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB1_2: // %entry
@@ -195,9 +193,8 @@ define void @foo_streaming_pass_arg(ptr %arg) #0 {
; CHECK-NEXT: .cfi_def_cfa_offset 2144
; CHECK-NEXT: smstop sm
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB2_2
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB2_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB2_2: // %entry
@@ -264,9 +261,8 @@ define void @foo_non_streaming_retval(ptr %ptr) {
; CHECK-NEXT: .cfi_offset b15, -112
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB3_2
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB3_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB3_2: // %entry
@@ -342,11 +338,10 @@ define void @foo_streaming_compatible_retval(ptr %ptr) #1 {
; CHECK-NEXT: .cfi_offset b15, -1136
; CHECK-NEXT: sub sp, sp, #1024
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: mrs x20, SVCR
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB4_2
+; CHECK-NEXT: mrs x20, SVCR
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB4_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB4_2: // %entry
@@ -434,9 +429,8 @@ define void @foo_streaming_retval(ptr %ptr) #0 {
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: smstop sm
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB5_2
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB5_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB5_2: // %entry
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 95fb68945de44..f2163ad15bafc 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -210,9 +210,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB5_2:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB5_4
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB5_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB5_4:
@@ -308,9 +307,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB6_2:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB6_4
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB6_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB6_4:
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 6697f1e54b46f..125cea7dc469a 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -377,9 +377,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: smstop sm
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: rdsvl x9, #1
-; CHECK-NEXT: cmp x8, x9
-; CHECK-NEXT: b.eq .LBB3_2
+; CHECK-NEXT: addsvl x8, x8, #-1
+; CHECK-NEXT: cbz x8, .LBB3_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: brk #0x1
; CHECK-NEXT: .LBB3_2:
@@ -480,9 +479,8 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: //NO_APP
; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: rdvl x8, #1
-; FP-CHECK-NEXT: rdsvl x9, #1
-; FP-CHECK-NEXT: cmp x8, x9
-; FP-CHECK-NEXT: b.eq .LBB3_2
+; FP-CHECK-NEXT: addsvl x8, x8, #-1
+; FP-CHECK-NEXT: cbz x8, .LBB3_2
; FP-CHECK-NEXT: // %bb.1:
; FP-CHECK-NEXT: brk #0x1
; FP-CHECK-NEXT: .LBB3_2:
More information about the llvm-commits
mailing list