[llvm] [AArch64][SME] Introduce CHECK_MATCHING_VL pseudo for streaming transitions (PR #157510)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 07:20:45 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Mary Kassayova (marykass-arm)
Changes:
This patch adds a new codegen-only pseudo-instruction, `CHECK_MATCHING_VL`, emitted when transitioning between non-streaming or streaming-compatible callers and streaming-enabled callees (in either direction). The pseudo verifies that the current SVE vector length (VL) matches the streaming vector length (SVL); if the two lengths differ, it lowers to a `brk` and the program traps. The check is inserted only for calls that use the SVE vector calling convention (`AArch64_SVE_VectorCall`).
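For illustration, this is the sequence the check boils down to in the updated test CHECK lines (the `.Lmatch` label is hypothetical; the compiler emits numbered `.LBB` labels, and the register choices here simply follow the tests):

```asm
// The check executes while PSTATE.SM is off: before smstart when entering
// streaming mode, and after smstop when leaving it.
rdvl  x8, #1      // current (non-streaming) SVE vector length, in bytes
rdsvl x9, #1      // streaming vector length, in bytes
cmp   x8, x9
b.eq  .Lmatch     // VL == SVL: continue to the call
brk   #0x1        // VL != SVL: trap
.Lmatch:
```

Since `rdvl` runs with streaming mode disabled, the comparison asserts that the non-streaming VL equals the SVL, so scalable-vector arguments and return values keep the same size on both sides of the transition.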
---
Patch is 34.93 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157510.diff
9 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+89-8)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+5-2)
- (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+11)
- (modified) llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll (+20-6)
- (modified) llvm/test/CodeGen/AArch64/sme-peephole-opts.ll (+13-1)
- (added) llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll (+484)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll (+20-6)
- (modified) llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll (+14)
- (modified) llvm/test/CodeGen/AArch64/spill-reload-remarks.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 351235dd5bcdd..19ae0e848a9e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2940,6 +2940,52 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
  return NextInst->getParent();
}

+MachineBasicBlock *
+AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
+                                           MachineBasicBlock *MBB) const {
+  MachineFunction *MF = MBB->getParent();
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction::iterator It = ++MBB->getIterator();
+
+  const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  Register RegVL = MRI.createVirtualRegister(RC);
+  Register RegSVL = MRI.createVirtualRegister(RC);
+  Register RegCheck = MRI.createVirtualRegister(RC);
+
+  BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL).addImm(1);
+  BuildMI(*MBB, MI, DL, TII->get(AArch64::RDSVLI_XI), RegSVL).addImm(1);
+
+  BuildMI(*MBB, MI, DL, TII->get(AArch64::SUBXrr), RegCheck)
+      .addReg(RegVL)
+      .addReg(RegSVL);
+
+  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, TrapBB);
+  MF->insert(It, PassBB);
+
+  BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
+      .addReg(RegCheck)
+      .addMBB(PassBB);
+
+  // Transfer rest of current BB to PassBB
+  PassBB->splice(PassBB->begin(), MBB,
+                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+  PassBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+  BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
+
+  MBB->addSuccessor(TrapBB);
+  MBB->addSuccessor(PassBB);
+
+  MI.eraseFromParent();
+  return PassBB;
+}
+
MachineBasicBlock *
AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                    MachineInstr &MI,
@@ -3343,6 +3389,9 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
  case AArch64::PROBED_STACKALLOC_DYN:
    return EmitDynamicProbedAlloc(MI, BB);
+  case AArch64::CHECK_MATCHING_VL_PSEUDO:
+    return EmitCheckMatchingVL(MI, BB);
+
  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
    return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -9113,10 +9162,9 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
}
}
-SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
-                                                   bool Enable, SDValue Chain,
-                                                   SDValue InGlue,
-                                                   unsigned Condition) const {
+SDValue AArch64TargetLowering::changeStreamingMode(
+    SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
+    unsigned Condition, bool InsertVectorLengthCheck) const {
  MachineFunction &MF = DAG.getMachineFunction();
  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
  FuncInfo->setHasStreamingModeChanges(true);
@@ -9147,7 +9195,38 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
  if (InGlue)
    Ops.push_back(InGlue);
-  return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+  if (!InsertVectorLengthCheck)
+    return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+
+  auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
+    SmallVector<SDValue, 2> Ops = {Chain};
+    if (InGlue)
+      Ops.push_back(InGlue);
+    return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
+                       DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+  };
+
+  // Non-streaming -> Streaming
+  if (Enable) {
+    SDValue CheckVL = GetCheckVL(Chain, InGlue);
+
+    // Replace chain
+    Ops[0] = CheckVL.getValue(0);
+
+    // Replace/append glue
+    if (InGlue)
+      Ops.back() = CheckVL.getValue(1);
+    else
+      Ops.push_back(CheckVL.getValue(1));
+
+    return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+  }
+
+  // Streaming -> Non-streaming
+  SDValue StreamingModeInstr =
+      DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
+  return GetCheckVL(StreamingModeInstr.getValue(0),
+                    StreamingModeInstr.getValue(1));
}
// Emit a call to __arm_sme_save or __arm_sme_restore.
@@ -9730,9 +9809,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
  SDValue InGlue;
  if (RequiresSMChange) {
-    Chain =
-        changeStreamingMode(DAG, DL, CallAttrs.callee().hasStreamingInterface(),
-                            Chain, InGlue, getSMToggleCondition(CallAttrs));
+    bool InsertVectorLengthCheck =
+        (CallConv == CallingConv::AArch64_SVE_VectorCall);
+    Chain = changeStreamingMode(
+        DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
+        getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
    InGlue = Chain.getValue(1);
  }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f5d14905cac66..ff073d3eafb1f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -168,6 +168,9 @@ class AArch64TargetLowering : public TargetLowering {
  MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const;
+  MachineBasicBlock *EmitCheckMatchingVL(MachineInstr &MI,
+                                         MachineBasicBlock *MBB) const;
+
  MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                  MachineInstr &MI,
                                  MachineBasicBlock *BB) const;
@@ -532,8 +535,8 @@ class AArch64TargetLowering : public TargetLowering {
  /// node. \p Condition should be one of the enum values from
  /// AArch64SME::ToggleCondition.
  SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable,
-                              SDValue Chain, SDValue InGlue,
-                              unsigned Condition) const;
+                              SDValue Chain, SDValue InGlue, unsigned Condition,
+                              bool InsertVectorLengthCheck = false) const;

  bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f0020a9a3c91d..083dfc3bd382e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1367,6 +1367,17 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs),
} // Defs = [SP, NZCV], Uses = [SP] in
} // hasSideEffects = 1, isCodeGenOnly = 1
+// Pseudo-instruction that compares the current SVE vector length (VL) with the
+// streaming vector length (SVL). If the two lengths do not match, the check
+// lowers to a `brk`, causing a trap.
+let hasSideEffects = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in
+def CHECK_MATCHING_VL_PSEUDO : Pseudo<(outs), (ins), []>, Sched<[]>;
+
+def AArch64_check_matching_vl
+    : SDNode<"AArch64ISD::CHECK_MATCHING_VL", SDTypeProfile<0, 0, []>,
+             [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def : Pat<(AArch64_check_matching_vl), (CHECK_MATCHING_VL_PSEUDO)>;
+
let isReMaterializable = 1, isCodeGenOnly = 1 in {
// FIXME: The following pseudo instructions are only needed because remat
// cannot handle multiple instructions. When that changes, they can be
diff --git a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
index cf42db7aa65bd..af6ebb3846738 100644
--- a/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
+++ b/llvm/test/CodeGen/AArch64/sme-callee-save-restore-pairs.ll
@@ -47,12 +47,19 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; NOPAIR-NEXT: // %bb.1:
; NOPAIR-NEXT: smstop sm
; NOPAIR-NEXT: .LBB0_2:
+; NOPAIR-NEXT: rdvl x8, #1
+; NOPAIR-NEXT: rdsvl x9, #1
+; NOPAIR-NEXT: cmp x8, x9
+; NOPAIR-NEXT: b.eq .LBB0_4
+; NOPAIR-NEXT: // %bb.3:
+; NOPAIR-NEXT: brk #0x1
+; NOPAIR-NEXT: .LBB0_4:
; NOPAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; NOPAIR-NEXT: bl my_func2
-; NOPAIR-NEXT: tbz w19, #0, .LBB0_4
-; NOPAIR-NEXT: // %bb.3:
+; NOPAIR-NEXT: tbz w19, #0, .LBB0_6
+; NOPAIR-NEXT: // %bb.5:
; NOPAIR-NEXT: smstart sm
-; NOPAIR-NEXT: .LBB0_4:
+; NOPAIR-NEXT: .LBB0_6:
; NOPAIR-NEXT: addvl sp, sp, #1
; NOPAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; NOPAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -127,12 +134,19 @@ define void @fbyte(<vscale x 16 x i8> %v) #0{
; PAIR-NEXT: // %bb.1:
; PAIR-NEXT: smstop sm
; PAIR-NEXT: .LBB0_2:
+; PAIR-NEXT: rdvl x8, #1
+; PAIR-NEXT: rdsvl x9, #1
+; PAIR-NEXT: cmp x8, x9
+; PAIR-NEXT: b.eq .LBB0_4
+; PAIR-NEXT: // %bb.3:
+; PAIR-NEXT: brk #0x1
+; PAIR-NEXT: .LBB0_4:
; PAIR-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; PAIR-NEXT: bl my_func2
-; PAIR-NEXT: tbz w19, #0, .LBB0_4
-; PAIR-NEXT: // %bb.3:
+; PAIR-NEXT: tbz w19, #0, .LBB0_6
+; PAIR-NEXT: // %bb.5:
; PAIR-NEXT: smstart sm
-; PAIR-NEXT: .LBB0_4:
+; PAIR-NEXT: .LBB0_6:
; PAIR-NEXT: addvl sp, sp, #1
; PAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; PAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index 80827c2547780..9ff414d401426 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -527,14 +527,26 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" {
; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: mov z0.s, #0 // =0x0
-; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.ne .LBB14_2
+; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: bl callee_farg_fret
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB14_3
+; CHECK-NEXT: .LBB14_2:
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB14_3:
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl callee_farg_fret
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
new file mode 100644
index 0000000000000..1f36fd13fbaa3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-checkvl.ll
@@ -0,0 +1,484 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme,+sme2p1 < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+declare void @bar_enabled(<vscale x 4 x i32>) #0
+declare void @bar(<vscale x 4 x i32>)
+declare <vscale x 4 x i32> @bar_retv_enabled() #0
+declare <vscale x 4 x i32> @bar_retv()
+
+; Non-streaming -> calls streaming callee
+define void @foo_non_streaming_pass_arg(ptr %arg) {
+; CHECK-LABEL: foo_non_streaming_pass_arg:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w28, -8
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB0_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB0_2: // %entry
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: sub x8, x29, #64
+; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: bl bar_enabled
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_def_cfa wsp, 96
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+entry:
+ %v = load <vscale x 4 x i32>, ptr %arg, align 16
+ tail call void @bar_enabled(<vscale x 4 x i32> %v) #0
+ ret void
+}
+
+; Streaming-compatible -> calls streaming callee
+define void @foo_streaming_compatible_pass_arg(ptr %arg) #1 {
+; CHECK-LABEL: foo_streaming_compatible_pass_arg:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #1136
+; CHECK-NEXT: .cfi_def_cfa_offset 1136
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #1120] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1088
+; CHECK-NEXT: .cfi_def_cfa w29, 48
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w28, -24
+; CHECK-NEXT: .cfi_offset vg, -32
+; CHECK-NEXT: .cfi_offset w30, -40
+; CHECK-NEXT: .cfi_offset w29, -48
+; CHECK-NEXT: .cfi_offset b8, -1080
+; CHECK-NEXT: .cfi_offset b9, -1088
+; CHECK-NEXT: .cfi_offset b10, -1096
+; CHECK-NEXT: .cfi_offset b11, -1104
+; CHECK-NEXT: .cfi_offset b12, -1112
+; CHECK-NEXT: .cfi_offset b13, -1120
+; CHECK-NEXT: .cfi_offset b14, -1128
+; CHECK-NEXT: .cfi_offset b15, -1136
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mrs x19, SVCR
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB1_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB1_2: // %entry
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: sub x8, x29, #1088
+; CHECK-NEXT: str z0, [x8, #-1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: tbnz w19, #0, .LBB1_4
+; CHECK-NEXT: // %bb.3: // %entry
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB1_4: // %entry
+; CHECK-NEXT: ldr z0, [x8, #-1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: bl bar_enabled
+; CHECK-NEXT: tbnz w19, #0, .LBB1_6
+; CHECK-NEXT: // %bb.5: // %entry
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB1_6: // %entry
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa wsp, 1136
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #1120] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1136
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+entry:
+ %v = load <vscale x 4 x i32>, ptr %arg, align 16
+ tail call void @bar_enabled(<vscale x 4 x i32> %v) #0
+ ret void
+}
+
+; Streaming -> calls non-streaming callee
+define void @foo_streaming_pass_arg(ptr %arg) #0 {
+; CHECK-LABEL: foo_streaming_pass_arg:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #1120
+; CHECK-NEXT: .cfi_def_cfa_offset 1120
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #1088] // 8-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #1096] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #1104] // 8-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w28, -8
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_offset b8, -1064
+; CHECK-NEXT: .cfi_offset b9, -1072
+; CHECK-NEXT: .cfi_offset b10, -1080
+; CHECK-NEXT: .cfi_offset b11, -1088
+; CHECK-NEXT: .cfi_offset b12, -1096
+; CHECK-NEXT: .cfi_offset b13, -1104
+; CHECK-NEXT: .cfi_offset b14, -1112
+; CHECK-NEXT: .cfi_offset b15, -1120
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 2144
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: cmp x8, x9
+; CHECK-NEXT: b.eq .LBB2_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: brk #0x1
+; CHECK-NEXT: .LBB2_2: // %entry
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: bl bar
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: .cfi_def_cfa_offset 1120
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x28, [sp, #1112] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #1096] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1088] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1120
+; CHECK-NEXT: .cfi_def...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/157510
More information about the llvm-commits mailing list