[clang] [llvm] [AArch64][SME] Save VG for unwind info when changing streaming-mode (PR #83301)
Kerry McLaughlin via cfe-commits
cfe-commits at lists.llvm.org
Tue Jun 11 05:40:11 PDT 2024
https://github.com/kmclaughlin-arm updated https://github.com/llvm/llvm-project/pull/83301
>From 7688f1710c77ff14b58d7e01925047f22f9f069e Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Wed, 28 Feb 2024 16:33:25 +0000
Subject: [PATCH 01/13] [AArch64][SME] Save VG for unwind info when changing
streaming-mode
If a function requires any streaming-mode change, the vector granule
value must be stored to the stack and unwind info must also describe the
save of VG to this location.
This patch adds VG to the list of callee-saved registers and increases the
callee-saved stack size in determineCalleeSaves if the function requires
streaming-mode changes. A new VG type is added to RegPairInfo; this type is
also used to skip restoring the register in the restore block.
See https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
---
.../Target/AArch64/AArch64FrameLowering.cpp | 60 +-
...compatible-to-normal-fn-wihout-sme-attr.ll | 13 +-
.../CodeGen/AArch64/sme-lazy-save-call.ll | 5 +-
...ate-sm-changing-call-disable-coalescing.ll | 518 ++++++++------
.../sme-streaming-compatible-interface.ll | 111 +--
...nging-call-disable-stackslot-scavenging.ll | 9 +-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 641 ++++++++++++++++++
7 files changed, 1083 insertions(+), 274 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index cd532671f5018..4e9a1454e1f7d 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (AFI->hasSwiftAsyncContext())
+ if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
return false;
// If there are an odd number of GPRs before LR and FP in the CSRs list,
@@ -691,6 +691,9 @@ static void emitCalleeSavedRestores(MachineBasicBlock &MBB,
!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
continue;
+ if (!Info.isRestored())
+ continue;
+
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1344,6 +1347,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineInstr::MIFlag FrameFlag = MachineInstr::FrameSetup,
int CFAOffset = 0) {
unsigned NewOpc;
+
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
@@ -1654,6 +1658,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.removeReg(AArch64::LR);
}
+ // If the function contains streaming mode changes, we expect the first
+ // instruction of MBB to be a CNTD. Move past this instruction if found.
+ if (AFI->hasStreamingModeChanges()) {
+ assert(MBBI->getOpcode() == AArch64::CNTD_XPiI && "Unexpected instruction");
+ MBBI = std::next(MBBI);
+ }
+
auto VerifyClobberOnExit = make_scope_exit([&]() {
if (NonFrameStart == MBB.end())
return;
@@ -2760,7 +2771,7 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
+ enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
RegPairInfo() = default;
@@ -2772,6 +2783,7 @@ struct RegPairInfo {
return 2;
case GPR:
case FPR64:
+ case VG:
return 8;
case ZPR:
case FPR128:
@@ -2847,6 +2859,8 @@ static void computeCalleeSaveRegisterPairs(
RPI.Type = RegPairInfo::ZPR;
else if (AArch64::PPRRegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::PPR;
+ else if (RPI.Reg1 == AArch64::VG)
+ RPI.Type = RegPairInfo::VG;
else
llvm_unreachable("Unsupported register class.");
@@ -2879,6 +2893,8 @@ static void computeCalleeSaveRegisterPairs(
if (((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1))
RPI.Reg2 = NextReg;
break;
+ case RegPairInfo::VG:
+ break;
}
}
@@ -3062,7 +3078,23 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ StrOpc = AArch64::STRXui;
+ Size = 8;
+ Alignment = Align(8);
+ break;
+ }
+
+ if (Reg1 == AArch64::VG) {
+ // Find an available register to store value of VG to.
+ Reg1 = findScratchNonCalleeSaveRegister(&MBB);
+ assert(Reg1 != AArch64::NoRegister);
+
+ BuildMI(MBB, MBB.begin(), DL, TII.get(AArch64::CNTD_XPiI), Reg1)
+ .addImm(31)
+ .addImm(1);
}
+
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
dbgs() << ") -> fi#(" << RPI.FrameIdx;
@@ -3233,6 +3265,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Size = 2;
Alignment = Align(2);
break;
+ case RegPairInfo::VG:
+ continue;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -3432,6 +3466,11 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
CSStackSize += RegSize;
}
+ // Increase the callee-saved stack size if the function has streaming mode
+ // changes, as we will need to spill the value of the VG register.
+ if (AFI->hasStreamingModeChanges())
+ CSStackSize += 8;
+
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
@@ -3568,6 +3607,23 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
}
+ // Insert VG into the list of CSRs, immediately before LR if saved.
+ if (AFI->hasStreamingModeChanges()) {
+ auto VGInfo = CalleeSavedInfo(AArch64::VG);
+ VGInfo.setRestored(false);
+ bool InsertBeforeLR = false;
+
+ for (unsigned I = 0; I < CSI.size(); I++)
+ if (CSI[I].getReg() == AArch64::LR) {
+ InsertBeforeLR = true;
+ CSI.insert(CSI.begin() + I, VGInfo);
+ break;
+ }
+
+ if (!InsertBeforeLR)
+ CSI.push_back(VGInfo);
+ }
+
for (auto &CS : CSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 07377195d62a0..25743cf68b148 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -mattr=+sve < %s | FileCheck %s
; Verify that the following code can be compiled without +sme, because if the
; call is not entered in streaming-SVE mode at runtime, the codepath leading
@@ -10,11 +10,13 @@ target triple = "aarch64"
define void @streaming_compatible() #0 {
; CHECK-LABEL: streaming_compatible:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB0_2
@@ -26,11 +28,12 @@ define void @streaming_compatible() #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @non_streaming()
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0b88f19..c24585a971fb7 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -121,13 +121,14 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
@@ -160,7 +161,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
index 1d1bae42c9e30..500c51159dd91 100644
--- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
+++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll
@@ -16,11 +16,12 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -28,12 +29,12 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -49,11 +50,12 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -61,12 +63,12 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -82,11 +84,12 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: mov x19, x1
@@ -94,12 +97,12 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -115,11 +118,12 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: mov x19, x1
@@ -127,12 +131,12 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl use_i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -148,11 +152,12 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@@ -165,14 +170,14 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
; CHECK-NEXT: bl use_f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -188,11 +193,12 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
@@ -205,14 +211,14 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: bl use_f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -228,11 +234,12 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -245,14 +252,14 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -273,11 +280,12 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -290,14 +298,14 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v16i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -314,11 +322,12 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -331,14 +340,14 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v8i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -355,11 +364,12 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -372,14 +382,14 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v4i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -396,11 +406,12 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -413,14 +424,14 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v2i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -437,11 +448,12 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
@@ -454,14 +466,14 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
; CHECK-NEXT: bl use_v8f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -478,11 +490,12 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -495,14 +508,14 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v4f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -519,11 +532,12 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v1f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -536,14 +550,14 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v2f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -564,11 +578,12 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -581,14 +596,14 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v16i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -604,11 +619,12 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -621,14 +637,14 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -644,11 +660,12 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -661,14 +678,14 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v4i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -684,11 +701,12 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -701,14 +719,14 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v2i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -724,11 +742,12 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -741,14 +760,14 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8f16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -764,11 +783,12 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -781,14 +801,14 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v8bf16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -804,11 +824,12 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -821,14 +842,14 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v4f32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -844,11 +865,12 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
@@ -861,14 +883,14 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 {
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl use_v2f64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -887,11 +909,12 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_arg_v8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
@@ -900,10 +923,10 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-NEXT: add x8, sp, #16
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
-; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: and z1.b, z1.b, #0x1
; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT: str p0, [x8, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: bl use_v8i1
@@ -913,8 +936,8 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
; CHECK-NEXT: str p0, [x19]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -933,23 +956,26 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 {
define void @dont_coalesce_res_i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i8
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i8 @get_i8()
%vec = insertelement <vscale x 16 x i8> poison, i8 %res, i32 0
@@ -960,23 +986,26 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 {
define void @dont_coalesce_res_i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i16
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i16 @get_i16()
%vec = insertelement <vscale x 8 x i16> poison, i16 %res, i32 0
@@ -987,23 +1016,26 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 {
define void @dont_coalesce_res_i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i32
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i32 @get_i32()
%vec = insertelement <vscale x 4 x i32> poison, i32 %res, i32 0
@@ -1014,23 +1046,26 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 {
define void @dont_coalesce_res_i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_i64
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call i64 @get_i64()
%vec = insertelement <vscale x 2 x i64> poison, i64 %res, i32 0
@@ -1041,27 +1076,30 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 {
define void @dont_coalesce_res_f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f16
; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call half @get_f16()
%vec = insertelement <vscale x 8 x half> poison, half %res, i32 0
@@ -1072,12 +1110,14 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 {
define void @dont_coalesce_res_f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f32
@@ -1087,11 +1127,12 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call float @get_f32()
%vec = insertelement <vscale x 4 x float> poison, float %res, i32 0
@@ -1102,12 +1143,14 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 {
define void @dont_coalesce_res_f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_f64
@@ -1117,11 +1160,12 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call double @get_f64()
%vec = insertelement <vscale x 2 x double> poison, double %res, i32 0
@@ -1136,12 +1180,14 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i8
@@ -1151,11 +1197,12 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i8> @get_v1i8()
%elt = extractelement <1 x i8> %res, i32 0
@@ -1167,12 +1214,14 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i16
@@ -1182,11 +1231,12 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i16> @get_v1i16()
%elt = extractelement <1 x i16> %res, i32 0
@@ -1198,12 +1248,14 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i32
@@ -1213,11 +1265,12 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i32> @get_v1i32()
%elt = extractelement <1 x i32> %res, i32 0
@@ -1229,12 +1282,14 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 {
define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1i64
@@ -1244,11 +1299,12 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x i64> @get_v1i64()
%elt = extractelement <1 x i64> %res, i32 0
@@ -1260,27 +1316,30 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f16
; CHECK-NEXT: str h0, [sp, #14] // 2-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x half> @get_v1f16()
%elt = extractelement <1 x half> %res, i32 0
@@ -1292,12 +1351,14 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f32
@@ -1307,11 +1368,12 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x float> @get_v1f32()
%elt = extractelement <1 x float> %res, i32 0
@@ -1323,12 +1385,14 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 {
define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v1f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v1f64
@@ -1338,11 +1402,12 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <1 x double> @get_v1f64()
%elt = extractelement <1 x double> %res, i32 0
@@ -1358,27 +1423,30 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 {
define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v16i8
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1b { z0.b }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <16 x i8> @get_v16i8()
%vec = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> %res, i64 0)
@@ -1389,27 +1457,30 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 {
define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v8i16
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <8 x i16> @get_v8i16()
%vec = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> poison, <8 x i16> %res, i64 0)
@@ -1420,27 +1491,30 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 {
define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v4i32
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <4 x i32> @get_v4i32()
%vec = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> poison, <4 x i32> %res, i64 0)
@@ -1451,27 +1525,30 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 {
define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v2i64
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x i64> @get_v2i64()
%vec = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> poison, <2 x i64> %res, i64 0)
@@ -1482,27 +1559,30 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 {
define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v8f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v8f16
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1h { z0.h }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <8 x half> @get_v8f16()
%vec = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> poison, <8 x half> %res, i64 0)
@@ -1513,27 +1593,30 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 {
define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v4f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v4f32
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1w { z0.s }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <4 x float> @get_v4f32()
%vec = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %res, i64 0)
@@ -1544,27 +1627,30 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 {
define void @dont_coalesce_res_v2f64(ptr %ptr) #0 {
; CHECK-LABEL: dont_coalesce_res_v2f64:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl get_v2f64
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: st1d { z0.d }, p0, [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x double> @get_v2f64()
%vec = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %res, i64 0)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 1e16f140676ba..8037ec4edbab5 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -36,11 +36,13 @@ define void @normal_caller_streaming_compatible_callee() nounwind {
define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_normal_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB1_2
@@ -52,11 +54,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @normal_callee();
@@ -72,11 +75,13 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
@@ -88,11 +93,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee();
@@ -123,12 +129,13 @@ define void @streaming_compatible_caller_and_callee() "aarch64_pstate_sm_compati
define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_neon_vectors:
; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: add x8, sp, #16
@@ -160,8 +167,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -176,8 +183,9 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>)
define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_scalable_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -255,8 +263,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg)
%fadd = fadd <vscale x 2 x double> %res, %arg
@@ -268,8 +276,9 @@ declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x doub
define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_predicate_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -347,8 +356,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
%and = and <vscale x 2 x i1> %res, %arg
@@ -360,11 +369,13 @@ declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>)
define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: conditional_smstart_unreachable_block:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB7_2
@@ -381,11 +392,13 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK: // %bb.0:
; CHECK-NEXT: tbz w0, #0, .LBB8_6
; CHECK-NEXT: // %bb.1: // %if.then
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB8_3
@@ -397,11 +410,12 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK-NEXT: // %bb.4: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB8_5: // %if.then
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: .LBB8_6: // %exit
; CHECK-NEXT: ret
br i1 %p, label %if.then, label %exit
@@ -417,11 +431,13 @@ exit:
define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB9_2
@@ -433,11 +449,12 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB9_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @normal_callee();
@@ -447,23 +464,26 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: sub sp, sp, #128
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 112
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: .cfi_offset b8, -24
-; CHECK-NEXT: .cfi_offset b9, -32
-; CHECK-NEXT: .cfi_offset b10, -40
-; CHECK-NEXT: .cfi_offset b11, -48
-; CHECK-NEXT: .cfi_offset b12, -56
-; CHECK-NEXT: .cfi_offset b13, -64
-; CHECK-NEXT: .cfi_offset b14, -72
-; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x8, x1
; CHECK-NEXT: mov x9, x0
@@ -483,12 +503,13 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-NEXT: // %bb.3: // %entry
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB10_4: // %entry
-; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #112] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: ret
entry:
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index 45ca7844b0655..27fde24891ff8 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -14,12 +14,13 @@ target triple = "aarch64"
define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-LABEL: test_no_stackslot_scavenging:
; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x24, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
@@ -31,8 +32,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-NEXT: smstart sm
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x24, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
new file mode 100644
index 0000000000000..79ef691eccf77
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -0,0 +1,641 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
+
+declare void @callee();
+declare void @fixed_callee(<4 x i32>);
+declare void @scalable_callee(<vscale x 2 x i64>);
+
+; Simple example of a function with one call requiring a streaming mode change
+;
+define void @vg_unwind_simple() #0 {
+; CHECK-LABEL: vg_unwind_simple:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_simple:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+ call void @callee();
+ ret void;
+}
+
+; As above, with an extra register clobbered by the inline asm call which
+; changes NeedsGapToAlignStack to false
+;
+define void @vg_unwind_needs_gap() #0 {
+; CHECK-LABEL: vg_unwind_needs_gap:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x20, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w20, -16
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x20, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w20
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_needs_gap:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x20, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w20, -8
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: //APP
+; FP-CHECK-NEXT: //NO_APP
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x20, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w20
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+ call void asm sideeffect "", "~{x20}"()
+ call void @callee();
+ ret void;
+}
+
+define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
+; CHECK-LABEL: vg_unwind_with_fixed_args:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl fixed_callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_with_fixed_args:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: sub sp, sp, #112
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 112
+; FP-CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #80
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; FP-CHECK-NEXT: bl fixed_callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 112
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #80] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: add sp, sp, #112
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+ call void @fixed_callee(<4 x i32> %x);
+ ret void;
+}
+
+define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
+; CHECK-LABEL: vg_unwind_with_sve_args:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w28, -8
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: addvl sp, sp, #-18
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 32 - 32 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 32 - 40 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 32 - 48 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 32 - 56 * VG
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 32 - 64 * VG
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 152 * VG
+; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: bl scalable_callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #18
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_with_sve_args:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 48
+; FP-CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; FP-CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: mov x29, sp
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 48
+; FP-CHECK-NEXT: .cfi_offset w27, -8
+; FP-CHECK-NEXT: .cfi_offset w28, -16
+; FP-CHECK-NEXT: .cfi_offset vg, -32
+; FP-CHECK-NEXT: .cfi_offset w30, -40
+; FP-CHECK-NEXT: .cfi_offset w29, -48
+; FP-CHECK-NEXT: addvl sp, sp, #-18
+; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 48 - 32 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 48 - 40 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 48 - 48 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 48 - 56 * VG
+; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 48 - 64 * VG
+; FP-CHECK-NEXT: addvl sp, sp, #-1
+; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: //APP
+; FP-CHECK-NEXT: //NO_APP
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: bl scalable_callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: addvl sp, sp, #1
+; FP-CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: addvl sp, sp, #18
+; FP-CHECK-NEXT: .cfi_restore z8
+; FP-CHECK-NEXT: .cfi_restore z9
+; FP-CHECK-NEXT: .cfi_restore z10
+; FP-CHECK-NEXT: .cfi_restore z11
+; FP-CHECK-NEXT: .cfi_restore z12
+; FP-CHECK-NEXT: .cfi_restore z13
+; FP-CHECK-NEXT: .cfi_restore z14
+; FP-CHECK-NEXT: .cfi_restore z15
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 48
+; FP-CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w27
+; FP-CHECK-NEXT: .cfi_restore w28
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: ret
+ call void asm sideeffect "", "~{x28}"()
+ call void @scalable_callee(<vscale x 2 x i64> %x);
+ ret void;
+}
+
+; This test was based on stack-probing-64k.ll and tries to test multiple uses of
+; findScratchNonCalleeSaveRegister.
+;
+define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
+; CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa w9, 327776
+; CHECK-NEXT: .LBB4_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b.ne .LBB4_1
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: .cfi_def_cfa_register wsp
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; FP-CHECK: // %bb.0: // %entry
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w28, -8
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: sub x9, sp, #80, lsl #12 // =327680
+; FP-CHECK-NEXT: .LBB4_1: // %entry
+; FP-CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; FP-CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; FP-CHECK-NEXT: cmp sp, x9
+; FP-CHECK-NEXT: str xzr, [sp]
+; FP-CHECK-NEXT: b.ne .LBB4_1
+; FP-CHECK-NEXT: // %bb.2: // %entry
+; FP-CHECK-NEXT: mov x8, sp
+; FP-CHECK-NEXT: str x8, [x0]
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w28
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+entry:
+ %v = alloca i8, i64 327680, align 1
+ store ptr %v, ptr %out, align 8
+ call void @callee()
+ ret void
+}
+
+attributes #0 = { "aarch64_pstate_sm_enabled" uwtable(async) }
+attributes #1 = { "probe-stack"="inline-asm" "aarch64_pstate_sm_enabled" uwtable(async) }
>From 991a2253dc7d30fe3a054f7640d6b216c254f4a6 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Mon, 4 Mar 2024 14:32:07 +0000
Subject: [PATCH 02/13] - Prevent spill of VG for functions marked as
'nounwind' - Added a test to sme-vg-to-stack.ll with the 'nounwind' attribute
---
.../Target/AArch64/AArch64FrameLowering.cpp | 8 +-
...compatible-to-normal-fn-wihout-sme-attr.ll | 11 +--
.../CodeGen/AArch64/sme-lazy-save-call.ll | 5 +-
.../sme-streaming-compatible-interface.ll | 77 ++++++++-----------
...nging-call-disable-stackslot-scavenging.ll | 9 +--
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 42 ++++++++++
6 files changed, 87 insertions(+), 65 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 4e9a1454e1f7d..15e6022154339 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1660,7 +1660,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// If the function contains streaming mode changes, we expect the first
// instruction of MBB to be a CNTD. Move past this instruction if found.
- if (AFI->hasStreamingModeChanges()) {
+ if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry()) {
assert(MBBI->getOpcode() == AArch64::CNTD_XPiI && "Unexpected instruction");
MBBI = std::next(MBBI);
}
@@ -3468,7 +3468,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// Increase the callee-saved stack size if the function has streaming mode
// changes, as we will need to spill the value of the VG register.
- if (AFI->hasStreamingModeChanges())
+ const Function &F = MF.getFunction();
+ if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry())
CSStackSize += 8;
// Save number of saved regs, so we can easily update CSStackSize later.
@@ -3608,7 +3609,8 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
}
// Insert VG into the list of CSRs, immediately before LR if saved.
- if (AFI->hasStreamingModeChanges()) {
+ const Function &F = MF.getFunction();
+ if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry()) {
auto VGInfo = CalleeSavedInfo(AArch64::VG);
VGInfo.setRestored(false);
bool InsertBeforeLR = false;
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 25743cf68b148..718448055de9c 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -10,13 +10,11 @@ target triple = "aarch64"
define void @streaming_compatible() #0 {
; CHECK-LABEL: streaming_compatible:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB0_2
@@ -28,12 +26,11 @@ define void @streaming_compatible() #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @non_streaming()
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index c24585a971fb7..9d635f0b88f19 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -121,14 +121,13 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
-; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
@@ -161,7 +160,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 8037ec4edbab5..7432dfbbc0ff1 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -36,13 +36,11 @@ define void @normal_caller_streaming_compatible_callee() nounwind {
define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_normal_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB1_2
@@ -54,12 +52,11 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @normal_callee();
@@ -75,13 +72,11 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
@@ -93,12 +88,11 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_4:
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee();
@@ -129,13 +123,12 @@ define void @streaming_compatible_caller_and_callee() "aarch64_pstate_sm_compati
define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_neon_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: add x8, sp, #16
@@ -167,8 +160,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -183,9 +176,8 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>)
define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_scalable_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -263,8 +255,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg)
%fadd = fadd <vscale x 2 x double> %res, %arg
@@ -276,9 +268,8 @@ declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x doub
define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_predicate_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -356,8 +347,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
%and = and <vscale x 2 x i1> %res, %arg
@@ -369,13 +360,11 @@ declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>)
define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: conditional_smstart_unreachable_block:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB7_2
@@ -392,13 +381,11 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK: // %bb.0:
; CHECK-NEXT: tbz w0, #0, .LBB8_6
; CHECK-NEXT: // %bb.1: // %if.then
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB8_3
@@ -410,12 +397,11 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK-NEXT: // %bb.4: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB8_5: // %if.then
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT: .LBB8_6: // %exit
; CHECK-NEXT: ret
br i1 %p, label %if.then, label %exit
@@ -431,13 +417,11 @@ exit:
define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB9_2
@@ -449,12 +433,11 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB9_4:
+; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @normal_callee();
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index 27fde24891ff8..45ca7844b0655 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -14,13 +14,12 @@ target triple = "aarch64"
define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-LABEL: test_no_stackslot_scavenging:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x9, x24, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
@@ -32,8 +31,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-NEXT: smstart sm
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x24, [sp, #88] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 79ef691eccf77..5a99c9500c42c 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -637,5 +637,47 @@ entry:
ret void
}
+; Ensure VG is not spilled if nounwind is used
+;
+define void @vg_nounwind_simple() #2 {
+; CHECK-LABEL: vg_nounwind_simple:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_nounwind_simple:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; FP-CHECK-NEXT: ret
+ call void @callee();
+ ret void;
+}
+
attributes #0 = { "aarch64_pstate_sm_enabled" uwtable(async) }
attributes #1 = { "probe-stack"="inline-asm" "aarch64_pstate_sm_enabled" uwtable(async) }
+attributes #2 = { "aarch64_pstate_sm_enabled" nounwind }
>From cbc72ca9a18d7664ca25cc92be96e8067a551caf Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 8 Mar 2024 15:50:26 +0000
Subject: [PATCH 03/13] Changes to handle locally streaming functions with
streaming mode changes:
- Save both the streaming and non-streaming values of VG in the prologue of
functions with the aarch64_pstate_sm_body attribute.
- Added the VGUnwindInfoPseudo node which expands to either .cfi_restore or a
.cfi_offset depending on the value of the immediate used (0 or 1 respectively).
- VGUnwindInfoPseudo nodes are emitted with the smstop/smstart pair around
calls which require a streaming-mode change in a locally-streaming caller. The
.cfi_offset describes the location of the saved streaming-VG value, whilst the
.cfi_restore sets the rule for VG back to what it was at the beginning of the
function (non-streaming).
- The frame index used for the streaming VG value is saved in AArch64FunctionInfo
so that it can be used to calculate the offset when expanding the pseudo.
- Added the @vg_locally_streaming_fn() test to sme-vg-to-stack.ll
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 48 ++++++
.../Target/AArch64/AArch64FrameLowering.cpp | 94 +++++++++--
.../Target/AArch64/AArch64ISelLowering.cpp | 16 ++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +
.../AArch64/AArch64MachineFunctionInfo.h | 5 +
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 12 ++
.../sme-streaming-compatible-interface.ll | 2 +-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 148 ++++++++++++++++--
8 files changed, 296 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 9b7fc228d5de8..45ba57d4778fb 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
@@ -1598,6 +1599,53 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::COALESCER_BARRIER_FPR128:
MI.eraseFromParent();
return true;
+ case AArch64::VGUnwindInfoPseudo: {
+ MachineFunction &MF = *MBB.getParent();
+ SMEAttrs FuncAttrs(MF.getFunction());
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+ if ((!FuncAttrs.hasStreamingBody() && FuncAttrs.hasStreamingInterface()) ||
+ !AFI->hasStreamingModeChanges())
+ return false;
+
+ int64_t StreamingVGIdx = AFI->getStreamingVGIdx();
+ assert(StreamingVGIdx != std::numeric_limits<int>::max() &&
+ "Expected FrameIdx for Streaming-VG");
+
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ if (MI.getOperand(0).getImm() == 1) {
+ // This pseudo has been inserted after a streaming-mode change
+ // to save the streaming value of VG before a call.
+ // Calculate and emit the CFI offset using StreamingVGIdx.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const AArch64FrameLowering *TFI =
+ MF.getSubtarget<AArch64Subtarget>().getFrameLowering();
+
+ int64_t Offset =
+ MFI.getObjectOffset(StreamingVGIdx) - TFI->getOffsetOfLocalArea();
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, TRI.getDwarfRegNum(AArch64::VG, true), Offset));
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ } else {
+ // This is a restore of VG after returning from the call. Emit the
+ // .cfi_restore instruction, which sets the rule for VG to the same
+ // as it was on entry to the function.
+ ++MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
+ nullptr, TRI.getDwarfRegNum(AArch64::VG, true)));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
case AArch64::LD1B_2Z_IMM_PSEUDO:
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 15e6022154339..bfd578f5bf9c0 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -550,6 +550,7 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty())
@@ -561,14 +562,20 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
DebugLoc DL = MBB.findDebugLoc(MBBI);
for (const auto &Info : CSI) {
- if (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector)
+ unsigned FrameIdx = Info.getFrameIdx();
+ if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
continue;
assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
+ int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
+
+ // Locally streaming functions save two values for VG, but we should only
+ // emit the location of the non-streaming value here.
+ if (DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true) &&
+ FrameIdx == AFI->getStreamingVGIdx())
+ continue;
- int64_t Offset =
- MFI.getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea();
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1348,6 +1355,20 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
int CFAOffset = 0) {
unsigned NewOpc;
+ // If the function contains streaming mode changes, we expect instructions
+ // to calculate the value of VG before spilling. For locally-streaming
+ // functions, we need to do this for both the streaming and non-streaming
+ // vector length. Move past these instructions if necessary.
+ unsigned Opc = MBBI->getOpcode();
+ if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI) {
+ AArch64FunctionInfo AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
+ assert(AFI.hasStreamingModeChanges() &&
+ "Unexpected callee-save save/restore opcode!");
+ ++MBBI;
+ if (MBBI->getOpcode() == AArch64::UBFMXri)
+ ++MBBI;
+ }
+
switch (MBBI->getOpcode()) {
default:
llvm_unreachable("Unexpected callee-save save/restore opcode!");
@@ -1658,13 +1679,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.removeReg(AArch64::LR);
}
- // If the function contains streaming mode changes, we expect the first
- // instruction of MBB to be a CNTD. Move past this instruction if found.
- if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry()) {
- assert(MBBI->getOpcode() == AArch64::CNTD_XPiI && "Unexpected instruction");
- MBBI = std::next(MBBI);
- }
-
auto VerifyClobberOnExit = make_scope_exit([&]() {
if (NonFrameStart == MBB.end())
return;
@@ -1849,6 +1863,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// pointer bump above.
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
!IsSVECalleeSave(MBBI)) {
+ unsigned Opc = MBBI->getOpcode();
+ // Move past instructions generated to calculate VG
+ if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
+ Opc == AArch64::UBFMXri) {
+ assert(AFI->hasStreamingModeChanges() && "Unexpected opcode!");
+ ++MBBI;
+ }
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
@@ -3014,6 +3035,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
bool NeedsWinCFI = needsWinCFI(MF);
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
+ bool SpilledStreamingVG = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
@@ -3089,10 +3112,30 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// Find an available register to store value of VG to.
Reg1 = findScratchNonCalleeSaveRegister(&MBB);
assert(Reg1 != AArch64::NoRegister);
+ SMEAttrs Attrs(MF.getFunction());
+
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
+ !SpilledStreamingVG) {
+ // For locally-streaming functions, we need to store both the streaming
+ // & non-streaming VG. Spill the streaming value first.
+ BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1)
+ .addReg(Reg1)
+ .addImm(3)
+ .addImm(63)
+ .setMIFlag(MachineInstr::FrameSetup);
- BuildMI(MBB, MBB.begin(), DL, TII.get(AArch64::CNTD_XPiI), Reg1)
- .addImm(31)
- .addImm(1);
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ AFI->setStreamingVGIdx(RPI.FrameIdx);
+ SpilledStreamingVG = true;
+ } else {
+ BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
+ .addImm(31)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
@@ -3468,9 +3511,16 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// Increase the callee-saved stack size if the function has streaming mode
// changes, as we will need to spill the value of the VG register.
+ // For locally streaming functions, we spill both the streaming and
+ // non-streaming VG value.
const Function &F = MF.getFunction();
- if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry())
- CSStackSize += 8;
+ SMEAttrs Attrs(F);
+ if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry()) {
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
+ CSStackSize += 16;
+ else
+ CSStackSize += 8;
+ }
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
@@ -3611,19 +3661,29 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
// Insert VG into the list of CSRs, immediately before LR if saved.
const Function &F = MF.getFunction();
if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry()) {
+ std::vector<CalleeSavedInfo> VGSaves;
+ SMEAttrs Attrs(MF.getFunction());
+
auto VGInfo = CalleeSavedInfo(AArch64::VG);
VGInfo.setRestored(false);
+ VGSaves.push_back(VGInfo);
+
+ // Add VG again if the function is locally-streaming, as we will spill two
+ // values.
+ if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
+ VGSaves.push_back(VGInfo);
+
bool InsertBeforeLR = false;
for (unsigned I = 0; I < CSI.size(); I++)
if (CSI[I].getReg() == AArch64::LR) {
InsertBeforeLR = true;
- CSI.insert(CSI.begin() + I, VGInfo);
+ CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end());
break;
}
if (!InsertBeforeLR)
- CSI.push_back(VGInfo);
+ CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
}
for (auto &CS : CSI) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 360a841bdade4..a8be248eaaf49 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2489,6 +2489,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::FIRST_NUMBER:
break;
MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
+ MAKE_CASE(AArch64ISD::VG_UNWIND)
MAKE_CASE(AArch64ISD::SMSTART)
MAKE_CASE(AArch64ISD::SMSTOP)
MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -8509,12 +8510,22 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
SDValue InGlue;
+ bool IsLocallyStreaming =
+ !CallerAttrs.hasStreamingInterface() && CallerAttrs.hasStreamingBody();
if (RequiresSMChange) {
SDValue NewChain = changeStreamingMode(
DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
Chain = NewChain.getValue(0);
InGlue = NewChain.getValue(1);
+
+ if (IsLocallyStreaming && MF.getFunction().needsUnwindTableEntry()) {
+ NewChain = DAG.getNode(
+ AArch64ISD::VG_UNWIND, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+ {Chain, DAG.getTargetConstant(/*Save*/ 1, DL, MVT::i64), InGlue});
+ Chain = NewChain.getValue(0);
+ InGlue = NewChain.getValue(1);
+ }
}
// Build a sequence of copy-to-reg nodes chained together with token chain
@@ -8687,6 +8698,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Result = changeStreamingMode(
DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
+
+ if (IsLocallyStreaming && MF.getFunction().needsUnwindTableEntry())
+ Result = DAG.getNode(
+ AArch64ISD::VG_UNWIND, DL, MVT::Other,
+ {Result, DAG.getTargetConstant(/*Restore*/ 0, DL, MVT::i64)});
}
if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 48a4ea91c2782..dadfd39039192 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -70,6 +70,8 @@ enum NodeType : unsigned {
COALESCER_BARRIER,
+ VG_UNWIND,
+
SMSTART,
SMSTOP,
RESTORE_ZA,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index df09fc5592edf..b81d8ea59c0ce 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -216,6 +216,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
unsigned PredicateRegForFillSpill = 0;
+ int64_t StreamingVGIdx = std::numeric_limits<int>::max();
+
public:
AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
@@ -234,6 +236,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
+ int64_t getStreamingVGIdx() const { return StreamingVGIdx; };
+ void setStreamingVGIdx(unsigned Idx) { StreamingVGIdx = Idx; };
+
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 2b70c4715bf9e..ab4349e3c3468 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -31,6 +31,9 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64VGUnwind : SDNode<"AArch64ISD::VG_UNWIND", SDTypeProfile<0, 1, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
@@ -221,6 +224,15 @@ def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
(MSRpstatesvcrImm1 svcr_op:$pstate, 0b0)>;
+// Pseudo to insert cfi_offset/cfi_restore instructions. Used to save or restore
+// the streaming value of VG around streaming-mode changes in locally-streaming
+// functions.
+def VGUnwindInfoPseudo :
+ Pseudo<(outs), (ins timm0_1:$save_restore), []>, Sched<[]>;
+
+def : Pat<(AArch64VGUnwind (i64 timm0_1:$save_restore)),
+ (VGUnwindInfoPseudo timm0_1:$save_restore)>;
+
//===----------------------------------------------------------------------===//
// SME2 Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 7432dfbbc0ff1..7da1a08c432aa 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -447,8 +447,8 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 5a99c9500c42c..f7c5d8a083635 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -6,14 +6,16 @@ declare void @callee();
declare void @fixed_callee(<4 x i32>);
declare void @scalable_callee(<vscale x 2 x i64>);
+declare void @streaming_callee() #0;
+
; Simple example of a function with one call requiring a streaming mode change
;
define void @vg_unwind_simple() #0 {
; CHECK-LABEL: vg_unwind_simple:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
@@ -50,9 +52,9 @@ define void @vg_unwind_simple() #0 {
;
; FP-CHECK-LABEL: vg_unwind_simple:
; FP-CHECK: // %bb.0:
-; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
@@ -102,9 +104,9 @@ define void @vg_unwind_simple() #0 {
define void @vg_unwind_needs_gap() #0 {
; CHECK-LABEL: vg_unwind_needs_gap:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
@@ -147,9 +149,9 @@ define void @vg_unwind_needs_gap() #0 {
;
; FP-CHECK-LABEL: vg_unwind_needs_gap:
; FP-CHECK: // %bb.0:
-; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
@@ -202,9 +204,9 @@ define void @vg_unwind_needs_gap() #0 {
define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
; CHECK-LABEL: vg_unwind_with_fixed_args:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: sub sp, sp, #96
; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
@@ -245,9 +247,9 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
;
; FP-CHECK-LABEL: vg_unwind_with_fixed_args:
; FP-CHECK: // %bb.0:
-; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: sub sp, sp, #112
; FP-CHECK-NEXT: .cfi_def_cfa_offset 112
+; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; FP-CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; FP-CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
@@ -298,9 +300,9 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-LABEL: vg_unwind_with_sve_args:
; CHECK: // %bb.0:
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w28, -8
; CHECK-NEXT: .cfi_offset vg, -16
@@ -403,11 +405,11 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
;
; FP-CHECK-LABEL: vg_unwind_with_sve_args:
; FP-CHECK: // %bb.0:
-; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
; FP-CHECK-NEXT: .cfi_def_cfa_offset 48
-; FP-CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; FP-CHECK-NEXT: mov x29, sp
; FP-CHECK-NEXT: .cfi_def_cfa w29, 48
; FP-CHECK-NEXT: .cfi_offset w27, -8
@@ -518,9 +520,9 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
; CHECK-LABEL: vg_unwind_multiple_scratch_regs:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
@@ -574,9 +576,9 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
;
; FP-CHECK-LABEL: vg_unwind_multiple_scratch_regs:
; FP-CHECK: // %bb.0: // %entry
-; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
@@ -637,9 +639,128 @@ entry:
ret void
}
+; Locally streaming functions require storing both the streaming and
+; non-streaming values of VG.
+;
+define void @vg_locally_streaming_fn() #3 {
+; CHECK-LABEL: vg_locally_streaming_fn:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_locally_streaming_fn:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: rdsvl x9, #1
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: lsr x9, x9, #3
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset vg, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: bl streaming_callee
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+ call void @callee()
+ call void @streaming_callee()
+ call void @callee()
+ ret void
+}
+
; Ensure VG is not spilled if nounwind is used
;
-define void @vg_nounwind_simple() #2 {
+define void @vg_nounwind_simple() #4 {
; CHECK-LABEL: vg_nounwind_simple:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
@@ -680,4 +801,5 @@ define void @vg_nounwind_simple() #2 {
attributes #0 = { "aarch64_pstate_sm_enabled" uwtable(async) }
attributes #1 = { "probe-stack"="inline-asm" "aarch64_pstate_sm_enabled" uwtable(async) }
-attributes #2 = { "aarch64_pstate_sm_enabled" nounwind }
+attributes #3 = { "aarch64_pstate_sm_body" uwtable(async) }
+attributes #4 = { "aarch64_pstate_sm_enabled" nounwind }
>From c4c08e00a0031ccc56ea21b354c7767dd5b71759 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Tue, 2 Apr 2024 10:37:33 +0000
Subject: [PATCH 04/13] - Removed calls to needsUnwindTableEntry() so that VG
is always spilled if there are streaming mode changes in the function.
- Added requiresVGSpill() to AArch64MachineFunctionInfo, which returns
true only if the function has streaming mode changes and the subtarget
has SVE. With this change, VG is no longer spilled at the beginning of
a function if the target does not also have SVE.
- Removed SpilledStreamingVG flag from spillCalleeSavedRegisters.
- Rebased to include recent changes to the changeStreamingMode interface.
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 2 +-
.../Target/AArch64/AArch64FrameLowering.cpp | 28 +++--
.../Target/AArch64/AArch64ISelLowering.cpp | 4 +-
.../AArch64/AArch64MachineFunctionInfo.h | 6 +
...compatible-to-normal-fn-wihout-sme-attr.ll | 2 +-
.../sme-streaming-compatible-interface.ll | 34 +++---
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 109 +++++++++++-------
7 files changed, 106 insertions(+), 79 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 45ba57d4778fb..da396f04b6575 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1605,7 +1605,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
if ((!FuncAttrs.hasStreamingBody() && FuncAttrs.hasStreamingInterface()) ||
- !AFI->hasStreamingModeChanges())
+ !AFI->requiresVGSpill(MF))
return false;
int64_t StreamingVGIdx = AFI->getStreamingVGIdx();
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index bfd578f5bf9c0..83855d0a1eebc 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
+ if (AFI->hasSwiftAsyncContext() || AFI->requiresVGSpill(MF))
return false;
// If there are an odd number of GPRs before LR and FP in the CSRs list,
@@ -567,7 +567,7 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
continue;
assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
- unsigned DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
+ int64_t DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
// Locally streaming functions save two values for VG, but we should only
@@ -1361,8 +1361,9 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// vector length. Move past these instructions if necessary.
unsigned Opc = MBBI->getOpcode();
if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI) {
- AArch64FunctionInfo AFI = *MBB.getParent()->getInfo<AArch64FunctionInfo>();
- assert(AFI.hasStreamingModeChanges() &&
+ MachineFunction &MF = *MBB.getParent();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ assert(AFI->requiresVGSpill(MF) &&
"Unexpected callee-save save/restore opcode!");
++MBBI;
if (MBBI->getOpcode() == AArch64::UBFMXri)
@@ -1865,10 +1866,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
!IsSVECalleeSave(MBBI)) {
unsigned Opc = MBBI->getOpcode();
// Move past instructions generated to calculate VG
- if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
- Opc == AArch64::UBFMXri) {
- assert(AFI->hasStreamingModeChanges() && "Unexpected opcode!");
+ if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI) {
+ assert(AFI->requiresVGSpill(MF) && "Unexpected opcode!");
++MBBI;
+ if (MBBI->getOpcode() == AArch64::UBFMXri)
+ ++MBBI;
}
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
@@ -3032,11 +3034,10 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool NeedsWinCFI = needsWinCFI(MF);
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
- bool SpilledStreamingVG = false;
- MachineFrameInfo &MFI = MF.getFrameInfo();
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
@@ -3115,7 +3116,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
SMEAttrs Attrs(MF.getFunction());
if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
- !SpilledStreamingVG) {
+ AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
// For locally-streaming functions, we need to store both the streaming
// & non-streaming VG. Spill the streaming value first.
BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
@@ -3127,9 +3128,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
.addImm(63)
.setMIFlag(MachineInstr::FrameSetup);
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
AFI->setStreamingVGIdx(RPI.FrameIdx);
- SpilledStreamingVG = true;
} else {
BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
.addImm(31)
@@ -3515,7 +3514,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// non-streaming VG value.
const Function &F = MF.getFunction();
SMEAttrs Attrs(F);
- if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry()) {
+ if (AFI->requiresVGSpill(MF)) {
if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
CSStackSize += 16;
else
@@ -3659,8 +3658,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
}
// Insert VG into the list of CSRs, immediately before LR if saved.
- const Function &F = MF.getFunction();
- if (AFI->hasStreamingModeChanges() && F.needsUnwindTableEntry()) {
+ if (AFI->requiresVGSpill(MF)) {
std::vector<CalleeSavedInfo> VGSaves;
SMEAttrs Attrs(MF.getFunction());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a8be248eaaf49..4954b543d2b24 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8519,7 +8519,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = NewChain.getValue(0);
InGlue = NewChain.getValue(1);
- if (IsLocallyStreaming && MF.getFunction().needsUnwindTableEntry()) {
+ if (IsLocallyStreaming && Subtarget->hasSVE()) {
NewChain = DAG.getNode(
AArch64ISD::VG_UNWIND, DL, DAG.getVTList(MVT::Other, MVT::Glue),
{Chain, DAG.getTargetConstant(/*Save*/ 1, DL, MVT::i64), InGlue});
@@ -8699,7 +8699,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
- if (IsLocallyStreaming && MF.getFunction().needsUnwindTableEntry())
+ if (IsLocallyStreaming && Subtarget->hasSVE())
Result = DAG.getNode(
AArch64ISD::VG_UNWIND, DL, MVT::Other,
{Result, DAG.getTargetConstant(/*Restore*/ 0, DL, MVT::i64)});
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index b81d8ea59c0ce..9be0d2ec41bd6 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#include "AArch64Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -499,6 +500,11 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
HasStreamingModeChanges = HasChanges;
}
+ bool requiresVGSpill(const MachineFunction &MF) const {
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ return STI.hasSVE() && HasStreamingModeChanges;
+ }
+
bool hasStackProbing() const { return StackProbeSize != 0; }
int64_t getStackProbeSize() const { return StackProbeSize; }
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 718448055de9c..07377195d62a0 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mattr=+sve < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
; Verify that the following code can be compiled without +sme, because if the
; call is not entered in streaming-SVE mode at runtime, the codepath leading
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 7da1a08c432aa..1e16f140676ba 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -447,26 +447,23 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #128
-; CHECK-NEXT: cntd x9
+; CHECK-NEXT: sub sp, sp, #112
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 128
-; CHECK-NEXT: .cfi_offset w19, -16
-; CHECK-NEXT: .cfi_offset vg, -24
-; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: .cfi_offset b8, -40
-; CHECK-NEXT: .cfi_offset b9, -48
-; CHECK-NEXT: .cfi_offset b10, -56
-; CHECK-NEXT: .cfi_offset b11, -64
-; CHECK-NEXT: .cfi_offset b12, -72
-; CHECK-NEXT: .cfi_offset b13, -80
-; CHECK-NEXT: .cfi_offset b14, -88
-; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 112
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x8, x1
; CHECK-NEXT: mov x9, x0
@@ -486,13 +483,12 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-NEXT: // %bb.3: // %entry
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB10_4: // %entry
+; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #112] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
entry:
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index f7c5d8a083635..27e624097ef27 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=NO-SVE-CHECK
declare void @callee();
declare void @fixed_callee(<4 x i32>);
@@ -94,6 +95,7 @@ define void @vg_unwind_simple() #0 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
call void @callee();
ret void;
}
@@ -196,6 +198,7 @@ define void @vg_unwind_needs_gap() #0 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
call void asm sideeffect "", "~{x20}"()
call void @callee();
ret void;
@@ -293,6 +296,7 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
call void @fixed_callee(<4 x i32> %x);
ret void;
}
@@ -509,6 +513,7 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_restore w30
; FP-CHECK-NEXT: .cfi_restore w29
; FP-CHECK-NEXT: ret
+;
call void asm sideeffect "", "~{x28}"()
call void @scalable_callee(<vscale x 2 x i64> %x);
ret void;
@@ -632,6 +637,7 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
entry:
%v = alloca i8, i64 327680, align 1
store ptr %v, ptr %out, align 8
@@ -752,54 +758,75 @@ define void @vg_locally_streaming_fn() #3 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
call void @callee()
call void @streaming_callee()
call void @callee()
ret void
}
-; Ensure VG is not spilled if nounwind is used
+; If the target does not have SVE, do not spill VG even if the function
+; has streaming-mode changes.
;
-define void @vg_nounwind_simple() #4 {
-; CHECK-LABEL: vg_nounwind_simple:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: smstop sm
-; CHECK-NEXT: bl callee
-; CHECK-NEXT: smstart sm
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; CHECK-NEXT: ret
-;
-; FP-CHECK-LABEL: vg_nounwind_simple:
-; FP-CHECK: // %bb.0:
-; FP-CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; FP-CHECK-NEXT: add x29, sp, #64
-; FP-CHECK-NEXT: smstop sm
-; FP-CHECK-NEXT: bl callee
-; FP-CHECK-NEXT: smstart sm
-; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
-; FP-CHECK-NEXT: ret
- call void @callee();
- ret void;
+define void @streaming_compatible_no_sve() #4 {
+; NO-SVE-CHECK-LABEL: streaming_compatible_no_sve:
+; NO-SVE-CHECK: // %bb.0:
+; NO-SVE-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 96
+; NO-SVE-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; NO-SVE-CHECK-NEXT: add x29, sp, #64
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa w29, 32
+; NO-SVE-CHECK-NEXT: .cfi_offset w19, -16
+; NO-SVE-CHECK-NEXT: .cfi_offset w30, -24
+; NO-SVE-CHECK-NEXT: .cfi_offset w29, -32
+; NO-SVE-CHECK-NEXT: .cfi_offset b8, -40
+; NO-SVE-CHECK-NEXT: .cfi_offset b9, -48
+; NO-SVE-CHECK-NEXT: .cfi_offset b10, -56
+; NO-SVE-CHECK-NEXT: .cfi_offset b11, -64
+; NO-SVE-CHECK-NEXT: .cfi_offset b12, -72
+; NO-SVE-CHECK-NEXT: .cfi_offset b13, -80
+; NO-SVE-CHECK-NEXT: .cfi_offset b14, -88
+; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96
+; NO-SVE-CHECK-NEXT: bl __arm_sme_state
+; NO-SVE-CHECK-NEXT: and x19, x0, #0x1
+; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB6_2
+; NO-SVE-CHECK-NEXT: // %bb.1:
+; NO-SVE-CHECK-NEXT: smstart sm
+; NO-SVE-CHECK-NEXT: .LBB6_2:
+; NO-SVE-CHECK-NEXT: bl streaming_callee
+; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB6_4
+; NO-SVE-CHECK-NEXT: // %bb.3:
+; NO-SVE-CHECK-NEXT: smstop sm
+; NO-SVE-CHECK-NEXT: .LBB6_4:
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 0
+; NO-SVE-CHECK-NEXT: .cfi_restore w19
+; NO-SVE-CHECK-NEXT: .cfi_restore w30
+; NO-SVE-CHECK-NEXT: .cfi_restore w29
+; NO-SVE-CHECK-NEXT: .cfi_restore b8
+; NO-SVE-CHECK-NEXT: .cfi_restore b9
+; NO-SVE-CHECK-NEXT: .cfi_restore b10
+; NO-SVE-CHECK-NEXT: .cfi_restore b11
+; NO-SVE-CHECK-NEXT: .cfi_restore b12
+; NO-SVE-CHECK-NEXT: .cfi_restore b13
+; NO-SVE-CHECK-NEXT: .cfi_restore b14
+; NO-SVE-CHECK-NEXT: .cfi_restore b15
+; NO-SVE-CHECK-NEXT: ret
+ call void @streaming_callee()
+ ret void
}
attributes #0 = { "aarch64_pstate_sm_enabled" uwtable(async) }
attributes #1 = { "probe-stack"="inline-asm" "aarch64_pstate_sm_enabled" uwtable(async) }
attributes #3 = { "aarch64_pstate_sm_body" uwtable(async) }
-attributes #4 = { "aarch64_pstate_sm_enabled" nounwind }
+attributes #4 = { "aarch64_pstate_sm_compatible" uwtable(async) }
>From b5363ead2fd52ff7aa1fd30d7201b770a273b4be Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Thu, 11 Apr 2024 13:12:19 +0000
Subject: [PATCH 05/13] - Emit .cfi_offset to save VG before all streaming-mode
changes in a function.
- Change emitCalleeSavedGPRLocations to only emit the non-streaming location
of VG in the prologue for locally-streaming functions.
- Move the .cfi_offset directive before the smstart/smstop.
- Add streaming-compatible tests.
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 16 +-
.../Target/AArch64/AArch64FrameLowering.cpp | 14 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 26 +-
.../AArch64/AArch64MachineFunctionInfo.h | 5 +
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 268 +++++++++++++++++-
5 files changed, 290 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index da396f04b6575..be03fc4d2cc40 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1602,15 +1602,17 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::VGUnwindInfoPseudo: {
MachineFunction &MF = *MBB.getParent();
SMEAttrs FuncAttrs(MF.getFunction());
+ bool LocallyStreaming =
+ FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- if ((!FuncAttrs.hasStreamingBody() && FuncAttrs.hasStreamingInterface()) ||
- !AFI->requiresVGSpill(MF))
+ if (!AFI->requiresVGSpill(MF))
return false;
- int64_t StreamingVGIdx = AFI->getStreamingVGIdx();
- assert(StreamingVGIdx != std::numeric_limits<int>::max() &&
- "Expected FrameIdx for Streaming-VG");
+ int64_t VGFrameIdx =
+ LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
+ assert(VGFrameIdx != std::numeric_limits<int>::max() &&
+ "Expected FrameIdx for VG");
const TargetSubtargetInfo &STI = MF.getSubtarget();
const TargetInstrInfo &TII = *STI.getInstrInfo();
@@ -1618,13 +1620,13 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
if (MI.getOperand(0).getImm() == 1) {
// This pseudo has been inserted after a streaming-mode change
// to save the streaming value of VG before a call.
- // Calculate and emit the CFI offset using StreamingVGIdx.
+ // Calculate and emit the CFI offset using VGFrameIdx.
MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64FrameLowering *TFI =
MF.getSubtarget<AArch64Subtarget>().getFrameLowering();
int64_t Offset =
- MFI.getObjectOffset(StreamingVGIdx) - TFI->getOffsetOfLocalArea();
+ MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, TRI.getDwarfRegNum(AArch64::VG, true), Offset));
BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 83855d0a1eebc..9f7227944c80b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -551,6 +551,9 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ SMEAttrs Attrs(MF.getFunction());
+ bool LocallyStreaming =
+ Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
if (CSI.empty())
@@ -570,10 +573,12 @@ void AArch64FrameLowering::emitCalleeSavedGPRLocations(
int64_t DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
- // Locally streaming functions save two values for VG, but we should only
- // emit the location of the non-streaming value here.
- if (DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true) &&
- FrameIdx == AFI->getStreamingVGIdx())
+ // The location of VG will be emitted before each streaming-mode change in
+ // the function. Only locally-streaming functions require emitting the
+ // non-streaming VG location here.
+ if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) ||
+ (!LocallyStreaming &&
+ DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true)))
continue;
unsigned CFIIndex = MF.addFrameInst(
@@ -3134,6 +3139,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
.addImm(31)
.addImm(1)
.setMIFlag(MachineInstr::FrameSetup);
+ AFI->setVGIdx(RPI.FrameIdx);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4954b543d2b24..748491cf61a67 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8510,22 +8510,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
SDValue InGlue;
- bool IsLocallyStreaming =
- !CallerAttrs.hasStreamingInterface() && CallerAttrs.hasStreamingBody();
if (RequiresSMChange) {
+
+ if (Subtarget->hasSVE()) {
+ Chain = DAG.getNode(
+ AArch64ISD::VG_UNWIND, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+ {Chain, DAG.getTargetConstant(/*Save*/ 1, DL, MVT::i64)});
+ InGlue = Chain.getValue(1);
+ }
+
SDValue NewChain = changeStreamingMode(
DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
Chain = NewChain.getValue(0);
InGlue = NewChain.getValue(1);
-
- if (IsLocallyStreaming && Subtarget->hasSVE()) {
- NewChain = DAG.getNode(
- AArch64ISD::VG_UNWIND, DL, DAG.getVTList(MVT::Other, MVT::Glue),
- {Chain, DAG.getTargetConstant(/*Save*/ 1, DL, MVT::i64), InGlue});
- Chain = NewChain.getValue(0);
- InGlue = NewChain.getValue(1);
- }
}
// Build a sequence of copy-to-reg nodes chained together with token chain
@@ -8695,14 +8693,16 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresSMChange) {
assert(PStateSM && "Expected a PStateSM to be set");
+
Result = changeStreamingMode(
DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
+ InGlue = Result.getValue(1);
- if (IsLocallyStreaming && Subtarget->hasSVE())
+ if (Subtarget->hasSVE())
Result = DAG.getNode(
- AArch64ISD::VG_UNWIND, DL, MVT::Other,
- {Result, DAG.getTargetConstant(/*Restore*/ 0, DL, MVT::i64)});
+ AArch64ISD::VG_UNWIND, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+ {Result, DAG.getTargetConstant(/*Restore*/ 0, DL, MVT::i64), InGlue});
}
if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 9be0d2ec41bd6..aee0986203889 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -217,6 +217,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// The PTRUE is used for the LD/ST of ZReg pairs in save and restore.
unsigned PredicateRegForFillSpill = 0;
+ // The stack slots where VG values are stored.
+ int64_t VGIdx = std::numeric_limits<int>::max();
int64_t StreamingVGIdx = std::numeric_limits<int>::max();
public:
@@ -237,6 +239,9 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
Register getPStateSMReg() const { return PStateSMReg; };
void setPStateSMReg(Register Reg) { PStateSMReg = Reg; };
+ int64_t getVGIdx() const { return VGIdx; };
+ void setVGIdx(unsigned Idx) { VGIdx = Idx; };
+
int64_t getStreamingVGIdx() const { return StreamingVGIdx; };
void setStreamingVGIdx(unsigned Idx) { StreamingVGIdx = Idx; };
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 27e624097ef27..04211e23ec8d5 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=NO-SVE-CHECK
@@ -21,7 +20,6 @@ define void @vg_unwind_simple() #0 {
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_offset vg, -8
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: .cfi_offset b8, -24
; CHECK-NEXT: .cfi_offset b9, -32
@@ -31,9 +29,11 @@ define void @vg_unwind_simple() #0 {
; CHECK-NEXT: .cfi_offset b13, -64
; CHECK-NEXT: .cfi_offset b14, -72
; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: .cfi_offset vg, -8
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
@@ -63,7 +63,6 @@ define void @vg_unwind_simple() #0 {
; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; FP-CHECK-NEXT: add x29, sp, #64
; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
-; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: .cfi_offset w30, -24
; FP-CHECK-NEXT: .cfi_offset w29, -32
; FP-CHECK-NEXT: .cfi_offset b8, -40
@@ -74,9 +73,11 @@ define void @vg_unwind_simple() #0 {
; FP-CHECK-NEXT: .cfi_offset b13, -80
; FP-CHECK-NEXT: .cfi_offset b14, -88
; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: bl callee
; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
@@ -115,7 +116,6 @@ define void @vg_unwind_needs_gap() #0 {
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x20, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset vg, -24
; CHECK-NEXT: .cfi_offset w30, -32
; CHECK-NEXT: .cfi_offset b8, -40
; CHECK-NEXT: .cfi_offset b9, -48
@@ -127,9 +127,11 @@ define void @vg_unwind_needs_gap() #0 {
; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: .cfi_offset vg, -24
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldr x20, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
@@ -162,7 +164,6 @@ define void @vg_unwind_needs_gap() #0 {
; FP-CHECK-NEXT: add x29, sp, #64
; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
; FP-CHECK-NEXT: .cfi_offset w20, -8
-; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: .cfi_offset w30, -24
; FP-CHECK-NEXT: .cfi_offset w29, -32
; FP-CHECK-NEXT: .cfi_offset b8, -40
@@ -175,9 +176,11 @@ define void @vg_unwind_needs_gap() #0 {
; FP-CHECK-NEXT: .cfi_offset b15, -96
; FP-CHECK-NEXT: //APP
; FP-CHECK-NEXT: //NO_APP
+; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: bl callee
; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; FP-CHECK-NEXT: ldr x20, [sp, #88] // 8-byte Folded Reload
@@ -215,7 +218,6 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_offset vg, -8
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: .cfi_offset b8, -24
; CHECK-NEXT: .cfi_offset b9, -32
@@ -226,10 +228,12 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
; CHECK-NEXT: .cfi_offset b14, -72
; CHECK-NEXT: .cfi_offset b15, -80
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset vg, -8
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl fixed_callee
; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
@@ -261,7 +265,6 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
; FP-CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
; FP-CHECK-NEXT: add x29, sp, #80
; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
-; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: .cfi_offset w30, -24
; FP-CHECK-NEXT: .cfi_offset w29, -32
; FP-CHECK-NEXT: .cfi_offset b8, -40
@@ -273,10 +276,12 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
; FP-CHECK-NEXT: .cfi_offset b14, -88
; FP-CHECK-NEXT: .cfi_offset b15, -96
; FP-CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; FP-CHECK-NEXT: bl fixed_callee
; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: .cfi_def_cfa wsp, 112
; FP-CHECK-NEXT: ldp x29, x30, [sp, #80] // 16-byte Folded Reload
; FP-CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
@@ -309,7 +314,6 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: .cfi_offset w28, -8
-; CHECK-NEXT: .cfi_offset vg, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: addvl sp, sp, #-18
@@ -355,10 +359,12 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: .cfi_offset vg, -16
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: bl scalable_callee
; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
@@ -418,7 +424,6 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_def_cfa w29, 48
; FP-CHECK-NEXT: .cfi_offset w27, -8
; FP-CHECK-NEXT: .cfi_offset w28, -16
-; FP-CHECK-NEXT: .cfi_offset vg, -32
; FP-CHECK-NEXT: .cfi_offset w30, -40
; FP-CHECK-NEXT: .cfi_offset w29, -48
; FP-CHECK-NEXT: addvl sp, sp, #-18
@@ -462,10 +467,12 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: str z0, [x29, #-19, mul vl] // 16-byte Folded Spill
; FP-CHECK-NEXT: //APP
; FP-CHECK-NEXT: //NO_APP
+; FP-CHECK-NEXT: .cfi_offset vg, -32
; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: ldr z0, [x29, #-19, mul vl] // 16-byte Folded Reload
; FP-CHECK-NEXT: bl scalable_callee
; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: addvl sp, sp, #1
; FP-CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; FP-CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -533,7 +540,6 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_offset vg, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: .cfi_offset b8, -40
@@ -556,9 +562,11 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
; CHECK-NEXT: .cfi_def_cfa_register wsp
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: .cfi_offset vg, -16
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
; CHECK-NEXT: .cfi_def_cfa_offset 96
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -592,7 +600,6 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
; FP-CHECK-NEXT: add x29, sp, #64
; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
; FP-CHECK-NEXT: .cfi_offset w28, -8
-; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: .cfi_offset w30, -24
; FP-CHECK-NEXT: .cfi_offset w29, -32
; FP-CHECK-NEXT: .cfi_offset b8, -40
@@ -613,9 +620,11 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
; FP-CHECK-NEXT: // %bb.2: // %entry
; FP-CHECK-NEXT: mov x8, sp
; FP-CHECK-NEXT: str x8, [x0]
+; FP-CHECK-NEXT: .cfi_offset vg, -16
; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: bl callee
; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: add sp, sp, #80, lsl #12 // =327680
; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
@@ -672,14 +681,14 @@ define void @vg_locally_streaming_fn() #3 {
; CHECK-NEXT: .cfi_offset b14, -88
; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: smstart sm
-; CHECK-NEXT: smstop sm
; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: bl streaming_callee
-; CHECK-NEXT: smstop sm
; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl callee
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .cfi_restore vg
@@ -728,14 +737,14 @@ define void @vg_locally_streaming_fn() #3 {
; FP-CHECK-NEXT: .cfi_offset b14, -88
; FP-CHECK-NEXT: .cfi_offset b15, -96
; FP-CHECK-NEXT: smstart sm
-; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: bl callee
; FP-CHECK-NEXT: smstart sm
; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: bl streaming_callee
-; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
; FP-CHECK-NEXT: bl callee
; FP-CHECK-NEXT: smstart sm
; FP-CHECK-NEXT: .cfi_restore vg
@@ -765,6 +774,234 @@ define void @vg_locally_streaming_fn() #3 {
ret void
}
+define void @streaming_compatible_to_streaming() #4 {
+; CHECK-LABEL: streaming_compatible_to_streaming:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: tbnz w19, #0, .LBB6_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB6_2:
+; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: tbnz w19, #0, .LBB6_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB6_4:
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: streaming_compatible_to_streaming:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w19, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: bl __arm_sme_state
+; FP-CHECK-NEXT: and x19, x0, #0x1
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_2
+; FP-CHECK-NEXT: // %bb.1:
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .LBB6_2:
+; FP-CHECK-NEXT: bl streaming_callee
+; FP-CHECK-NEXT: tbnz w19, #0, .LBB6_4
+; FP-CHECK-NEXT: // %bb.3:
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .LBB6_4:
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w19
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+ call void @streaming_callee()
+ ret void
+}
+
+define void @streaming_compatible_to_non_streaming() #4 {
+; CHECK-LABEL: streaming_compatible_to_non_streaming:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
+; CHECK-NEXT: tbz w19, #0, .LBB7_2
+; CHECK-NEXT: // %bb.1:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB7_2:
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: tbz w19, #0, .LBB7_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB7_4:
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: streaming_compatible_to_non_streaming:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w19, -8
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: bl __arm_sme_state
+; FP-CHECK-NEXT: and x19, x0, #0x1
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: tbz w19, #0, .LBB7_2
+; FP-CHECK-NEXT: // %bb.1:
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: .LBB7_2:
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: tbz w19, #0, .LBB7_4
+; FP-CHECK-NEXT: // %bb.3:
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .LBB7_4:
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w19
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+;
+ call void @callee()
+ ret void
+}
+
; If the target does not have SVE, do not spill VG even if the function
; has streaming-mode changes.
;
@@ -822,6 +1059,7 @@ define void @streaming_compatible_no_sve() #4 {
; NO-SVE-CHECK-NEXT: .cfi_restore b14
; NO-SVE-CHECK-NEXT: .cfi_restore b15
; NO-SVE-CHECK-NEXT: ret
+;
call void @streaming_callee()
ret void
}
>From 8334cc5302f4c9b814a74d9204e5f174c07d5bde Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Thu, 18 Apr 2024 16:19:44 +0000
Subject: [PATCH 06/13] - Add warnings to clang for unsupported cases where
unwinding is not possible because of streaming-mode changes without SVE
available.
- Fixed incorrect labels in sme-vg-to-stack.ll
---
clang/include/clang/Basic/DiagnosticGroups.td | 2 +-
.../clang/Basic/DiagnosticSemaKinds.td | 6 +++
clang/lib/Sema/SemaChecking.cpp | 32 ++++++++-----
.../aarch64-sme-inline-streaming-attrs.c | 8 ++--
.../aarch64-sme-attrs.cpp | 26 +++++------
...-sme-func-attrs-without-target-feature.cpp | 2 +-
.../aarch64-streaming-mode-changes-no-sve.c | 45 +++++++++++++++++++
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 8 ++--
8 files changed, 96 insertions(+), 33 deletions(-)
create mode 100644 clang/test/Sema/aarch64-streaming-mode-changes-no-sve.c
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 7d5ba7869ec34..3d8e5311e5c31 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1412,7 +1412,7 @@ def MultiGPU: DiagGroup<"multi-gpu">;
// libc and the CRT to be skipped.
def AVRRtlibLinkingQuirks : DiagGroup<"avr-rtlib-linking-quirks">;
-// A warning group related to AArch64 SME function attribues.
+// A warning group related to AArch64 SME function attributes.
def AArch64SMEAttributes : DiagGroup<"aarch64-sme-attributes">;
// A warning group for things that will change semantics in the future.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 9f0b6f5a36389..b714e74052ab8 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3740,6 +3740,12 @@ def warn_gnu_inline_cplusplus_without_extern : Warning<
"'gnu_inline' attribute without 'extern' in C++ treated as externally"
" available, this changed in Clang 10">,
InGroup<DiagGroup<"gnu-inline-cpp-without-extern">>;
+def warn_sme_streaming_mode_change_no_sve : Warning<
+ "function requires a streaming-mode change, unwinding is not possible without 'sve'">,
+ InGroup<AArch64SMEAttributes>;
+def warn_sme_locally_streaming_no_sve : Warning<
+ "unwinding is not possible for locally-streaming functions without 'sve'">,
+ InGroup<AArch64SMEAttributes>;
def err_attribute_vecreturn_only_vector_member : Error<
"the vecreturn attribute can only be used on a class or structure with one member, which must be a vector">;
def err_attribute_vecreturn_only_pod_record : Error<
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 300af02239779..40840c9e60de9 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3901,8 +3901,11 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
// If the callee has an AArch64 SME attribute to indicate that it is an
// __arm_streaming function, then the caller requires SME to be available.
FunctionProtoType::ExtProtoInfo ExtInfo = Proto->getExtProtoInfo();
- if (ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask) {
- if (auto *CallerFD = dyn_cast<FunctionDecl>(CurContext)) {
+ auto *CallerFD = dyn_cast<FunctionDecl>(CurContext);
+ bool IsCalleeStreaming =
+ ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask;
+ if (IsCalleeStreaming) {
+ if (CallerFD) {
llvm::StringMap<bool> CallerFeatureMap;
Context.getFunctionFeatureMap(CallerFeatureMap, CallerFD);
if (!CallerFeatureMap.contains("sme"))
@@ -3912,18 +3915,27 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
}
}
- // If the call requires a streaming-mode change and has scalable vector
- // arguments or return values, then warn the user that the streaming and
- // non-streaming vector lengths may be different.
- const auto *CallerFD = dyn_cast<FunctionDecl>(CurContext);
- if (CallerFD && (!FD || !FD->getBuiltinID()) &&
- (IsScalableArg || IsScalableRet)) {
- bool IsCalleeStreaming =
- ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask;
+ if (CallerFD && (!FD || !FD->getBuiltinID())) {
bool IsCalleeStreamingCompatible =
ExtInfo.AArch64SMEAttributes &
FunctionType::SME_PStateSMCompatibleMask;
SemaARM::ArmStreamingType CallerFnType = getArmStreamingFnType(CallerFD);
+
+ // SME functions may require SVE to be available for unwinding, as the
+ // value of VG needs to be preserved across streaming-mode changes.
+ if (!Context.getTargetInfo().hasFeature("sve")) {
+ if (CallerFD->hasAttr<ArmLocallyStreamingAttr>())
+ Diag(Loc, diag::warn_sme_locally_streaming_no_sve);
+
+ if ((CallerFnType == SemaARM::ArmStreaming ||
+ CallerFnType == SemaARM::ArmStreamingCompatible) &&
+ (!IsCalleeStreaming && !IsCalleeStreamingCompatible))
+ Diag(Loc, diag::warn_sme_streaming_mode_change_no_sve);
+ }
+
+ // If the call requires a streaming-mode change and has scalable vector
+ // arguments or return values, then warn the user that the streaming and
+ // non-streaming vector lengths may be different.
if (!IsCalleeStreamingCompatible &&
(CallerFnType == SemaARM::ArmStreamingCompatible ||
((CallerFnType == SemaARM::ArmStreaming) ^ IsCalleeStreaming))) {
diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
index 25aebeced9379..64b5891bed259 100644
--- a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
+++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_NONE %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_COMPATIBLE %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_STREAMING %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_LOCALLY %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sve -target-feature +sme -verify -DTEST_NONE %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sve -target-feature +sme -verify -DTEST_COMPATIBLE %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sve -target-feature +sme -verify -DTEST_STREAMING %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sve -target-feature +sme -verify -DTEST_LOCALLY %s
#define __ai __attribute__((always_inline))
__ai void inlined_fn(void) {}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
index af8933d93d6cb..57780cda0fb55 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
-// RUN: -disable-O0-optnone -Werror -emit-llvm -o - %s \
+// RUN: -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg \
// RUN: | opt -S -passes=inline \
// RUN: | FileCheck %s
@@ -278,18 +278,18 @@ int test_variadic_template() __arm_inout("za") {
preserves_za_decl);
}
-// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
-// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
// CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" }
// CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" }
// CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" }
diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
index ec6bb6f503578..03100c560b0a4 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
+++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
@@ -38,7 +38,7 @@ void non_streaming_def(void (*streaming_fn_ptr)(void) __arm_streaming,
void streaming_compatible_def2(void (*streaming_fn_ptr)(void) __arm_streaming,
void (*streaming_compatible_fn_ptr)(void) __arm_streaming_compatible)
__arm_streaming_compatible {
- non_streaming_decl(); // OK
+ non_streaming_decl(); // expected-warning {{function requires a streaming-mode change, unwinding is not possible without 'sve'}}
streaming_compatible_decl(); // OK
streaming_compatible_fn_ptr(); // OK
streaming_decl(); // expected-error {{call to a streaming function requires 'sme'}}
diff --git a/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.c b/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.c
new file mode 100644
index 0000000000000..c9851adfd7277
--- /dev/null
+++ b/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
+// RUN: -target-feature -sve -Waarch64-sme-attributes -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+#include "arm_sme.h"
+
+int non_streaming_decl(void);
+int streaming_decl(void) __arm_streaming;
+int streaming_compatible_decl(void) __arm_streaming_compatible;
+
+// Streaming-mode changes which would require spilling VG, unsupported without SVE
+
+int streaming_caller_no_sve(void) __arm_streaming {
+ // expected-warning@+1 {{function requires a streaming-mode change, unwinding is not possible without 'sve'}}
+ return non_streaming_decl();
+}
+
+int sc_caller_non_streaming_callee(void) __arm_streaming_compatible {
+ // expected-warning@+1 {{function requires a streaming-mode change, unwinding is not possible without 'sve'}}
+ return non_streaming_decl();
+}
+
+__arm_locally_streaming int locally_streaming_no_sve(void) {
+ // expected-warning@+1 {{unwinding is not possible for locally-streaming functions without 'sve'}}
+ return streaming_decl();
+}
+
+// No warnings expected
+
+int normal_caller_streaming_callee(void) {
+ return streaming_decl();
+}
+
+int normal_caller_streaming_compatible_callee(void) {
+ return streaming_compatible_decl();
+}
+
+int sc_caller_streaming_callee(void) __arm_streaming_compatible {
+ return streaming_decl();
+}
+
+int sc_caller_sc_callee(void) __arm_streaming_compatible {
+ return streaming_compatible_decl();
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 04211e23ec8d5..b0c8c05f46169 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -1030,15 +1030,15 @@ define void @streaming_compatible_no_sve() #4 {
; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96
; NO-SVE-CHECK-NEXT: bl __arm_sme_state
; NO-SVE-CHECK-NEXT: and x19, x0, #0x1
-; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB6_2
+; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_2
; NO-SVE-CHECK-NEXT: // %bb.1:
; NO-SVE-CHECK-NEXT: smstart sm
-; NO-SVE-CHECK-NEXT: .LBB6_2:
+; NO-SVE-CHECK-NEXT: .LBB8_2:
; NO-SVE-CHECK-NEXT: bl streaming_callee
-; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB6_4
+; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_4
; NO-SVE-CHECK-NEXT: // %bb.3:
; NO-SVE-CHECK-NEXT: smstop sm
-; NO-SVE-CHECK-NEXT: .LBB6_4:
+; NO-SVE-CHECK-NEXT: .LBB8_4:
; NO-SVE-CHECK-NEXT: .cfi_def_cfa wsp, 96
; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; NO-SVE-CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
>From 53520f730634217956c76588a060be1812d1bdf3 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Mon, 22 Apr 2024 12:59:34 +0000
Subject: [PATCH 07/13] - Fixed indentation in checkCall.
- Rebased after new SME warnings were added to SemaChecking.cpp in main.
---
clang/lib/Sema/SemaChecking.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 40840c9e60de9..7d1883d1694a3 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3927,10 +3927,10 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
if (CallerFD->hasAttr<ArmLocallyStreamingAttr>())
Diag(Loc, diag::warn_sme_locally_streaming_no_sve);
- if ((CallerFnType == SemaARM::ArmStreaming ||
- CallerFnType == SemaARM::ArmStreamingCompatible) &&
- (!IsCalleeStreaming && !IsCalleeStreamingCompatible))
- Diag(Loc, diag::warn_sme_streaming_mode_change_no_sve);
+ if ((CallerFnType == SemaARM::ArmStreaming ||
+ CallerFnType == SemaARM::ArmStreamingCompatible) &&
+ (!IsCalleeStreaming && !IsCalleeStreamingCompatible))
+ Diag(Loc, diag::warn_sme_streaming_mode_change_no_sve);
}
// If the call requires a streaming-mode change and has scalable vector
>From 325b6b4374791498ca0dbf79bb6b37262740cffc Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Tue, 7 May 2024 12:44:45 +0000
Subject: [PATCH 08/13] - Split pseudo into VGSavePseudo & VGRestorePseudo.
- Check for noexcept or nothrow when emitting Clang errors.
---
.../clang/Basic/DiagnosticSemaKinds.td | 12 +--
clang/lib/Sema/SemaChecking.cpp | 11 ++-
...-sme-func-attrs-without-target-feature.cpp | 2 +-
.../aarch64-streaming-mode-changes-no-sve.c | 45 ----------
.../aarch64-streaming-mode-changes-no-sve.cpp | 89 +++++++++++++++++++
.../AArch64/AArch64ExpandPseudoInsts.cpp | 6 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 14 +--
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 +-
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 15 ++--
9 files changed, 126 insertions(+), 71 deletions(-)
delete mode 100644 clang/test/Sema/aarch64-streaming-mode-changes-no-sve.c
create mode 100644 clang/test/Sema/aarch64-streaming-mode-changes-no-sve.cpp
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b714e74052ab8..2d9764ee59a79 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3740,12 +3740,6 @@ def warn_gnu_inline_cplusplus_without_extern : Warning<
"'gnu_inline' attribute without 'extern' in C++ treated as externally"
" available, this changed in Clang 10">,
InGroup<DiagGroup<"gnu-inline-cpp-without-extern">>;
-def warn_sme_streaming_mode_change_no_sve : Warning<
- "function requires a streaming-mode change, unwinding is not possible without 'sve'">,
- InGroup<AArch64SMEAttributes>;
-def warn_sme_locally_streaming_no_sve : Warning<
- "unwinding is not possible for locally-streaming functions without 'sve'">,
- InGroup<AArch64SMEAttributes>;
def err_attribute_vecreturn_only_vector_member : Error<
"the vecreturn attribute can only be used on a class or structure with one member, which must be a vector">;
def err_attribute_vecreturn_only_pod_record : Error<
@@ -3782,6 +3776,12 @@ def err_conflicting_attributes_arm_state : Error<
"conflicting attributes for state '%0'">;
def err_sme_streaming_cannot_be_multiversioned : Error<
"streaming function cannot be multi-versioned">;
+def err_sme_streaming_mode_change_no_sve : Error<
+ "function requires a streaming-mode change, unwinding is not possible without 'sve'. "
+ "Consider marking this function as 'noexcept' or '__attribute__((nothrow))'">;
+def err_sme_locally_streaming_no_sve : Error<
+ "unwinding is not possible for locally-streaming functions without 'sve'. "
+ "Consider marking this function as 'noexcept' or '__attribute__((nothrow))'">;
def err_unknown_arm_state : Error<
"unknown state '%0'">;
def err_missing_arm_state : Error<
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 7d1883d1694a3..fbca6efe38774 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3920,17 +3920,22 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
ExtInfo.AArch64SMEAttributes &
FunctionType::SME_PStateSMCompatibleMask;
SemaARM::ArmStreamingType CallerFnType = getArmStreamingFnType(CallerFD);
+ bool NoThrow =
+ !getLangOpts().Exceptions ||
+ (Proto && !isUnresolvedExceptionSpec(Proto->getExceptionSpecType()) &&
+ Proto->isNothrow()) ||
+ (FD && FD->hasAttr<NoThrowAttr>());
// SME functions may require SVE to be available for unwinding, as the
// value of VG needs to be preserved across streaming-mode changes.
- if (!Context.getTargetInfo().hasFeature("sve")) {
+ if (!NoThrow && !Context.getTargetInfo().hasFeature("sve")) {
if (CallerFD->hasAttr<ArmLocallyStreamingAttr>())
- Diag(Loc, diag::warn_sme_locally_streaming_no_sve);
+ Diag(Loc, diag::err_sme_locally_streaming_no_sve);
if ((CallerFnType == SemaARM::ArmStreaming ||
CallerFnType == SemaARM::ArmStreamingCompatible) &&
(!IsCalleeStreaming && !IsCalleeStreamingCompatible))
- Diag(Loc, diag::warn_sme_streaming_mode_change_no_sve);
+ Diag(Loc, diag::err_sme_streaming_mode_change_no_sve);
}
// If the call requires a streaming-mode change and has scalable vector
diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
index 03100c560b0a4..971ced8b59c86 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
+++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
@@ -38,7 +38,7 @@ void non_streaming_def(void (*streaming_fn_ptr)(void) __arm_streaming,
void streaming_compatible_def2(void (*streaming_fn_ptr)(void) __arm_streaming,
void (*streaming_compatible_fn_ptr)(void) __arm_streaming_compatible)
__arm_streaming_compatible {
- non_streaming_decl(); // expected-warning {{function requires a streaming-mode change, unwinding is not possible without 'sve'}}
+ non_streaming_decl();
streaming_compatible_decl(); // OK
streaming_compatible_fn_ptr(); // OK
streaming_decl(); // expected-error {{call to a streaming function requires 'sme'}}
diff --git a/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.c b/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.c
deleted file mode 100644
index c9851adfd7277..0000000000000
--- a/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
-// RUN: -target-feature -sve -Waarch64-sme-attributes -fsyntax-only -verify %s
-
-// REQUIRES: aarch64-registered-target
-
-#include "arm_sme.h"
-
-int non_streaming_decl(void);
-int streaming_decl(void) __arm_streaming;
-int streaming_compatible_decl(void) __arm_streaming_compatible;
-
-// Streaming-mode changes which would require spilling VG, unsupported without SVE
-
-int streaming_caller_no_sve(void) __arm_streaming {
- // expected-warning@+1 {{function requires a streaming-mode change, unwinding is not possible without 'sve'}}
- return non_streaming_decl();
-}
-
-int sc_caller_non_streaming_callee(void) __arm_streaming_compatible {
- // expected-warning@+1 {{function requires a streaming-mode change, unwinding is not possible without 'sve'}}
- return non_streaming_decl();
-}
-
-__arm_locally_streaming int locally_streaming_no_sve(void) {
- // expected-warning@+1 {{unwinding is not possible for locally-streaming functions without 'sve'}}
- return streaming_decl();
-}
-
-// No warnings expected
-
-int normal_caller_streaming_callee(void) {
- return streaming_decl();
-}
-
-int normal_caller_streaming_compatible_callee(void) {
- return streaming_compatible_decl();
-}
-
-int sc_caller_streaming_callee(void) __arm_streaming_compatible {
- return streaming_decl();
-}
-
-int sc_caller_sc_callee(void) __arm_streaming_compatible {
- return streaming_compatible_decl();
-}
diff --git a/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.cpp b/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.cpp
new file mode 100644
index 0000000000000..1a85f239f77e6
--- /dev/null
+++ b/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
+// RUN: -target-feature -sve -fexceptions -DNO_THROW -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
+// RUN: -target-feature -sve -fexceptions -DNO_EXCEPT -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
+// RUN: -target-feature -sve -DNO_EXCEPT_FLAG -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+#include "arm_sme.h"
+
+int non_streaming_decl(void);
+int streaming_decl(void) __arm_streaming;
+int streaming_compatible_decl(void) __arm_streaming_compatible;
+
+#ifdef NO_THROW
+#define NOTHROW_ATTR __attribute__((__nothrow__))
+#else
+#define NOTHROW_ATTR
+#endif
+
+#ifdef NO_EXCEPT
+#define NOEXCEPT_ATTR noexcept
+#else
+#define NOEXCEPT_ATTR
+#endif
+
+#ifdef NO_EXCEPT_FLAG
+ // expected-no-diagnostics
+#endif
+
+NOTHROW_ATTR int nothrow_non_streaming_decl(void) NOEXCEPT_ATTR;
+NOTHROW_ATTR int nothrow_streaming_decl(void) NOEXCEPT_ATTR;
+NOTHROW_ATTR int nothrow_streaming_compatible_decl(void) NOEXCEPT_ATTR;
+
+// Streaming-mode changes which would require spilling VG if unwinding is possible, unsupported without SVE
+
+int streaming_caller_no_sve(void) __arm_streaming {
+#ifndef NO_EXCEPT_FLAG
+ // expected-error@+2 {{function requires a streaming-mode change, unwinding is not possible without 'sve'. Consider marking this function as 'noexcept' or '__attribute__((nothrow))'}}
+#endif
+ return non_streaming_decl();
+}
+
+int sc_caller_non_streaming_callee(void) __arm_streaming_compatible {
+#ifndef NO_EXCEPT_FLAG
+ // expected-error@+2 {{function requires a streaming-mode change, unwinding is not possible without 'sve'. Consider marking this function as 'noexcept' or '__attribute__((nothrow))'}}
+#endif
+ return non_streaming_decl();
+}
+
+__arm_locally_streaming int locally_streaming_no_sve(void) {
+#ifndef NO_EXCEPT_FLAG
+ // expected-error@+2 {{unwinding is not possible for locally-streaming functions without 'sve'. Consider marking this function as 'noexcept' or '__attribute__((nothrow))'}}
+#endif
+ return streaming_decl();
+}
+
+// Nothrow / noexcept attribute on callee - warnings not expected
+
+int nothrow_streaming_caller_no_sve(void) __arm_streaming {
+ return nothrow_non_streaming_decl();
+}
+
+int nothrow_sc_caller_non_streaming_callee(void) __arm_streaming_compatible {
+ return nothrow_non_streaming_decl();
+}
+
+__arm_locally_streaming int nothrow_locally_streaming_no_sve(void) {
+ return nothrow_streaming_decl();
+}
+
+// No warnings expected, even if unwinding is possible
+
+int normal_caller_streaming_callee(void) {
+ return streaming_decl();
+}
+
+int normal_caller_streaming_compatible_callee(void) {
+ return streaming_compatible_decl();
+}
+
+int sc_caller_streaming_callee(void) __arm_streaming_compatible {
+ return streaming_decl();
+}
+
+int sc_caller_sc_callee(void) __arm_streaming_compatible {
+ return streaming_compatible_decl();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index be03fc4d2cc40..f8d77ee329516 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1599,7 +1599,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::COALESCER_BARRIER_FPR128:
MI.eraseFromParent();
return true;
- case AArch64::VGUnwindInfoPseudo: {
+ case AArch64::VGSavePseudo:
+ case AArch64::VGRestorePseudo: {
MachineFunction &MF = *MBB.getParent();
SMEAttrs FuncAttrs(MF.getFunction());
bool LocallyStreaming =
@@ -1617,7 +1618,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
const TargetSubtargetInfo &STI = MF.getSubtarget();
const TargetInstrInfo &TII = *STI.getInstrInfo();
const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
- if (MI.getOperand(0).getImm() == 1) {
+
+ if (Opcode == AArch64::VGSavePseudo) {
// This pseudo has been inserted after a streaming-mode change
// to save the streaming value of VG before a call.
// Calculate and emit the CFI offset using VGFrameIdx.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 748491cf61a67..ac13854124253 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2489,7 +2489,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::FIRST_NUMBER:
break;
MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
- MAKE_CASE(AArch64ISD::VG_UNWIND)
+ MAKE_CASE(AArch64ISD::VG_SAVE)
+ MAKE_CASE(AArch64ISD::VG_RESTORE)
MAKE_CASE(AArch64ISD::SMSTART)
MAKE_CASE(AArch64ISD::SMSTOP)
MAKE_CASE(AArch64ISD::RESTORE_ZA)
@@ -8513,9 +8514,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresSMChange) {
if (Subtarget->hasSVE()) {
- Chain = DAG.getNode(
- AArch64ISD::VG_UNWIND, DL, DAG.getVTList(MVT::Other, MVT::Glue),
- {Chain, DAG.getTargetConstant(/*Save*/ 1, DL, MVT::i64)});
+ Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), Chain);
InGlue = Chain.getValue(1);
}
@@ -8700,9 +8700,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
InGlue = Result.getValue(1);
if (Subtarget->hasSVE())
- Result = DAG.getNode(
- AArch64ISD::VG_UNWIND, DL, DAG.getVTList(MVT::Other, MVT::Glue),
- {Result, DAG.getTargetConstant(/*Restore*/ 0, DL, MVT::i64), InGlue});
+ Result =
+ DAG.getNode(AArch64ISD::VG_RESTORE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
}
if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index dadfd39039192..b57ba097847cd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -70,7 +70,8 @@ enum NodeType : unsigned {
COALESCER_BARRIER,
- VG_UNWIND,
+ VG_SAVE,
+ VG_RESTORE,
SMSTART,
SMSTOP,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index ab4349e3c3468..fea70b7ffb074 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -31,8 +31,11 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
def AArch64CoalescerBarrier
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>;
-def AArch64VGUnwind : SDNode<"AArch64ISD::VG_UNWIND", SDTypeProfile<0, 1, []>,
- [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
@@ -227,11 +230,11 @@ def : Pat<(AArch64_smstop (i32 svcr_op:$pstate), (i64 /*AArch64SME::Always*/0)),
// Pseudo to insert cfi_offset/cfi_restore instructions. Used to save or restore
// the streaming value of VG around streaming-mode changes in locally-streaming
// functions.
-def VGUnwindInfoPseudo :
- Pseudo<(outs), (ins timm0_1:$save_restore), []>, Sched<[]>;
+def VGSavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+def : Pat<(AArch64VGSave), (VGSavePseudo)>;
-def : Pat<(AArch64VGUnwind (i64 timm0_1:$save_restore)),
- (VGUnwindInfoPseudo timm0_1:$save_restore)>;
+def VGRestorePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+def : Pat<(AArch64VGRestore), (VGRestorePseudo)>;
//===----------------------------------------------------------------------===//
// SME2 Instructions
>From 6e9cd8e9c0885676fa202a6d198fb02e2b192771 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Wed, 15 May 2024 15:28:31 +0000
Subject: [PATCH 09/13] - Replaced pseudos with CFI instructions in
processFunctionBeforeFrameIndicesReplaced and removed handling from
AArch64ExpandPseudoInsts.
- Removed diagnostics from Clang for unwinding without +sve.
- Removed hasSVE() check when emitting pseudos around calls in AArch64ISelLowering.
- Emit a call to __arm_get_current_vg from spillCalleeSavedRegisters if
HasSVE is false, and preserve X0 around the call if it is live.
- Updated LLVM tests with streaming-mode changes to also pass +sve.
---
clang/include/clang/Basic/DiagnosticGroups.td | 2 +-
.../clang/Basic/DiagnosticSemaKinds.td | 6 -
clang/lib/Sema/SemaChecking.cpp | 37 ++---
.../aarch64-sme-inline-streaming-attrs.c | 8 +-
.../aarch64-sme-attrs.cpp | 26 +--
...-sme-func-attrs-without-target-feature.cpp | 2 +-
.../aarch64-streaming-mode-changes-no-sve.cpp | 89 ----------
.../AArch64/AArch64ExpandPseudoInsts.cpp | 51 ------
.../Target/AArch64/AArch64FrameLowering.cpp | 152 +++++++++++++++---
.../Target/AArch64/AArch64ISelLowering.cpp | 16 +-
.../AArch64/AArch64MachineFunctionInfo.h | 7 +-
...compatible-to-normal-fn-wihout-sme-attr.ll | 22 ++-
.../AArch64/sme-disable-gisel-fisel.ll | 79 ++++++---
.../CodeGen/AArch64/sme-lazy-save-call.ll | 54 ++++---
...ing-body-streaming-compatible-interface.ll | 43 +++--
.../CodeGen/AArch64/sme-streaming-body.ll | 152 ++++++++++++------
.../sme-streaming-compatible-interface.ll | 120 ++++++++------
.../AArch64/sme-streaming-interface.ll | 49 +++---
...nging-call-disable-stackslot-scavenging.ll | 11 +-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 26 ++-
.../streaming-compatible-memory-ops.ll | 76 +++++----
21 files changed, 566 insertions(+), 462 deletions(-)
delete mode 100644 clang/test/Sema/aarch64-streaming-mode-changes-no-sve.cpp
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 3d8e5311e5c31..7d5ba7869ec34 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1412,7 +1412,7 @@ def MultiGPU: DiagGroup<"multi-gpu">;
// libc and the CRT to be skipped.
def AVRRtlibLinkingQuirks : DiagGroup<"avr-rtlib-linking-quirks">;
-// A warning group related to AArch64 SME function attributes.
+// A warning group related to AArch64 SME function attributes.
def AArch64SMEAttributes : DiagGroup<"aarch64-sme-attributes">;
// A warning group for things that will change semantics in the future.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 2d9764ee59a79..9f0b6f5a36389 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3776,12 +3776,6 @@ def err_conflicting_attributes_arm_state : Error<
"conflicting attributes for state '%0'">;
def err_sme_streaming_cannot_be_multiversioned : Error<
"streaming function cannot be multi-versioned">;
-def err_sme_streaming_mode_change_no_sve : Error<
- "function requires a streaming-mode change, unwinding is not possible without 'sve'. "
- "Consider marking this function as 'noexcept' or '__attribute__((nothrow))'">;
-def err_sme_locally_streaming_no_sve : Error<
- "unwinding is not possible for locally-streaming functions without 'sve'. "
- "Consider marking this function as 'noexcept' or '__attribute__((nothrow))'">;
def err_unknown_arm_state : Error<
"unknown state '%0'">;
def err_missing_arm_state : Error<
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index fbca6efe38774..300af02239779 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3901,11 +3901,8 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
// If the callee has an AArch64 SME attribute to indicate that it is an
// __arm_streaming function, then the caller requires SME to be available.
FunctionProtoType::ExtProtoInfo ExtInfo = Proto->getExtProtoInfo();
- auto *CallerFD = dyn_cast<FunctionDecl>(CurContext);
- bool IsCalleeStreaming =
- ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask;
- if (IsCalleeStreaming) {
- if (CallerFD) {
+ if (ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask) {
+ if (auto *CallerFD = dyn_cast<FunctionDecl>(CurContext)) {
llvm::StringMap<bool> CallerFeatureMap;
Context.getFunctionFeatureMap(CallerFeatureMap, CallerFD);
if (!CallerFeatureMap.contains("sme"))
@@ -3915,32 +3912,18 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
}
}
- if (CallerFD && (!FD || !FD->getBuiltinID())) {
+ // If the call requires a streaming-mode change and has scalable vector
+ // arguments or return values, then warn the user that the streaming and
+ // non-streaming vector lengths may be different.
+ const auto *CallerFD = dyn_cast<FunctionDecl>(CurContext);
+ if (CallerFD && (!FD || !FD->getBuiltinID()) &&
+ (IsScalableArg || IsScalableRet)) {
+ bool IsCalleeStreaming =
+ ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask;
bool IsCalleeStreamingCompatible =
ExtInfo.AArch64SMEAttributes &
FunctionType::SME_PStateSMCompatibleMask;
SemaARM::ArmStreamingType CallerFnType = getArmStreamingFnType(CallerFD);
- bool NoThrow =
- !getLangOpts().Exceptions ||
- (Proto && !isUnresolvedExceptionSpec(Proto->getExceptionSpecType()) &&
- Proto->isNothrow()) ||
- (FD && FD->hasAttr<NoThrowAttr>());
-
- // SME functions may require SVE to be available for unwinding, as the
- // value of VG needs to be preserved across streaming-mode changes.
- if (!NoThrow && !Context.getTargetInfo().hasFeature("sve")) {
- if (CallerFD->hasAttr<ArmLocallyStreamingAttr>())
- Diag(Loc, diag::err_sme_locally_streaming_no_sve);
-
- if ((CallerFnType == SemaARM::ArmStreaming ||
- CallerFnType == SemaARM::ArmStreamingCompatible) &&
- (!IsCalleeStreaming && !IsCalleeStreamingCompatible))
- Diag(Loc, diag::err_sme_streaming_mode_change_no_sve);
- }
-
- // If the call requires a streaming-mode change and has scalable vector
- // arguments or return values, then warn the user that the streaming and
- // non-streaming vector lengths may be different.
if (!IsCalleeStreamingCompatible &&
(CallerFnType == SemaARM::ArmStreamingCompatible ||
((CallerFnType == SemaARM::ArmStreaming) ^ IsCalleeStreaming))) {
diff --git a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
index 64b5891bed259..25aebeced9379 100644
--- a/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
+++ b/clang/test/CodeGen/aarch64-sme-inline-streaming-attrs.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sve -target-feature +sme -verify -DTEST_NONE %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sve -target-feature +sme -verify -DTEST_COMPATIBLE %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sve -target-feature +sme -verify -DTEST_STREAMING %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sve -target-feature +sme -verify -DTEST_LOCALLY %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_NONE %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_COMPATIBLE %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_STREAMING %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -S -o /dev/null -target-feature +sme -verify -DTEST_LOCALLY %s
#define __ai __attribute__((always_inline))
__ai void inlined_fn(void) {}
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
index 57780cda0fb55..af8933d93d6cb 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
-// RUN: -target-feature +sve -disable-O0-optnone -Werror -emit-llvm -o - %s \
+// RUN: -disable-O0-optnone -Werror -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg \
// RUN: | opt -S -passes=inline \
// RUN: | FileCheck %s
@@ -278,18 +278,18 @@ int test_variadic_template() __arm_inout("za") {
preserves_za_decl);
}
-// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind vscale_range(1,16) "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
-// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind vscale_range(1,16) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme,+sve" }
+// CHECK: attributes #[[SM_ENABLED]] = { mustprogress noinline nounwind "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[NORMAL_DECL]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[SM_ENABLED_DECL]] = { "aarch64_pstate_sm_enabled" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[SM_COMPATIBLE]] = { mustprogress noinline nounwind "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[SM_COMPATIBLE_DECL]] = { "aarch64_pstate_sm_compatible" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[SM_BODY]] = { mustprogress noinline nounwind "aarch64_pstate_sm_body" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[ZA_SHARED]] = { mustprogress noinline nounwind "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[ZA_SHARED_DECL]] = { "aarch64_inout_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[ZA_PRESERVED]] = { mustprogress noinline nounwind "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[ZA_PRESERVED_DECL]] = { "aarch64_preserves_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[ZA_NEW]] = { mustprogress noinline nounwind "aarch64_new_za" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
+// CHECK: attributes #[[NORMAL_DEF]] = { mustprogress noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" }
// CHECK: attributes #[[SM_ENABLED_CALL]] = { "aarch64_pstate_sm_enabled" }
// CHECK: attributes #[[SM_COMPATIBLE_CALL]] = { "aarch64_pstate_sm_compatible" }
// CHECK: attributes #[[SM_BODY_CALL]] = { "aarch64_pstate_sm_body" }
diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
index 971ced8b59c86..ec6bb6f503578 100644
--- a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
+++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp
@@ -38,7 +38,7 @@ void non_streaming_def(void (*streaming_fn_ptr)(void) __arm_streaming,
void streaming_compatible_def2(void (*streaming_fn_ptr)(void) __arm_streaming,
void (*streaming_compatible_fn_ptr)(void) __arm_streaming_compatible)
__arm_streaming_compatible {
- non_streaming_decl();
+ non_streaming_decl(); // OK
streaming_compatible_decl(); // OK
streaming_compatible_fn_ptr(); // OK
streaming_decl(); // expected-error {{call to a streaming function requires 'sme'}}
diff --git a/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.cpp b/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.cpp
deleted file mode 100644
index 1a85f239f77e6..0000000000000
--- a/clang/test/Sema/aarch64-streaming-mode-changes-no-sve.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
-// RUN: -target-feature -sve -fexceptions -DNO_THROW -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
-// RUN: -target-feature -sve -fexceptions -DNO_EXCEPT -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
-// RUN: -target-feature -sve -DNO_EXCEPT_FLAG -fsyntax-only -verify %s
-
-// REQUIRES: aarch64-registered-target
-
-#include "arm_sme.h"
-
-int non_streaming_decl(void);
-int streaming_decl(void) __arm_streaming;
-int streaming_compatible_decl(void) __arm_streaming_compatible;
-
-#ifdef NO_THROW
-#define NOTHROW_ATTR __attribute__((__nothrow__))
-#else
-#define NOTHROW_ATTR
-#endif
-
-#ifdef NO_EXCEPT
-#define NOEXCEPT_ATTR noexcept
-#else
-#define NOEXCEPT_ATTR
-#endif
-
-#ifdef NO_EXCEPT_FLAG
- // expected-no-diagnostics
-#endif
-
-NOTHROW_ATTR int nothrow_non_streaming_decl(void) NOEXCEPT_ATTR;
-NOTHROW_ATTR int nothrow_streaming_decl(void) NOEXCEPT_ATTR;
-NOTHROW_ATTR int nothrow_streaming_compatible_decl(void) NOEXCEPT_ATTR;
-
-// Streaming-mode changes which would require spilling VG if unwinding is possible, unsupported without SVE
-
-int streaming_caller_no_sve(void) __arm_streaming {
-#ifndef NO_EXCEPT_FLAG
- // expected-error at +2 {{function requires a streaming-mode change, unwinding is not possible without 'sve'. Consider marking this function as 'noexcept' or '__attribute__((nothrow))'}}
-#endif
- return non_streaming_decl();
-}
-
-int sc_caller_non_streaming_callee(void) __arm_streaming_compatible {
-#ifndef NO_EXCEPT_FLAG
- // expected-error at +2 {{function requires a streaming-mode change, unwinding is not possible without 'sve'. Consider marking this function as 'noexcept' or '__attribute__((nothrow))'}}
-#endif
- return non_streaming_decl();
-}
-
-__arm_locally_streaming int locally_streaming_no_sve(void) {
-#ifndef NO_EXCEPT_FLAG
- // expected-error at +2 {{unwinding is not possible for locally-streaming functions without 'sve'. Consider marking this function as 'noexcept' or '__attribute__((nothrow))'}}
-#endif
- return streaming_decl();
-}
-
-// Nothrow / noexcept attribute on callee - warnings not expected
-
-int nothrow_streaming_caller_no_sve(void) __arm_streaming {
- return nothrow_non_streaming_decl();
-}
-
-int nothrow_sc_caller_non_streaming_callee(void) __arm_streaming_compatible {
- return nothrow_non_streaming_decl();
-}
-
-__arm_locally_streaming int nothrow_locally_streaming_no_sve(void) {
- return nothrow_streaming_decl();
-}
-
-// No warnings expected, even if unwinding is possible
-
-int normal_caller_streaming_callee(void) {
- return streaming_decl();
-}
-
-int normal_caller_streaming_compatible_callee(void) {
- return streaming_compatible_decl();
-}
-
-int sc_caller_streaming_callee(void) __arm_streaming_compatible {
- return streaming_decl();
-}
-
-int sc_caller_sc_callee(void) __arm_streaming_compatible {
- return streaming_compatible_decl();
-}
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index f8d77ee329516..6647a13aab870 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1599,57 +1599,6 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case AArch64::COALESCER_BARRIER_FPR128:
MI.eraseFromParent();
return true;
- case AArch64::VGSavePseudo:
- case AArch64::VGRestorePseudo: {
- MachineFunction &MF = *MBB.getParent();
- SMEAttrs FuncAttrs(MF.getFunction());
- bool LocallyStreaming =
- FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
- const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-
- if (!AFI->requiresVGSpill(MF))
- return false;
-
- int64_t VGFrameIdx =
- LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
- assert(VGFrameIdx != std::numeric_limits<int>::max() &&
- "Expected FrameIdx for VG");
-
- const TargetSubtargetInfo &STI = MF.getSubtarget();
- const TargetInstrInfo &TII = *STI.getInstrInfo();
- const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
-
- if (Opcode == AArch64::VGSavePseudo) {
- // This pseudo has been inserted after a streaming-mode change
- // to save the streaming value of VG before a call.
- // Calculate and emit the CFI offset using VGFrameIdx.
- MachineFrameInfo &MFI = MF.getFrameInfo();
- const AArch64FrameLowering *TFI =
- MF.getSubtarget<AArch64Subtarget>().getFrameLowering();
-
- int64_t Offset =
- MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea();
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
- nullptr, TRI.getDwarfRegNum(AArch64::VG, true), Offset));
- BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
- TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
- } else {
- // This is a restore of VG after returning from the call. Emit the
- // .cfi_restore instruction, which sets the rule for VG to the same
- // as it was on entry to the function.
- ++MBBI;
- DebugLoc DL = MI.getDebugLoc();
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
- nullptr, TRI.getDwarfRegNum(AArch64::VG, true)));
- BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- }
-
- MI.eraseFromParent();
- return true;
- }
case AArch64::LD1B_2Z_IMM_PSEUDO:
return expandMultiVecPseudo(
MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 9f7227944c80b..b16f2db8e6081 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -321,7 +321,7 @@ bool AArch64FrameLowering::homogeneousPrologEpilog(
return false;
auto *AFI = MF.getInfo<AArch64FunctionInfo>();
- if (AFI->hasSwiftAsyncContext() || AFI->requiresVGSpill(MF))
+ if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
return false;
// If there are an odd number of GPRs before LR and FP in the CSRs list,
@@ -1349,6 +1349,32 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI,
ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
}
+bool requiresGetVGCall(MachineFunction &MF) {
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ return AFI->hasStreamingModeChanges() &&
+ !MF.getSubtarget<AArch64Subtarget>().hasSVE();
+}
+
+bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
+ unsigned Opc = MBBI->getOpcode();
+ if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
+ Opc == AArch64::UBFMXri)
+ return true;
+
+ if (requiresGetVGCall(*MBBI->getMF())) {
+ if (Opc == AArch64::ORRXrr)
+ return true;
+
+ if (Opc == AArch64::BL) {
+ auto Op1 = MBBI->getOperand(0);
+ return Op1.isSymbol() &&
+ (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
+ }
+ }
+
+ return false;
+}
+
// Convert callee-save register save/restore instruction to do stack pointer
// decrement/increment to allocate/deallocate the callee-save stack area by
// converting store/load to use pre/post increment version.
@@ -1364,16 +1390,11 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// to calculate the value of VG before spilling. For locally-streaming
// functions, we need to do this for both the streaming and non-streaming
// vector length. Move past these instructions if necessary.
- unsigned Opc = MBBI->getOpcode();
- if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI) {
- MachineFunction &MF = *MBB.getParent();
- AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- assert(AFI->requiresVGSpill(MF) &&
- "Unexpected callee-save save/restore opcode!");
- ++MBBI;
- if (MBBI->getOpcode() == AArch64::UBFMXri)
+ MachineFunction &MF = *MBB.getParent();
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ if (AFI->hasStreamingModeChanges())
+ while (isVGInstruction(MBBI))
++MBBI;
- }
switch (MBBI->getOpcode()) {
default:
@@ -1431,7 +1452,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// If the first store isn't right where we want SP then we can't fold the
// update in so create a normal arithmetic instruction instead.
- MachineFunction &MF = *MBB.getParent();
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
@@ -1683,6 +1703,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
LiveRegs.removeReg(AArch64::X19);
LiveRegs.removeReg(AArch64::FP);
LiveRegs.removeReg(AArch64::LR);
+
+ // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
+ // The call is needed to spill VG when SVE is unavailable. X0 is preserved
+ // around the call, so it does not need to be treated as live here.
+ if (requiresGetVGCall(MF))
+ LiveRegs.removeReg(AArch64::X0);
}
auto VerifyClobberOnExit = make_scope_exit([&]() {
@@ -1869,14 +1895,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// pointer bump above.
while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
!IsSVECalleeSave(MBBI)) {
- unsigned Opc = MBBI->getOpcode();
// Move past instructions generated to calculate VG
- if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI) {
- assert(AFI->requiresVGSpill(MF) && "Unexpected opcode!");
- ++MBBI;
- if (MBBI->getOpcode() == AArch64::UBFMXri)
+ if (AFI->hasStreamingModeChanges())
+ while (isVGInstruction(MBBI))
++MBBI;
- }
+
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
@@ -3041,6 +3064,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool NeedsWinCFI = needsWinCFI(MF);
+ bool HasSVE = MF.getSubtarget<AArch64Subtarget>().hasSVE();
DebugLoc DL;
SmallVector<RegPairInfo, 8> RegPairs;
@@ -3114,6 +3138,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
break;
}
+ unsigned X0Scratch = AArch64::NoRegister;
if (Reg1 == AArch64::VG) {
// Find an available register to store value of VG to.
Reg1 = findScratchNonCalleeSaveRegister(&MBB);
@@ -3135,10 +3160,32 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
AFI->setStreamingVGIdx(RPI.FrameIdx);
} else {
- BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
- .addImm(31)
- .addImm(1)
- .setMIFlag(MachineInstr::FrameSetup);
+ if (HasSVE)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
+ .addImm(31)
+ .addImm(1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ else {
+ const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
+ for (const auto &LiveIn : MBB.liveins())
+ if (STI.getRegisterInfo()->isSuperOrSubRegisterEq(AArch64::X0,
+ LiveIn.PhysReg))
+ X0Scratch = Reg1;
+
+ if (X0Scratch != AArch64::NoRegister)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
+ .addReg(AArch64::XZR)
+ .addReg(AArch64::X0, RegState::Undef)
+ .addReg(AArch64::X0, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // FIXME: Add PreserveMost_From_X1 regmask from PR #93963
+ BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
+ .addExternalSymbol("__arm_get_current_vg")
+ .addReg(AArch64::X0, RegState::ImplicitDefine)
+ .setMIFlag(MachineInstr::FrameSetup);
+ Reg1 = AArch64::X0;
+ }
AFI->setVGIdx(RPI.FrameIdx);
}
}
@@ -3234,6 +3281,13 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
if (RPI.isPaired())
MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
}
+
+ if (X0Scratch != AArch64::NoRegister)
+ BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0)
+ .addReg(AArch64::XZR)
+ .addReg(X0Scratch, RegState::Undef)
+ .addReg(X0Scratch, RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
}
return true;
}
@@ -3520,7 +3574,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// non-streaming VG value.
const Function &F = MF.getFunction();
SMEAttrs Attrs(F);
- if (AFI->requiresVGSpill(MF)) {
+ if (AFI->hasStreamingModeChanges()) {
if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
CSStackSize += 16;
else
@@ -3664,7 +3718,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
}
// Insert VG into the list of CSRs, immediately before LR if saved.
- if (AFI->requiresVGSpill(MF)) {
+ if (AFI->hasStreamingModeChanges()) {
std::vector<CalleeSavedInfo> VGSaves;
SMEAttrs Attrs(MF.getFunction());
@@ -4305,12 +4359,58 @@ MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
}
} // namespace
+MachineBasicBlock::iterator emitVGSaveRestore(MachineBasicBlock::iterator II,
+ const AArch64FrameLowering *TFI) {
+ MachineInstr &MI = *II;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+
+ if (MI.getOpcode() != AArch64::VGSavePseudo &&
+ MI.getOpcode() != AArch64::VGRestorePseudo)
+ return II;
+
+ SMEAttrs FuncAttrs(MF->getFunction());
+ bool LocallyStreaming =
+ FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
+ const AArch64FunctionInfo *AFI = MF->getInfo<AArch64FunctionInfo>();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ int64_t VGFrameIdx =
+ LocallyStreaming ? AFI->getStreamingVGIdx() : AFI->getVGIdx();
+ assert(VGFrameIdx != std::numeric_limits<int>::max() &&
+ "Expected FrameIdx for VG");
+
+ unsigned CFIIndex;
+ if (MI.getOpcode() == AArch64::VGSavePseudo) {
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ int64_t Offset =
+ MFI.getObjectOffset(VGFrameIdx) - TFI->getOffsetOfLocalArea();
+ CFIIndex = MF->addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, TRI->getDwarfRegNum(AArch64::VG, true), Offset));
+ } else
+ CFIIndex = MF->addFrameInst(MCCFIInstruction::createRestore(
+ nullptr, TRI->getDwarfRegNum(AArch64::VG, true)));
+
+ MachineInstr *UnwindInst = BuildMI(*MBB, II, II->getDebugLoc(),
+ TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ MI.eraseFromParent();
+ return UnwindInst->getIterator();
+}
+
void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
MachineFunction &MF, RegScavenger *RS = nullptr) const {
- if (StackTaggingMergeSetTag)
- for (auto &BB : MF)
- for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ for (auto &BB : MF)
+ for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
+ if (AFI->hasStreamingModeChanges())
+ II = emitVGSaveRestore(II, this);
+ if (StackTaggingMergeSetTag)
II = tryMergeAdjacentSTG(II, this, RS);
+ }
}
/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ac13854124253..4d11f731e096e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8513,11 +8513,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue InGlue;
if (RequiresSMChange) {
- if (Subtarget->hasSVE()) {
- Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
- DAG.getVTList(MVT::Other, MVT::Glue), Chain);
- InGlue = Chain.getValue(1);
- }
+ Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), Chain);
+ InGlue = Chain.getValue(1);
SDValue NewChain = changeStreamingMode(
DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
@@ -8693,16 +8691,14 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresSMChange) {
assert(PStateSM && "Expected a PStateSM to be set");
-
Result = changeStreamingMode(
DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
InGlue = Result.getValue(1);
- if (Subtarget->hasSVE())
- Result =
- DAG.getNode(AArch64ISD::VG_RESTORE, DL,
- DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
+ Result =
+ DAG.getNode(AArch64ISD::VG_RESTORE, DL,
+ DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
}
if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index aee0986203889..839a3a3878076 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -243,7 +243,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
void setVGIdx(unsigned Idx) { VGIdx = Idx; };
int64_t getStreamingVGIdx() const { return StreamingVGIdx; };
- void setStreamingVGIdx(unsigned Idx) { StreamingVGIdx = Idx; };
+ void setStreamingVGIdx(unsigned FrameIdx) { StreamingVGIdx = FrameIdx; };
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
@@ -505,11 +505,6 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
HasStreamingModeChanges = HasChanges;
}
- bool requiresVGSpill(const MachineFunction &MF) const {
- const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
- return STI.hasSVE() && HasStreamingModeChanges;
- }
-
bool hasStackProbing() const { return StackProbeSize != 0; }
int64_t getStackProbeSize() const { return StackProbeSize; }
diff --git a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
index 07377195d62a0..c4440e7bcc3ff 100644
--- a/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
+++ b/llvm/test/CodeGen/AArch64/sme-call-streaming-compatible-to-normal-fn-wihout-sme-attr.ll
@@ -10,11 +10,13 @@ target triple = "aarch64"
define void @streaming_compatible() #0 {
; CHECK-LABEL: streaming_compatible:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: stp x0, x19, [sp, #72] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB0_2
@@ -26,11 +28,12 @@ define void @streaming_compatible() #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB0_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @non_streaming()
ret void
@@ -44,12 +47,14 @@ declare void @non_streaming()
define void @streaming_compatible_arg(float %f) #0 {
; CHECK-LABEL: streaming_compatible_arg:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: bl __arm_get_current_vg
+; CHECK-NEXT: stp x0, x19, [sp, #88] // 16-byte Folded Spill
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
@@ -63,12 +68,13 @@ define void @streaming_compatible_arg(float %f) #0 {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
call void @non_streaming(float %f)
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 254e37e836cbb..d786ffd412c47 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
+; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FISEL
-; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
+; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-GISEL
@@ -17,6 +17,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-FISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-FISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-FISEL-NEXT: cntd x9
+; CHECK-FISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-FISEL-NEXT: smstart sm
; CHECK-FISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -43,6 +45,8 @@ define double @nonstreaming_caller_streaming_callee(double %x) nounwind noinline
; CHECK-GISEL-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-GISEL-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-GISEL-NEXT: cntd x9
+; CHECK-GISEL-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-GISEL-NEXT: smstart sm
; CHECK-GISEL-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -76,6 +80,8 @@ define double @streaming_caller_nonstreaming_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -102,12 +108,17 @@ entry:
define double @locally_streaming_caller_normal_callee(double %x) nounwind noinline optnone "aarch64_pstate_sm_body" {
; CHECK-COMMON-LABEL: locally_streaming_caller_normal_callee:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: sub sp, sp, #128
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: lsr x9, x9, #3
+; CHECK-COMMON-NEXT: str x9, [sp, #104] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #112] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: str d0, [sp, #24] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: ldr d0, [sp, #24] // 8-byte Folded Reload
@@ -129,7 +140,7 @@ define double @locally_streaming_caller_normal_callee(double %x) nounwind noinli
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: add sp, sp, #112
+; CHECK-COMMON-NEXT: add sp, sp, #128
; CHECK-COMMON-NEXT: ret
%call = call double @normal_callee(double %x);
%add = fadd double %call, 4.200000e+01
@@ -166,11 +177,16 @@ define double @normal_caller_to_locally_streaming_callee(double %x) nounwind noi
define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noinline optnone "aarch64_pstate_sm_body" {
; CHECK-COMMON-LABEL: locally_streaming_caller_streaming_callee_ptr:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: rdsvl x9, #1
+; CHECK-COMMON-NEXT: lsr x9, x9, #3
+; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: blr x0
; CHECK-COMMON-NEXT: smstop sm
@@ -178,7 +194,7 @@ define void @locally_streaming_caller_streaming_callee_ptr(ptr %p) nounwind noin
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
call void %p() "aarch64_pstate_sm_enabled"
ret void
@@ -192,6 +208,8 @@ define void @normal_call_to_streaming_callee_ptr(ptr %p) nounwind noinline optno
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: cntd x9
+; CHECK-COMMON-NEXT: str x9, [sp, #72] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: blr x0
; CHECK-COMMON-NEXT: smstop sm
@@ -214,7 +232,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za"
define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
; CHECK-COMMON: // %bb.0: // %prelude
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -240,7 +259,8 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: smstop za
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
entry:
%call = call double @za_shared_callee(double %x)
@@ -251,7 +271,8 @@ entry:
define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{
; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee:
; CHECK-COMMON: // %bb.0: // %entry
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -279,7 +300,8 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: fmov d1, x8
; CHECK-COMMON-NEXT: fadd d0, d0, d1
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
entry:
%call = call double @normal_callee(double %x)
@@ -291,7 +313,8 @@ entry:
define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-LABEL: f128_call_za:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -314,7 +337,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: .LBB8_2:
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = fadd fp128 %a, %b
ret fp128 %res
@@ -326,21 +350,22 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-LABEL: f128_call_sm:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
; CHECK-COMMON-NEXT: bl __addtf3
; CHECK-COMMON-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
-; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: add sp, sp, #112
@@ -353,7 +378,8 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-LABEL: frem_call_za:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: mov x29, sp
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
@@ -376,7 +402,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
; CHECK-COMMON-NEXT: .LBB10_2:
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr
; CHECK-COMMON-NEXT: mov sp, x29
-; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ret
%res = frem double %a, %b
ret double %res
@@ -387,21 +414,22 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
; CHECK-COMMON-LABEL: frem_call_sm:
; CHECK-COMMON: // %bb.0:
; CHECK-COMMON-NEXT: sub sp, sp, #96
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: smstop sm
; CHECK-COMMON-NEXT: ldp s0, s1, [sp, #8] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: bl fmodf
; CHECK-COMMON-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-COMMON-NEXT: smstart sm
-; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: add sp, sp, #96
@@ -414,12 +442,14 @@ define float @frem_call_sm(float %a, float %b) "aarch64_pstate_sm_enabled" nounw
define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-COMMON-LABEL: frem_call_sm_compat:
; CHECK-COMMON: // %bb.0:
-; CHECK-COMMON-NEXT: sub sp, sp, #96
+; CHECK-COMMON-NEXT: sub sp, sp, #112
+; CHECK-COMMON-NEXT: cntd x9
; CHECK-COMMON-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-COMMON-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT: str x19, [sp, #96] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-COMMON-NEXT: bl __arm_sme_state
; CHECK-COMMON-NEXT: and x19, x0, #0x1
@@ -434,13 +464,14 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
; CHECK-COMMON-NEXT: // %bb.3:
; CHECK-COMMON-NEXT: smstart sm
; CHECK-COMMON-NEXT: .LBB12_4:
-; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-COMMON-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT: add sp, sp, #96
+; CHECK-COMMON-NEXT: add sp, sp, #112
; CHECK-COMMON-NEXT: ret
%res = frem float %a, %b
ret float %res
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0b88f19..b0d6e046042e6 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve -mattr=+sme < %s | FileCheck %s
declare void @private_za_callee()
declare float @llvm.cos.f32(float)
@@ -8,7 +8,8 @@ declare float @llvm.cos.f32(float)
define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_1_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -31,7 +32,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
@@ -41,20 +43,21 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_2_callees:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT: str x21, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x19, #1
+; CHECK-NEXT: rdsvl x20, #1
; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: msub x8, x19, x19, x8
+; CHECK-NEXT: msub x8, x20, x20, x8
; CHECK-NEXT: mov sp, x8
-; CHECK-NEXT: sub x20, x29, #16
+; CHECK-NEXT: sub x21, x29, #16
; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x8, [x29, #-16]
-; CHECK-NEXT: sturh w19, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -64,8 +67,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: bl __arm_tpidr2_restore
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
-; CHECK-NEXT: sturh w19, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x20
+; CHECK-NEXT: sturh w20, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x21
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -76,8 +79,9 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x21, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #48 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
call void @private_za_callee()
@@ -88,7 +92,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" {
; CHECK-LABEL: test_lazy_save_expanded_intrinsic:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
@@ -111,7 +116,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: mov sp, x29
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call float @llvm.cos.f32(float %a)
ret float %res
@@ -121,13 +127,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za" "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: test_lazy_save_and_conditional_smstart:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
-; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
@@ -140,13 +148,13 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: sturh w8, [x29, #-72]
; CHECK-NEXT: msr TPIDR2_EL0, x10
; CHECK-NEXT: bl __arm_sme_state
-; CHECK-NEXT: and x19, x0, #0x1
-; CHECK-NEXT: tbz w19, #0, .LBB3_2
+; CHECK-NEXT: and x20, x0, #0x1
+; CHECK-NEXT: tbz w20, #0, .LBB3_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: bl private_za_callee
-; CHECK-NEXT: tbz w19, #0, .LBB3_4
+; CHECK-NEXT: tbz w20, #0, .LBB3_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB3_4:
@@ -159,12 +167,12 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: .LBB3_6:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
+; CHECK-NEXT: ldp x20, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #112 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_callee()
ret void
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index d67573384ca95..6c8aff585808f 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
declare void @normal_callee();
declare void @streaming_callee() "aarch64_pstate_sm_enabled";
@@ -8,11 +8,15 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sm_body_sm_compatible_simple:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x8, x0, #0x1
; CHECK-NEXT: tbnz w8, #0, .LBB0_2
@@ -28,7 +32,7 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
ret float zeroinitializer
}
@@ -36,11 +40,15 @@ define float @sm_body_sm_compatible_simple() "aarch64_pstate_sm_compatible" "aar
define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sm_body_caller_sm_compatible_caller_normal_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB1_2
@@ -54,11 +62,12 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @normal_callee()
ret void
@@ -68,12 +77,16 @@ define void @sm_body_caller_sm_compatible_caller_normal_callee() "aarch64_pstate
define void @streaming_body_and_streaming_compatible_interface_multi_basic_block(i32 noundef %x) "aarch64_pstate_sm_compatible" "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: streaming_body_and_streaming_compatible_interface_multi_basic_block:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov w8, w0
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
@@ -87,11 +100,12 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: // %bb.4: // %if.else
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_5: // %if.else
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_6: // %if.then
; CHECK-NEXT: smstop sm
@@ -101,11 +115,12 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: // %bb.7: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_8: // %if.then
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
%cmp = icmp eq i32 %x, 0
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
index cd6d45f54eb9a..3afd571ffba28 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s
declare void @normal_callee();
declare void @streaming_callee() "aarch64_pstate_sm_enabled";
@@ -8,11 +8,15 @@ declare void @streaming_compatible_callee() "aarch64_pstate_sm_compatible";
define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_compatible_callee
; CHECK-NEXT: bl streaming_compatible_callee
@@ -21,7 +25,7 @@ define void @locally_streaming_caller_streaming_callee() "aarch64_pstate_sm_body
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_compatible_callee();
@@ -47,26 +51,33 @@ define void @streaming_and_locally_streaming_caller_streaming_callee() "aarch64_
define void @locally_streaming_multiple_exit(i64 %cond) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_multiple_exit:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: cmp x0, #1
; CHECK-NEXT: b.ne .LBB2_2
; CHECK-NEXT: // %bb.1: // %if.else
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_2: // %if.end
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
entry:
@@ -87,11 +98,16 @@ if.end:
define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_no_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: index z0.d, #0, #1
@@ -102,12 +118,12 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
%add = add <2 x i64> %a, <i64 41, i64 42>;
@@ -120,11 +136,15 @@ define <2 x i64> @locally_streaming_caller_no_callee(<2 x i64> %a) "aarch64_psta
define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_locally_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl locally_streaming_caller_streaming_callee
@@ -134,7 +154,7 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @locally_streaming_caller_streaming_callee();
@@ -151,12 +171,16 @@ define void @locally_streaming_caller_locally_streaming_callee() "aarch64_pstate
define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_compatible_callee_vec_args_ret:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -169,7 +193,7 @@ define <2 x i64> @locally_streaming_caller_compatible_callee_vec_args_ret(<2 x i
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
%res = call <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64> %a) "aarch64_pstate_sm_compatible"
ret <2 x i64> %res;
@@ -180,12 +204,16 @@ declare <2 x i64> @streaming_compatible_callee_vec_args_ret(<2 x i64>) "aarch64_
define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct_arg_ret({<2 x i64>, <2 x i64>} %arg) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: locally_streaming_caller_compatible_callee_struct_arg_ret:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #112] // 8-byte Folded Spill
; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -198,7 +226,7 @@ define {<2 x i64>, <2 x i64>} @locally_streaming_caller_compatible_callee_struct
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: ret
%v1.arg = extractvalue {<2 x i64>, <2 x i64>} %arg, 1
%res = call {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<2 x i64> %v1.arg) "aarch64_pstate_sm_compatible"
@@ -212,11 +240,16 @@ declare {<2 x i64>, <2 x i64>} @streaming_compatible_callee_vec_arg_struct_ret(<
define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: locally_streaming_caller_alloca:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #88] // 8-byte Folded Spill
; CHECK-NEXT: addsvl sp, sp, #-1
; CHECK-NEXT: smstart sm
; CHECK-NEXT: mov x0, sp
@@ -227,7 +260,7 @@ define void @locally_streaming_caller_alloca() nounwind "aarch64_pstate_sm_body"
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
%alloca = alloca <vscale x 4 x i32>
call void @use_ptr(ptr %alloca) "aarch64_pstate_sm_compatible"
@@ -239,12 +272,16 @@ declare void @use_ptr(ptr) "aarch64_pstate_sm_compatible"
define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: call_to_intrinsic_without_chain:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #96] // 8-byte Folded Spill
; CHECK-NEXT: str d0, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: smstop sm
@@ -259,7 +296,7 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #96
+; CHECK-NEXT: add sp, sp, #112
; CHECK-NEXT: ret
entry:
%0 = call fast double @llvm.cos.f64(double %x)
@@ -272,11 +309,16 @@ declare double @llvm.cos.f64(double)
define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstate_sm_body" {
; CHECK-LABEL: test_arg_survives_loop:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: rdsvl x9, #1
+; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
+; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB9_1: // %for.body
@@ -289,12 +331,12 @@ define float @test_arg_survives_loop(float %arg, i32 %N) nounwind "aarch64_pstat
; CHECK-NEXT: fadd s0, s1, s0
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #80
+; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
entry:
br label %for.body
@@ -314,11 +356,15 @@ for.cond.cleanup:
define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_compatible_callee
; CHECK-NEXT: smstop sm
@@ -326,7 +372,7 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_body" nounwind {
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @streaming_compatible_callee();
ret void;
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 1e16f140676ba..0c4da665337c0 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -mattr=+sve -mattr=+sme < %s | FileCheck %s
; This file tests the following combinations related to streaming-enabled functions:
; [ ] N -> SC (Normal -> Streaming-compatible)
@@ -36,11 +36,13 @@ define void @normal_caller_streaming_compatible_callee() nounwind {
define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_normal_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB1_2
@@ -52,11 +54,12 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB1_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @normal_callee();
@@ -72,11 +75,13 @@ define void @streaming_compatible_caller_normal_callee() "aarch64_pstate_sm_comp
define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_caller_streaming_callee:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
@@ -88,11 +93,12 @@ define void @streaming_compatible_caller_streaming_callee() "aarch64_pstate_sm_c
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB2_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee();
@@ -124,11 +130,12 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-LABEL: streaming_compatible_with_neon_vectors:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: add x8, sp, #16
@@ -136,10 +143,10 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: add x8, sp, #16
-; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB4_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
@@ -160,8 +167,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) "
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -176,8 +183,9 @@ declare <2 x double> @normal_callee_vec_arg(<2 x double>)
define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale x 2 x double> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_scalable_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -255,8 +263,8 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x double> %arg)
%fadd = fadd <vscale x 2 x double> %res, %arg
@@ -268,8 +276,9 @@ declare <vscale x 2 x double> @normal_callee_scalable_vec_arg(<vscale x 2 x doub
define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x 2 x i1> %arg) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: streaming_compatible_with_predicate_vectors:
; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-32]! // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x9, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -347,8 +356,8 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #24] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
%res = call <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1> %arg)
%and = and <vscale x 2 x i1> %res, %arg
@@ -360,11 +369,13 @@ declare <vscale x 2 x i1> @normal_callee_predicate_vec_arg(<vscale x 2 x i1>)
define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: conditional_smstart_unreachable_block:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB7_2
@@ -372,6 +383,10 @@ define i32 @conditional_smstart_unreachable_block() "aarch64_pstate_sm_compatibl
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB7_2:
; CHECK-NEXT: bl streaming_callee
+; CHECK-NEXT: tbnz w19, #0, .LBB7_4
+; CHECK-NEXT: // %bb.3:
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB7_4:
call void @streaming_callee()
unreachable
}
@@ -381,11 +396,13 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK: // %bb.0:
; CHECK-NEXT: tbz w0, #0, .LBB8_6
; CHECK-NEXT: // %bb.1: // %if.then
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB8_3
@@ -397,11 +414,12 @@ define void @conditional_smstart_no_successor_block(i1 %p) "aarch64_pstate_sm_co
; CHECK-NEXT: // %bb.4: // %if.then
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB8_5: // %if.then
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: .LBB8_6: // %exit
; CHECK-NEXT: ret
br i1 %p, label %if.then, label %exit
@@ -417,11 +435,13 @@ exit:
define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB9_2
@@ -433,11 +453,12 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB9_4:
-; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
tail call void @normal_callee();
@@ -447,29 +468,32 @@ define void @disable_tailcallopt() "aarch64_pstate_sm_compatible" nounwind {
define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2) "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 112
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: .cfi_offset b8, -24
-; CHECK-NEXT: .cfi_offset b9, -32
-; CHECK-NEXT: .cfi_offset b10, -40
-; CHECK-NEXT: .cfi_offset b11, -48
-; CHECK-NEXT: .cfi_offset b12, -56
-; CHECK-NEXT: .cfi_offset b13, -64
-; CHECK-NEXT: .cfi_offset b14, -72
-; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -32
+; CHECK-NEXT: .cfi_offset b8, -40
+; CHECK-NEXT: .cfi_offset b9, -48
+; CHECK-NEXT: .cfi_offset b10, -56
+; CHECK-NEXT: .cfi_offset b11, -64
+; CHECK-NEXT: .cfi_offset b12, -72
+; CHECK-NEXT: .cfi_offset b13, -80
+; CHECK-NEXT: .cfi_offset b14, -88
+; CHECK-NEXT: .cfi_offset b15, -96
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x8, x1
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
+; CHECK-NEXT: .cfi_offset vg, -24
; CHECK-NEXT: tbz w19, #0, .LBB10_2
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: smstop sm
@@ -483,12 +507,14 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-NEXT: // %bb.3: // %entry
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB10_4: // %entry
-; CHECK-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x19, [sp, #112] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: add sp, sp, #128
; CHECK-NEXT: ret
entry:
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 465fb4667af69..4321493434230 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
; This file tests the following combinations related to streaming-enabled functions:
; [ ] N -> S (Normal -> Streaming)
@@ -22,10 +22,11 @@ define void @normal_caller_streaming_callee() nounwind {
; CHECK-LABEL: normal_caller_streaming_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
@@ -47,10 +48,11 @@ define void @streaming_caller_normal_callee() nounwind "aarch64_pstate_sm_enable
; CHECK-LABEL: streaming_caller_normal_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: bl normal_callee
; CHECK-NEXT: smstart sm
@@ -103,10 +105,11 @@ define void @call_to_function_pointer_streaming_enabled(ptr %p) nounwind {
; CHECK-LABEL: call_to_function_pointer_streaming_enabled:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: blr x0
; CHECK-NEXT: smstop sm
@@ -125,19 +128,20 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_simdfp:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
@@ -150,7 +154,9 @@ define <4 x i32> @smstart_clobber_simdfp(<4 x i32> %x) nounwind {
define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_sve:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -216,7 +222,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee()
ret <vscale x 4 x i32> %x;
@@ -227,7 +233,9 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x) nounwind {
; CHECK-LABEL: smstart_clobber_sve_duplicate:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
@@ -296,7 +304,7 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @streaming_callee()
call void @streaming_callee()
@@ -308,11 +316,12 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-LABEL: call_to_intrinsic_without_chain:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #96
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: stp d0, d0, [sp] // 16-byte Folded Spill
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr d0, [sp] // 8-byte Folded Reload
@@ -320,11 +329,11 @@ define double @call_to_intrinsic_without_chain(double %x) nounwind "aarch64_psta
; CHECK-NEXT: str d0, [sp] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: ldp d1, d0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
+; CHECK-NEXT: fadd d0, d1, d0
; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: fadd d0, d1, d0
; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
; CHECK-NEXT: ret
@@ -342,10 +351,11 @@ define void @disable_tailcallopt() nounwind {
; CHECK-LABEL: disable_tailcallopt:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: bl streaming_callee
; CHECK-NEXT: smstop sm
@@ -362,11 +372,13 @@ define void @disable_tailcallopt() nounwind {
define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone %ptr) #0 {
; CHECK-LABEL: call_to_non_streaming_pass_sve_objects:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: rdsvl x3, #1
; CHECK-NEXT: addvl x0, sp, #2
@@ -383,7 +395,7 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
entry:
%Data1 = alloca <vscale x 16 x i8>, align 16
@@ -400,11 +412,12 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: stp d2, d3, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp s0, s1, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: smstop sm
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
index 45ca7844b0655..ac19bd59babe4 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll
@@ -15,11 +15,12 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-LABEL: test_no_stackslot_scavenging:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x29, [sp, #64] // 8-byte Folded Spill
-; CHECK-NEXT: stp x30, x24, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp x9, x24, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: str s0, [sp, #12] // 4-byte Folded Spill
@@ -31,8 +32,8 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
; CHECK-NEXT: smstart sm
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x30, x24, [sp, #80] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x24, [sp, #88] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -46,4 +47,4 @@ define void @test_no_stackslot_scavenging(float %f) #0 {
declare void @use_f(float)
-attributes #0 = { nounwind "target-features"="+sme" "aarch64_pstate_sm_enabled" }
+attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" }
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index b0c8c05f46169..9ad68820bb27b 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -7,6 +7,7 @@ declare void @fixed_callee(<4 x i32>);
declare void @scalable_callee(<vscale x 2 x i64>);
declare void @streaming_callee() #0;
+declare void @streaming_callee_with_arg(i32) #0;
; Simple example of a function with one call requiring a streaming mode change
;
@@ -1002,22 +1003,27 @@ define void @streaming_compatible_to_non_streaming() #4 {
ret void
}
-; If the target does not have SVE, do not spill VG even if the function
-; has streaming-mode changes.
+; If the target does not have SVE, do not emit cntd in the prologue and
+; instead spill the result returned by __arm_get_current_vg.
+; The routine returns VG in X0, which also holds the argument %x, so %x
+; must be preserved across the call.
;
-define void @streaming_compatible_no_sve() #4 {
+define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
; NO-SVE-CHECK-LABEL: streaming_compatible_no_sve:
; NO-SVE-CHECK: // %bb.0:
; NO-SVE-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; NO-SVE-CHECK-NEXT: .cfi_def_cfa_offset 96
; NO-SVE-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: mov x9, x0
; NO-SVE-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; NO-SVE-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; NO-SVE-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; NO-SVE-CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; NO-SVE-CHECK-NEXT: bl __arm_get_current_vg
+; NO-SVE-CHECK-NEXT: stp x0, x19, [sp, #80] // 16-byte Folded Spill
+; NO-SVE-CHECK-NEXT: mov x0, x9
; NO-SVE-CHECK-NEXT: add x29, sp, #64
; NO-SVE-CHECK-NEXT: .cfi_def_cfa w29, 32
-; NO-SVE-CHECK-NEXT: .cfi_offset w19, -16
+; NO-SVE-CHECK-NEXT: .cfi_offset w19, -8
; NO-SVE-CHECK-NEXT: .cfi_offset w30, -24
; NO-SVE-CHECK-NEXT: .cfi_offset w29, -32
; NO-SVE-CHECK-NEXT: .cfi_offset b8, -40
@@ -1028,20 +1034,24 @@ define void @streaming_compatible_no_sve() #4 {
; NO-SVE-CHECK-NEXT: .cfi_offset b13, -80
; NO-SVE-CHECK-NEXT: .cfi_offset b14, -88
; NO-SVE-CHECK-NEXT: .cfi_offset b15, -96
+; NO-SVE-CHECK-NEXT: mov w8, w0
; NO-SVE-CHECK-NEXT: bl __arm_sme_state
; NO-SVE-CHECK-NEXT: and x19, x0, #0x1
+; NO-SVE-CHECK-NEXT: .cfi_offset vg, -16
; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_2
; NO-SVE-CHECK-NEXT: // %bb.1:
; NO-SVE-CHECK-NEXT: smstart sm
; NO-SVE-CHECK-NEXT: .LBB8_2:
-; NO-SVE-CHECK-NEXT: bl streaming_callee
+; NO-SVE-CHECK-NEXT: mov w0, w8
+; NO-SVE-CHECK-NEXT: bl streaming_callee_with_arg
; NO-SVE-CHECK-NEXT: tbnz w19, #0, .LBB8_4
; NO-SVE-CHECK-NEXT: // %bb.3:
; NO-SVE-CHECK-NEXT: smstop sm
; NO-SVE-CHECK-NEXT: .LBB8_4:
+; NO-SVE-CHECK-NEXT: .cfi_restore vg
; NO-SVE-CHECK-NEXT: .cfi_def_cfa wsp, 96
; NO-SVE-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; NO-SVE-CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
+; NO-SVE-CHECK-NEXT: ldr x19, [sp, #88] // 8-byte Folded Reload
; NO-SVE-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; NO-SVE-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; NO-SVE-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
@@ -1060,7 +1070,7 @@ define void @streaming_compatible_no_sve() #4 {
; NO-SVE-CHECK-NEXT: .cfi_restore b15
; NO-SVE-CHECK-NEXT: ret
;
- call void @streaming_callee()
+ call void @streaming_callee_with_arg(i32 %x)
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index c39894c27d9d4..106d6190e88b9 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs -aarch64-lower-to-sme-routines=false < %s | FileCheck %s -check-prefixes=CHECK-NO-SME-ROUTINES
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -mattr=+mops -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MOPS
@dst = global [512 x i8] zeroinitializer, align 1
@src = global [512 x i8] zeroinitializer, align 1
@@ -22,13 +22,14 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -71,12 +72,13 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memset:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #2 // =0x2
@@ -119,13 +121,14 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-LABEL: se_memmove:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -168,16 +171,18 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
;
; CHECK-NO-SME-ROUTINES-LABEL: sc_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state
; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: ldr x8, [x8, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2
@@ -190,11 +195,12 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: smstart sm
; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_4: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ret
;
; CHECK-MOPS-LABEL: sc_memcpy:
@@ -215,12 +221,16 @@ entry:
define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-LABEL: sb_memcpy:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x2, x0
+; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: smstart sm
; CHECK-NEXT: adrp x0, :got:dst
; CHECK-NEXT: adrp x1, :got:src
@@ -232,17 +242,21 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
;
; CHECK-NO-SME-ROUTINES-LABEL: sb_memcpy:
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
-; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: rdsvl x9, #1
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: lsr x9, x9, #3
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: smstart sm
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
@@ -256,15 +270,20 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NO-SME-ROUTINES-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NO-SME-ROUTINES-NEXT: ret
;
; CHECK-MOPS-LABEL: sb_memcpy:
; CHECK-MOPS: // %bb.0: // %entry
-; CHECK-MOPS-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-MOPS-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: rdsvl x9, #1
+; CHECK-MOPS-NEXT: lsr x9, x9, #3
+; CHECK-MOPS-NEXT: str x9, [sp, #-80]! // 8-byte Folded Spill
+; CHECK-MOPS-NEXT: cntd x9
+; CHECK-MOPS-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: str x9, [sp, #8] // 8-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-MOPS-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
; CHECK-MOPS-NEXT: smstart sm
; CHECK-MOPS-NEXT: adrp x8, :got:src
; CHECK-MOPS-NEXT: adrp x9, :got:dst
@@ -274,10 +293,11 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-MOPS-NEXT: cpyfm [x9]!, [x8]!, x0!
; CHECK-MOPS-NEXT: cpyfe [x9]!, [x8]!, x0!
; CHECK-MOPS-NEXT: smstop sm
-; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
-; CHECK-MOPS-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-MOPS-NEXT: add sp, sp, #80
; CHECK-MOPS-NEXT: ret
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr align 1 @dst, ptr nonnull align 1 @src, i64 %n, i1 false)
>From 2750fb2ad6b59450fcd73f2dc3685cf70369479c Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Thu, 6 Jun 2024 10:37:23 +0000
Subject: [PATCH 10/13] - Added PreserveMost_From_X1 regmask to
__arm_get_current_vg call
---
.../AArch64/AArch64ExpandPseudoInsts.cpp | 1 -
.../Target/AArch64/AArch64FrameLowering.cpp | 5 +-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 104 +++++++-----------
3 files changed, 42 insertions(+), 68 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 6647a13aab870..9b7fc228d5de8 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -29,7 +29,6 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index b16f2db8e6081..3bb199eb9046c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3179,9 +3179,12 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
.addReg(AArch64::X0, RegState::Implicit)
.setMIFlag(MachineInstr::FrameSetup);
- // FIXME: Add PreserveMost_From_X1 regmask from PR #93963
+ const uint32_t *RegMask = TRI->getCallPreservedMask(
+ MF, CallingConv::
+ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
.addExternalSymbol("__arm_get_current_vg")
+ .addRegMask(RegMask)
.addReg(AArch64::X0, RegState::ImplicitDefine)
.setMIFlag(MachineInstr::FrameSetup);
Reg1 = AArch64::X0;
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 9ad68820bb27b..a6ff46884c118 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -319,34 +319,27 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: addvl sp, sp, #-18
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG
; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG
@@ -368,23 +361,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; CHECK-NEXT: .cfi_restore vg
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG
-; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ptrue pn8.b
; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
@@ -428,34 +414,27 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_offset w30, -40
; FP-CHECK-NEXT: .cfi_offset w29, -48
; FP-CHECK-NEXT: addvl sp, sp, #-18
+; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: ptrue pn8.b
; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; FP-CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG
; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG
@@ -475,23 +454,16 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: smstart sm
; FP-CHECK-NEXT: .cfi_restore vg
; FP-CHECK-NEXT: addvl sp, sp, #1
-; FP-CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ptrue pn8.b
; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload
+; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload
; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
>From 53105fd6235687e905d109b832c34b3fd7d36e68 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Fri, 7 Jun 2024 15:44:51 +0000
Subject: [PATCH 11/13] - Return true from needsAsyncDwarfUnwindInfo() if the
function contains a streaming-mode change.
---
.../AArch64/AArch64MachineFunctionInfo.cpp | 8 +-
.../CodeGen/AArch64/sme-lazy-save-call.ll | 2 +-
...ing-body-streaming-compatible-interface.ll | 2 +-
.../sme-streaming-compatible-interface.ll | 13 ++-
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 94 +++++++++++++++++++
.../streaming-compatible-memory-ops.ll | 22 ++---
6 files changed, 124 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index c3d64f5a0a965..f66ce4c32902b 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -196,12 +196,14 @@ bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo(
const MachineFunction &MF) const {
if (!NeedsAsyncDwarfUnwindInfo) {
const Function &F = MF.getFunction();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// The check for "minsize" is because epilogue unwind info is not emitted
// (yet) for homogeneous epilogues, outlined functions, and functions
// outlined from.
- NeedsAsyncDwarfUnwindInfo = needsDwarfUnwindInfo(MF) &&
- F.getUWTableKind() == UWTableKind::Async &&
- !F.hasMinSize();
+ NeedsAsyncDwarfUnwindInfo =
+ (needsDwarfUnwindInfo(MF) && F.getUWTableKind() == UWTableKind::Async &&
+ !F.hasMinSize()) ||
+ AFI->hasStreamingModeChanges();
}
return *NeedsAsyncDwarfUnwindInfo;
}
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index b0d6e046042e6..a6542ff8c7691 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -133,9 +133,9 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index 6c8aff585808f..3ec280e946601 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -80,13 +80,13 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 0c4da665337c0..58992eb2d592a 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -469,6 +469,7 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-LABEL: call_to_non_streaming_pass_args:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 128
; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp d15, d14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #48] // 16-byte Folded Spill
@@ -476,7 +477,6 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT: stp x30, x9, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT: str x19, [sp, #112] // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 128
; CHECK-NEXT: .cfi_offset w19, -16
; CHECK-NEXT: .cfi_offset w30, -32
; CHECK-NEXT: .cfi_offset b8, -40
@@ -515,6 +515,17 @@ define void @call_to_non_streaming_pass_args(ptr nocapture noundef readnone %ptr
; CHECK-NEXT: ldp d13, d12, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
; CHECK-NEXT: ret
entry:
call void @bar(ptr noundef nonnull %ptr, i64 %long1, i64 %long2, i32 %int1, i32 %int2, float %float1, float %float2, double %double1, double %double2)
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index a6ff46884c118..5e241dbdc32ad 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -1046,7 +1046,101 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
ret void
}
+; Ensure we still emit async unwind information with -fno-asynchronous-unwind-tables
+; if the function contains a streaming-mode change.
+
+define void @vg_unwind_noasync() #5 {
+; CHECK-LABEL: vg_unwind_noasync:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: .cfi_offset b8, -24
+; CHECK-NEXT: .cfi_offset b9, -32
+; CHECK-NEXT: .cfi_offset b10, -40
+; CHECK-NEXT: .cfi_offset b11, -48
+; CHECK-NEXT: .cfi_offset b12, -56
+; CHECK-NEXT: .cfi_offset b13, -64
+; CHECK-NEXT: .cfi_offset b14, -72
+; CHECK-NEXT: .cfi_offset b15, -80
+; CHECK-NEXT: .cfi_offset vg, -8
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: bl callee
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore b8
+; CHECK-NEXT: .cfi_restore b9
+; CHECK-NEXT: .cfi_restore b10
+; CHECK-NEXT: .cfi_restore b11
+; CHECK-NEXT: .cfi_restore b12
+; CHECK-NEXT: .cfi_restore b13
+; CHECK-NEXT: .cfi_restore b14
+; CHECK-NEXT: .cfi_restore b15
+; CHECK-NEXT: ret
+;
+; FP-CHECK-LABEL: vg_unwind_noasync:
+; FP-CHECK: // %bb.0:
+; FP-CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 96
+; FP-CHECK-NEXT: cntd x9
+; FP-CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; FP-CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; FP-CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; FP-CHECK-NEXT: add x29, sp, #64
+; FP-CHECK-NEXT: .cfi_def_cfa w29, 32
+; FP-CHECK-NEXT: .cfi_offset w30, -24
+; FP-CHECK-NEXT: .cfi_offset w29, -32
+; FP-CHECK-NEXT: .cfi_offset b8, -40
+; FP-CHECK-NEXT: .cfi_offset b9, -48
+; FP-CHECK-NEXT: .cfi_offset b10, -56
+; FP-CHECK-NEXT: .cfi_offset b11, -64
+; FP-CHECK-NEXT: .cfi_offset b12, -72
+; FP-CHECK-NEXT: .cfi_offset b13, -80
+; FP-CHECK-NEXT: .cfi_offset b14, -88
+; FP-CHECK-NEXT: .cfi_offset b15, -96
+; FP-CHECK-NEXT: .cfi_offset vg, -16
+; FP-CHECK-NEXT: smstop sm
+; FP-CHECK-NEXT: bl callee
+; FP-CHECK-NEXT: smstart sm
+; FP-CHECK-NEXT: .cfi_restore vg
+; FP-CHECK-NEXT: .cfi_def_cfa wsp, 96
+; FP-CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; FP-CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
+; FP-CHECK-NEXT: .cfi_def_cfa_offset 0
+; FP-CHECK-NEXT: .cfi_restore w30
+; FP-CHECK-NEXT: .cfi_restore w29
+; FP-CHECK-NEXT: .cfi_restore b8
+; FP-CHECK-NEXT: .cfi_restore b9
+; FP-CHECK-NEXT: .cfi_restore b10
+; FP-CHECK-NEXT: .cfi_restore b11
+; FP-CHECK-NEXT: .cfi_restore b12
+; FP-CHECK-NEXT: .cfi_restore b13
+; FP-CHECK-NEXT: .cfi_restore b14
+; FP-CHECK-NEXT: .cfi_restore b15
+; FP-CHECK-NEXT: ret
+ call void @callee();
+ ret void;
+}
+
attributes #0 = { "aarch64_pstate_sm_enabled" uwtable(async) }
attributes #1 = { "probe-stack"="inline-asm" "aarch64_pstate_sm_enabled" uwtable(async) }
attributes #3 = { "aarch64_pstate_sm_body" uwtable(async) }
attributes #4 = { "aarch64_pstate_sm_compatible" uwtable(async) }
+attributes #5 = { "aarch64_pstate_sm_enabled" }
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index 106d6190e88b9..9c784e1f18f82 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -24,12 +24,12 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -73,12 +73,12 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
-; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #2 // =0x2
@@ -123,12 +123,12 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -174,11 +174,11 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state
; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1
@@ -224,13 +224,13 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: mov x2, x0
; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: cntd x9
; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT: mov x2, x0
; CHECK-NEXT: smstart sm
; CHECK-NEXT: adrp x0, :got:dst
; CHECK-NEXT: adrp x1, :got:src
@@ -250,13 +250,13 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: rdsvl x9, #1
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: lsr x9, x9, #3
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: smstart sm
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
>From 2bd09fe3771cc518c26e632ae6e5bf600ffe19ae Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Mon, 10 Jun 2024 12:02:56 +0000
Subject: [PATCH 12/13] - Fixed misplaced brackets in
needsAsyncDwarfUnwindInfo()
---
.../AArch64/AArch64MachineFunctionInfo.cpp | 6 ++---
.../CodeGen/AArch64/sme-lazy-save-call.ll | 2 +-
...ing-body-streaming-compatible-interface.ll | 2 +-
.../streaming-compatible-memory-ops.ll | 22 +++++++++----------
4 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index f66ce4c32902b..957d7bc79b187 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -201,9 +201,9 @@ bool AArch64FunctionInfo::needsAsyncDwarfUnwindInfo(
// (yet) for homogeneous epilogues, outlined functions, and functions
// outlined from.
NeedsAsyncDwarfUnwindInfo =
- (needsDwarfUnwindInfo(MF) && F.getUWTableKind() == UWTableKind::Async &&
- !F.hasMinSize()) ||
- AFI->hasStreamingModeChanges();
+ needsDwarfUnwindInfo(MF) &&
+ ((F.getUWTableKind() == UWTableKind::Async && !F.hasMinSize()) ||
+ AFI->hasStreamingModeChanges());
}
return *NeedsAsyncDwarfUnwindInfo;
}
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index a6542ff8c7691..b0d6e046042e6 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -133,9 +133,9 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill
-; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
index 3ec280e946601..6c8aff585808f 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-body-streaming-compatible-interface.ll
@@ -80,13 +80,13 @@ define void @streaming_body_and_streaming_compatible_interface_multi_basic_block
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: cntd x9
; CHECK-NEXT: stp x9, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbnz w19, #0, .LBB2_2
diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
index 9c784e1f18f82..106d6190e88b9 100644
--- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
+++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll
@@ -24,12 +24,12 @@ define void @se_memcpy(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -73,12 +73,12 @@ define void @se_memset(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES: // %bb.0: // %entry
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
+; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
-; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
; CHECK-NO-SME-ROUTINES-NEXT: mov w1, #2 // =0x2
@@ -123,12 +123,12 @@ define void @se_memmove(i64 noundef %n) "aarch64_pstate_sm_enabled" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-80]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
+; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
+; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst]
; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src]
; CHECK-NO-SME-ROUTINES-NEXT: smstop sm
@@ -174,11 +174,11 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state
; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1
@@ -224,13 +224,13 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: rdsvl x9, #1
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x2, x0
; CHECK-NEXT: lsr x9, x9, #3
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: cntd x9
; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NEXT: mov x2, x0
; CHECK-NEXT: smstart sm
; CHECK-NEXT: adrp x0, :got:dst
; CHECK-NEXT: adrp x1, :got:src
@@ -250,13 +250,13 @@ define void @sb_memcpy(i64 noundef %n) "aarch64_pstate_sm_body" nounwind {
; CHECK-NO-SME-ROUTINES-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: rdsvl x9, #1
; CHECK-NO-SME-ROUTINES-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: lsr x9, x9, #3
; CHECK-NO-SME-ROUTINES-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill
; CHECK-NO-SME-ROUTINES-NEXT: cntd x9
; CHECK-NO-SME-ROUTINES-NEXT: str x9, [sp, #80] // 8-byte Folded Spill
-; CHECK-NO-SME-ROUTINES-NEXT: mov x2, x0
; CHECK-NO-SME-ROUTINES-NEXT: smstart sm
; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst
; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src
>From d6a7fb1f3bf10423dd6dff78280658482892dd1d Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin <kerry.mclaughlin at arm.com>
Date: Tue, 11 Jun 2024 10:13:17 +0000
Subject: [PATCH 13/13] - Disable outlining from functions which require
streaming-mode changes.
---
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 7 +++++
llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll | 31 ++++++++++++++++++++
2 files changed, 38 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 7d540efe2b41e..e79d4422e255e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -8700,6 +8700,13 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
if (!AFI || AFI->hasRedZone().value_or(true))
return false;
+ // FIXME: Determine whether it is safe to outline from functions which contain
+ // streaming-mode changes. We may need to ensure that any smstart/smstop pairs
+ // are outlined together, and that outlining is safe with the async unwind
+ // info required for saving & restoring VG around calls.
+ if (AFI->hasStreamingModeChanges())
+ return false;
+
// FIXME: Teach the outliner to generate/handle Windows unwind info.
if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
return false;
diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
index 5e241dbdc32ad..6264ce0cf4ae6 100644
--- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
+++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll
@@ -1,6 +1,7 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=FP-CHECK
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -frame-pointer=non-leaf -verify-machineinstrs < %s | FileCheck %s --check-prefix=NO-SVE-CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+sme2 -verify-machineinstrs -enable-machine-outliner < %s | FileCheck %s --check-prefix=OUTLINER-CHECK
declare void @callee();
declare void @fixed_callee(<4 x i32>);
@@ -97,6 +98,9 @@ define void @vg_unwind_simple() #0 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_simple:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
;
call void @callee();
ret void;
@@ -202,6 +206,9 @@ define void @vg_unwind_needs_gap() #0 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_needs_gap:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
;
call void asm sideeffect "", "~{x20}"()
call void @callee();
@@ -302,6 +309,9 @@ define void @vg_unwind_with_fixed_args(<4 x i32> %x) #0 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_with_fixed_args:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
;
call void @fixed_callee(<4 x i32> %x);
ret void;
@@ -493,6 +503,9 @@ define void @vg_unwind_with_sve_args(<vscale x 2 x i64> %x) #0 {
; FP-CHECK-NEXT: .cfi_restore w30
; FP-CHECK-NEXT: .cfi_restore w29
; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_unwind_with_sve_args:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
;
call void asm sideeffect "", "~{x28}"()
call void @scalable_callee(<vscale x 2 x i64> %x);
@@ -620,6 +633,9 @@ define void @vg_unwind_multiple_scratch_regs(ptr %out) #1 {
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
;
+; OUTLINER-CHECK-LABEL: vg_unwind_multiple_scratch_regs:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
entry:
%v = alloca i8, i64 327680, align 1
store ptr %v, ptr %out, align 8
@@ -740,6 +756,9 @@ define void @vg_locally_streaming_fn() #3 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: vg_locally_streaming_fn:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
;
call void @callee()
call void @streaming_callee()
@@ -856,6 +875,9 @@ define void @streaming_compatible_to_streaming() #4 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_to_streaming:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
;
call void @streaming_callee()
ret void
@@ -970,6 +992,9 @@ define void @streaming_compatible_to_non_streaming() #4 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_to_non_streaming:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
;
call void @callee()
ret void
@@ -1041,6 +1066,9 @@ define void @streaming_compatible_no_sve(i32 noundef %x) #4 {
; NO-SVE-CHECK-NEXT: .cfi_restore b14
; NO-SVE-CHECK-NEXT: .cfi_restore b15
; NO-SVE-CHECK-NEXT: ret
+;
+; OUTLINER-CHECK-LABEL: streaming_compatible_no_sve:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
;
call void @streaming_callee_with_arg(i32 %x)
ret void
@@ -1135,6 +1163,9 @@ define void @vg_unwind_noasync() #5 {
; FP-CHECK-NEXT: .cfi_restore b14
; FP-CHECK-NEXT: .cfi_restore b15
; FP-CHECK-NEXT: ret
+; OUTLINER-CHECK-LABEL: vg_unwind_noasync:
+; OUTLINER-CHECK-NOT: OUTLINED_FUNCTION_
+;
call void @callee();
ret void;
}
More information about the cfe-commits mailing list