[llvm] [AArch64][SVE] Avoid extra pop of "FixedObject" with FPAfterSVECalleeSaves (PR #156452)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 9 02:48:20 PDT 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/156452
>From 414b3ea9014192d12f931c773b270f9ba82167cf Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 2 Sep 2025 08:28:45 +0000
Subject: [PATCH 1/3] [AArch64][SVE] Avoid extra pop of "FixedObject" with
FPAfterSVECalleeSaves
Previously, we would pop `FixedObject`-bytes after deallocating the SVE
area, then again as part of the "AfterCSRPopSize". This could be seen
in the tests `@f6` and `@f9`.
This patch removes the erroneous pop, and refactors `FPAfterSVECalleeSaves`
to reuse more of the existing GPR deallocation logic, which allows for
post-decrements.
---
.../AArch64/AArch64PrologueEpilogue.cpp | 48 +++++----
.../CodeGen/AArch64/framelayout-sve-win.mir | 30 ++----
llvm/test/CodeGen/AArch64/win-sve.ll | 98 ++++++-------------
3 files changed, 69 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index f110558fcb0d4..193c5b3cf9e04 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -1360,14 +1360,24 @@ void AArch64EpilogueEmitter::emitEpilogue() {
}
bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes);
- // Assume we can't combine the last pop with the sp restore.
- bool CombineAfterCSRBump = false;
+
+ unsigned ProloguePopSize = PrologueSaveSize;
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ // With CalleeSavesAboveFrameRecord ProloguePopSize is the amount of stack
+ // that needs to be popped until we reach the start of the SVE save area.
+ // The "FixedObject" stack occurs after the SVE area and must be popped
+ // later.
+ ProloguePopSize -= FixedObject;
AfterCSRPopSize += FixedObject;
- } else if (!CombineSPBump && PrologueSaveSize != 0) {
+ }
+
+ // Assume we can't combine the last pop with the sp restore.
+ if (!CombineSPBump && ProloguePopSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
- AArch64InstrInfo::isSEHInstruction(*Pop))
+ AArch64InstrInfo::isSEHInstruction(*Pop) ||
+ (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord &&
+ isPartOfSVECalleeSaves(Pop)))
Pop = std::prev(Pop);
// Converting the last ldp to a post-index ldp is valid only if the last
// ldp's offset is 0.
@@ -1377,18 +1387,24 @@ void AArch64EpilogueEmitter::emitEpilogue() {
// may clobber), convert it to a post-index ldp.
if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
convertCalleeSaveRestoreToSPPrePostIncDec(
- Pop, DL, PrologueSaveSize, EmitCFI, MachineInstr::FrameDestroy,
- PrologueSaveSize);
+ Pop, DL, ProloguePopSize, EmitCFI, MachineInstr::FrameDestroy,
+ ProloguePopSize);
+ } else if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ // If not, and CalleeSavesAboveFrameRecord is enabled, deallocate
+ // callee-save non-SVE registers to move the stack pointer to the start of
+ // the SVE area.
+ emitFrameOffset(MBB, std::next(Pop), DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(ProloguePopSize), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI);
} else {
- // If not, make sure to emit an add after the last ldp.
+ // Otherwise, make sure to emit an add after the last ldp.
// We're doing this by transferring the size to be restored from the
// adjustment *before* the CSR pops to the adjustment *after* the CSR
// pops.
- AfterCSRPopSize += PrologueSaveSize;
- CombineAfterCSRBump = true;
+ AfterCSRPopSize += ProloguePopSize;
}
}
-
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
@@ -1467,16 +1483,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NeedsWinCFI, &HasWinCFI);
}
- // Deallocate callee-save non-SVE registers.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
- // Deallocate fixed objects.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(FixedObject), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
// Deallocate callee-save SVE registers.
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
@@ -1619,7 +1625,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
false, NeedsWinCFI, &HasWinCFI, EmitCFI,
- StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
+ StackOffset::getFixed(AfterCSRPopSize - ArgumentStackToRestore));
}
}
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
index 5933c5daa67ed..b8302e64f282d 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
@@ -380,10 +380,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 0 :: (load (s16) from %stack.4)
; CHECK-NEXT: frame-destroy SEH_SavePReg 4, 0
; CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 1 :: (load (s16) from %stack.3)
@@ -430,10 +428,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 0
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3)
@@ -557,10 +553,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
; CHECK-NEXT: $x21, $lr = frame-destroy LDPXi $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
; CHECK-NEXT: frame-destroy SEH_SaveRegP 21, 30, 16
- ; CHECK-NEXT: $x19, $x20 = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.4), (load (s64) from %stack.5)
- ; CHECK-NEXT: frame-destroy SEH_SaveRegP 19, 20, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
+ ; CHECK-NEXT: early-clobber $sp, $x19, $x20 = frame-destroy LDPXpost $sp, 4 :: (load (s64) from %stack.4), (load (s64) from %stack.5)
+ ; CHECK-NEXT: frame-destroy SEH_SaveRegP_X 19, 20, -32
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.21)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.20)
@@ -745,10 +739,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
; CHECK-NEXT: frame-destroy SEH_SetFP
- ; CHECK-NEXT: $fp, $lr = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
- ; CHECK-NEXT: frame-destroy SEH_SaveFPLR 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
+ ; CHECK-NEXT: frame-destroy SEH_SaveFPLR_X -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18)
@@ -869,10 +861,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 7, implicit $vg
; CHECK-NEXT: frame-destroy SEH_AllocZ 7
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.6)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.6)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.8)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 1
; CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.7)
diff --git a/llvm/test/CodeGen/AArch64/win-sve.ll b/llvm/test/CodeGen/AArch64/win-sve.ll
index 53ac9344175a3..8446de1a59cf9 100644
--- a/llvm/test/CodeGen/AArch64/win-sve.ll
+++ b/llvm/test/CodeGen/AArch64/win-sve.ll
@@ -75,10 +75,8 @@ define i32 @f(<vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -234,10 +232,8 @@ define void @f2(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -384,10 +380,8 @@ define void @f3(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -538,10 +532,8 @@ define void @f4(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -702,10 +694,8 @@ define void @f5(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -860,10 +850,10 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: stur x0, [x8, #16]
; CHECK-NEXT: addvl x8, x29, #18
; CHECK-NEXT: ldr x1, [x8, #32]
-; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .Ltmp0: // EH_LABEL
; CHECK-NEXT: add x0, x19, #0
; CHECK-NEXT: bl g6
-; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .Ltmp1: // EH_LABEL
; CHECK-NEXT: // %bb.1: // %invoke.cont
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: add sp, sp, #64
@@ -872,10 +862,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -932,8 +920,6 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_preg p14, 10
; CHECK-NEXT: ldr p15, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: .seh_save_preg p15, 11
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .seh_allocz 18
; CHECK-NEXT: add sp, sp, #16
@@ -1024,10 +1010,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1144,10 +1128,8 @@ define void @f8(<vscale x 2 x i64> %v) {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1196,14 +1178,10 @@ define void @f9(<vscale x 2 x i64> %v, ...) {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
-; CHECK-NEXT: add sp, sp, #64
-; CHECK-NEXT: .seh_stackalloc 64
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .seh_allocz 1
; CHECK-NEXT: add sp, sp, #64
@@ -1301,10 +1279,8 @@ define void @f10(i64 %n, <vscale x 2 x i64> %x) "frame-pointer"="all" {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1390,10 +1366,8 @@ define i32 @f11(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: str d0, [sp, #8]
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1431,10 +1405,8 @@ define i32 @f12(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .seh_allocz 1
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1475,10 +1447,8 @@ define i32 @f13(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" {
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1521,10 +1491,8 @@ define i32 @f14(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" {
; CHECK-NEXT: .seh_allocz 1
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1572,10 +1540,8 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
>From 7e05ecdf5fdc73cee84b9d8f9a05170026ac2ade Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 2 Sep 2025 13:44:58 +0000
Subject: [PATCH 2/3] Update codeview test
---
llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
index 446a84dc0294c..ffdc80a350a24 100644
--- a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
+++ b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
@@ -101,7 +101,7 @@
; CHECK-NEXT: LocalVariableAddrRange {
; CHECK-NEXT: OffsetStart: .text+0x0
; CHECK-NEXT: ISectStart: 0x0
-; CHECK-NEXT: Range: 0xBC
+; CHECK-NEXT: Range: 0xB8
; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: ProcEnd {
>From a1b6990d3df0f7625fa0a90a6b754d946bcc570f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 25 Sep 2025 15:16:06 +0000
Subject: [PATCH 3/3] Add swift async test
---
.../AArch64/AArch64PrologueEpilogue.cpp | 19 ++++---
llvm/test/CodeGen/AArch64/win-sve.ll | 50 +++++++++++++++++++
2 files changed, 61 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 193c5b3cf9e04..a9ac852c4c98a 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -1435,6 +1435,17 @@ void AArch64EpilogueEmitter::emitEpilogue() {
--SEHEpilogueStartI;
}
+ // Determine the ranges of SVE callee-saves. This is done before emitting any
+ // code at the end of the epilogue (for Swift async), which can get in the way
+ // of finding SVE callee-saves with CalleeSavesAboveFrameRecord.
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ auto [PPRRange, ZPRRange] = partitionSVECS(
+ MBB,
+ SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
+ ? MBB.getFirstTerminator()
+ : FirstGPRRestoreI,
+ PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
+
if (HasFP && AFI->hasSwiftAsyncContext())
emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
@@ -1457,14 +1468,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
- auto [PPR, ZPR] = getSVEStackFrameSizes();
- auto [PPRRange, ZPRRange] = partitionSVECS(
- MBB,
- SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
- ? MBB.getFirstTerminator()
- : FirstGPRRestoreI,
- PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
-
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
StackOffset SVEStackSize =
SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
diff --git a/llvm/test/CodeGen/AArch64/win-sve.ll b/llvm/test/CodeGen/AArch64/win-sve.ll
index 8446de1a59cf9..8997ba10fd10c 100644
--- a/llvm/test/CodeGen/AArch64/win-sve.ll
+++ b/llvm/test/CodeGen/AArch64/win-sve.ll
@@ -1560,3 +1560,53 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) {
store i32 %i, ptr %a
ret void
}
+
+declare ptr @llvm.swift.async.context.addr()
+
+define void @f16(ptr swiftasync %ctx, <vscale x 2 x i64> %foo) {
+; CHECK-LABEL: f16:
+; CHECK: .seh_proc f16
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: orr x29, x29, #0x1000000000000000
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .seh_allocz 1
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_zreg z8, 0
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 8
+; CHECK-NEXT: str x22, [sp]
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: add x29, sp, #8
+; CHECK-NEXT: .seh_add_fp 8
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr x8, [x22]
+; CHECK-NEXT: stur x8, [x29, #-8]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: .seh_save_fplr 8
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_zreg z8, 0
+; CHECK-NEXT: and x29, x29, #0xefffffffffffffff
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .seh_allocz 1
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+ tail call void asm sideeffect "", "~{z8}"()
+ %1 = load ptr, ptr %ctx, align 8
+ %2 = tail call ptr @llvm.swift.async.context.addr()
+ store ptr %1, ptr %2, align 8
+ ret void
+}
More information about the llvm-commits
mailing list