[llvm] [AArch64][SVE] Avoid extra pop of "FixedObject" with FPAfterSVECalleeSaves (PR #156452)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 9 02:48:20 PDT 2025


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/156452

>From 414b3ea9014192d12f931c773b270f9ba82167cf Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 2 Sep 2025 08:28:45 +0000
Subject: [PATCH 1/3] [AArch64][SVE] Avoid extra pop of "FixedObject" with
 FPAfterSVECalleeSaves

Previously, we would pop `FixedObject` bytes after deallocating the SVE
area, and then pop them again as part of `AfterCSRPopSize`. This could
be seen in the tests `@f6` and `@f9`.

This patch removes the erroneous extra pop and refactors
`FPAfterSVECalleeSaves` to reuse more of the existing GPR deallocation
logic, which allows the final callee-save reloads to be folded into
post-indexed loads.
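
As a reference, here is a minimal, hypothetical C++ sketch (not the
LLVM code itself) of the intended pop-size accounting. The names mirror
`PrologueSaveSize`, `FixedObject`, and `AfterCSRPopSize` from the patch;
the key invariant is that `FixedObject` is popped exactly once, as part
of the final SP adjustment rather than with the pop that exposes the
SVE save area:

  // Hypothetical, simplified model of the epilogue pop accounting with
  // CalleeSavesAboveFrameRecord; this is not the actual implementation.
  #include <cstdint>

  struct EpilogueSizes {
    int64_t ProloguePopSize;  // bytes popped to reach the SVE save area
    int64_t AfterCSRPopSize;  // bytes popped after all CSR restores
  };

  EpilogueSizes computePopSizes(int64_t PrologueSaveSize,
                                int64_t FixedObject,
                                int64_t ArgumentStackToRestore,
                                bool CalleeSavesAboveFrameRecord) {
    int64_t ProloguePopSize = PrologueSaveSize;
    int64_t AfterCSRPopSize = ArgumentStackToRestore;
    if (CalleeSavesAboveFrameRecord) {
      // The fixed-object area lives above the SVE area, so it must not
      // be included in the pop that exposes the SVE callee saves; it is
      // popped once, as part of the final SP adjustment.
      ProloguePopSize -= FixedObject;
      AfterCSRPopSize += FixedObject;
    }
    return {ProloguePopSize, AfterCSRPopSize};
  }

With this accounting, the double pop previously visible in `@f6` and
`@f9` disappears, because `FixedObject` only contributes to the final
`AfterCSRPopSize` adjustment.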
---
 .../AArch64/AArch64PrologueEpilogue.cpp       | 48 +++++----
 .../CodeGen/AArch64/framelayout-sve-win.mir   | 30 ++----
 llvm/test/CodeGen/AArch64/win-sve.ll          | 98 ++++++-------------
 3 files changed, 69 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index f110558fcb0d4..193c5b3cf9e04 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -1360,14 +1360,24 @@ void AArch64EpilogueEmitter::emitEpilogue() {
   }
 
   bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes);
-  // Assume we can't combine the last pop with the sp restore.
-  bool CombineAfterCSRBump = false;
+
+  unsigned ProloguePopSize = PrologueSaveSize;
   if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+    // With CalleeSavesAboveFrameRecord ProloguePopSize is the amount of stack
+    // that needs to be popped until we reach the start of the SVE save area.
+    // The "FixedObject" stack occurs after the SVE area and must be popped
+    // later.
+    ProloguePopSize -= FixedObject;
     AfterCSRPopSize += FixedObject;
-  } else if (!CombineSPBump && PrologueSaveSize != 0) {
+  }
+
+  // Assume we can't combine the last pop with the sp restore.
+  if (!CombineSPBump && ProloguePopSize != 0) {
     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
     while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
-           AArch64InstrInfo::isSEHInstruction(*Pop))
+           AArch64InstrInfo::isSEHInstruction(*Pop) ||
+           (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord &&
+            isPartOfSVECalleeSaves(Pop)))
       Pop = std::prev(Pop);
     // Converting the last ldp to a post-index ldp is valid only if the last
     // ldp's offset is 0.
@@ -1377,18 +1387,24 @@ void AArch64EpilogueEmitter::emitEpilogue() {
     // may clobber), convert it to a post-index ldp.
     if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
       convertCalleeSaveRestoreToSPPrePostIncDec(
-          Pop, DL, PrologueSaveSize, EmitCFI, MachineInstr::FrameDestroy,
-          PrologueSaveSize);
+          Pop, DL, ProloguePopSize, EmitCFI, MachineInstr::FrameDestroy,
+          ProloguePopSize);
+    } else if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+      // If not, and CalleeSavesAboveFrameRecord is enabled, deallocate
+      // callee-save non-SVE registers to move the stack pointer to the start of
+      // the SVE area.
+      emitFrameOffset(MBB, std::next(Pop), DL, AArch64::SP, AArch64::SP,
+                      StackOffset::getFixed(ProloguePopSize), TII,
+                      MachineInstr::FrameDestroy, false, NeedsWinCFI,
+                      &HasWinCFI);
     } else {
-      // If not, make sure to emit an add after the last ldp.
+      // Otherwise, make sure to emit an add after the last ldp.
       // We're doing this by transferring the size to be restored from the
       // adjustment *before* the CSR pops to the adjustment *after* the CSR
       // pops.
-      AfterCSRPopSize += PrologueSaveSize;
-      CombineAfterCSRBump = true;
+      AfterCSRPopSize += ProloguePopSize;
     }
   }
-
   // Move past the restores of the callee-saved registers.
   // If we plan on combining the sp bump of the local stack size and the callee
   // save stack size, we might need to adjust the CSR save and restore offsets.
@@ -1467,16 +1483,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
                       NeedsWinCFI, &HasWinCFI);
     }
 
-    // Deallocate callee-save non-SVE registers.
-    emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
-                    StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
-                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
-    // Deallocate fixed objects.
-    emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
-                    StackOffset::getFixed(FixedObject), TII,
-                    MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
     // Deallocate callee-save SVE registers.
     emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
                     SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
@@ -1619,7 +1625,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
         MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
         StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
         false, NeedsWinCFI, &HasWinCFI, EmitCFI,
-        StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
+        StackOffset::getFixed(AfterCSRPopSize - ArgumentStackToRestore));
   }
 }
 
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
index 5933c5daa67ed..b8302e64f282d 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
@@ -380,10 +380,8 @@ body:             |
     ; CHECK-NEXT: frame-destroy SEH_EpilogStart
     ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
     ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
-    ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1)
-    ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
-    ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
-    ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+    ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1)
+    ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
     ; CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 0 :: (load (s16) from %stack.4)
     ; CHECK-NEXT: frame-destroy SEH_SavePReg 4, 0
     ; CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 1 :: (load (s16) from %stack.3)
@@ -430,10 +428,8 @@ body:             |
     ; CHECK-NEXT: frame-destroy SEH_EpilogStart
     ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
     ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
-    ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1)
-    ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
-    ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
-    ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+    ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1)
+    ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
     ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4)
     ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 0
     ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3)
@@ -557,10 +553,8 @@ body:             |
     ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
     ; CHECK-NEXT: $x21, $lr = frame-destroy LDPXi $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
     ; CHECK-NEXT: frame-destroy SEH_SaveRegP 21, 30, 16
-    ; CHECK-NEXT: $x19, $x20 = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.4), (load (s64) from %stack.5)
-    ; CHECK-NEXT: frame-destroy SEH_SaveRegP 19, 20, 0
-    ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
-    ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
+    ; CHECK-NEXT: early-clobber $sp, $x19, $x20 = frame-destroy LDPXpost $sp, 4 :: (load (s64) from %stack.4), (load (s64) from %stack.5)
+    ; CHECK-NEXT: frame-destroy SEH_SaveRegP_X 19, 20, -32
     ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.21)
     ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2
     ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.20)
@@ -745,10 +739,8 @@ body:             |
     ; CHECK-NEXT: frame-destroy SEH_EpilogStart
     ; CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
     ; CHECK-NEXT: frame-destroy SEH_SetFP
-    ; CHECK-NEXT: $fp, $lr = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
-    ; CHECK-NEXT: frame-destroy SEH_SaveFPLR 0
-    ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
-    ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+    ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
+    ; CHECK-NEXT: frame-destroy SEH_SaveFPLR_X -16
     ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19)
     ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2
     ; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18)
@@ -869,10 +861,8 @@ body:             |
     ; CHECK-NEXT: frame-destroy SEH_EpilogStart
     ; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 7, implicit $vg
     ; CHECK-NEXT: frame-destroy SEH_AllocZ 7
-    ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.6)
-    ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
-    ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
-    ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+    ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.6)
+    ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
     ; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.8)
     ; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 1
     ; CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.7)
diff --git a/llvm/test/CodeGen/AArch64/win-sve.ll b/llvm/test/CodeGen/AArch64/win-sve.ll
index 53ac9344175a3..8446de1a59cf9 100644
--- a/llvm/test/CodeGen/AArch64/win-sve.ll
+++ b/llvm/test/CodeGen/AArch64/win-sve.ll
@@ -75,10 +75,8 @@ define i32 @f(<vscale x 2 x i64> %x) {
 ; CHECK-NEXT:    .seh_startepilogue
 ; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x30, 8
-; CHECK-NEXT:    ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x28, 0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x28, 16
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 2
 ; CHECK-NEXT:    ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -234,10 +232,8 @@ define void @f2(i64 %n, <vscale x 2 x i64> %x) {
 ; CHECK-NEXT:    .seh_save_fplr 16
 ; CHECK-NEXT:    ldr x28, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x28, 8
-; CHECK-NEXT:    ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x19, 0
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x19, 32
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 2
 ; CHECK-NEXT:    ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -384,10 +380,8 @@ define void @f3(i64 %n, <vscale x 2 x i64> %x) {
 ; CHECK-NEXT:    .seh_stackalloc 16
 ; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x30, 8
-; CHECK-NEXT:    ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x28, 0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x28, 16
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 2
 ; CHECK-NEXT:    ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -538,10 +532,8 @@ define void @f4(i64 %n, <vscale x 2 x i64> %x) {
 ; CHECK-NEXT:    .seh_stackalloc 16
 ; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x30, 8
-; CHECK-NEXT:    ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x28, 0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x28, 16
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 2
 ; CHECK-NEXT:    ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -702,10 +694,8 @@ define void @f5(i64 %n, <vscale x 2 x i64> %x) {
 ; CHECK-NEXT:    .seh_save_fplr 16
 ; CHECK-NEXT:    ldr x28, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x28, 8
-; CHECK-NEXT:    ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x19, 0
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x19, 32
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 2
 ; CHECK-NEXT:    ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -860,10 +850,10 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
 ; CHECK-NEXT:    stur x0, [x8, #16]
 ; CHECK-NEXT:    addvl x8, x29, #18
 ; CHECK-NEXT:    ldr x1, [x8, #32]
-; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:  .Ltmp0: // EH_LABEL
 ; CHECK-NEXT:    add x0, x19, #0
 ; CHECK-NEXT:    bl g6
-; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:  .Ltmp1: // EH_LABEL
 ; CHECK-NEXT:  // %bb.1: // %invoke.cont
 ; CHECK-NEXT:    .seh_startepilogue
 ; CHECK-NEXT:    add sp, sp, #64
@@ -872,10 +862,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
 ; CHECK-NEXT:    .seh_save_fplr 16
 ; CHECK-NEXT:    ldr x28, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x28, 8
-; CHECK-NEXT:    ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x19, 0
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x19, 32
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 2
 ; CHECK-NEXT:    ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -932,8 +920,6 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
 ; CHECK-NEXT:    .seh_save_preg p14, 10
 ; CHECK-NEXT:    ldr p15, [sp, #11, mul vl] // 2-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_preg p15, 11
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
 ; CHECK-NEXT:    addvl sp, sp, #18
 ; CHECK-NEXT:    .seh_allocz 18
 ; CHECK-NEXT:    add sp, sp, #16
@@ -1024,10 +1010,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
 ; CHECK-NEXT:    .seh_save_fplr 16
 ; CHECK-NEXT:    ldr x28, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x28, 8
-; CHECK-NEXT:    ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x19, 0
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x19, 32
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 2
 ; CHECK-NEXT:    ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1144,10 +1128,8 @@ define void @f8(<vscale x 2 x i64> %v) {
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    .seh_startepilogue
-; CHECK-NEXT:    ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x30, 0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x30, 16
 ; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 0
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -1196,14 +1178,10 @@ define void @f9(<vscale x 2 x i64> %v, ...) {
 ; CHECK-NEXT:    //APP
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    .seh_startepilogue
-; CHECK-NEXT:    ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x30, 0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x30, 16
 ; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 0
-; CHECK-NEXT:    add sp, sp, #64
-; CHECK-NEXT:    .seh_stackalloc 64
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    .seh_allocz 1
 ; CHECK-NEXT:    add sp, sp, #64
@@ -1301,10 +1279,8 @@ define void @f10(i64 %n, <vscale x 2 x i64> %x) "frame-pointer"="all" {
 ; CHECK-NEXT:    .seh_stackalloc 16
 ; CHECK-NEXT:    ldp x29, x30, [sp, #8] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_fplr 8
-; CHECK-NEXT:    ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x28, 0
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x28, 32
 ; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 2
 ; CHECK-NEXT:    ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1390,10 +1366,8 @@ define i32 @f11(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible
 ; CHECK-NEXT:    //NO_APP
 ; CHECK-NEXT:    str d0, [sp, #8]
 ; CHECK-NEXT:    .seh_startepilogue
-; CHECK-NEXT:    ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x30, 0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x30, 16
 ; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 0
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -1431,10 +1405,8 @@ define i32 @f12(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible
 ; CHECK-NEXT:    .seh_startepilogue
 ; CHECK-NEXT:    addvl sp, sp, #1
 ; CHECK-NEXT:    .seh_allocz 1
-; CHECK-NEXT:    ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x30, 0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x30, 16
 ; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 0
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -1475,10 +1447,8 @@ define i32 @f13(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" {
 ; CHECK-NEXT:    .seh_startepilogue
 ; CHECK-NEXT:    ldp x29, x30, [sp, #8] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_fplr 8
-; CHECK-NEXT:    ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x28, 0
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x28, 32
 ; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 0
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -1521,10 +1491,8 @@ define i32 @f14(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" {
 ; CHECK-NEXT:    .seh_allocz 1
 ; CHECK-NEXT:    ldp x29, x30, [sp, #8] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_fplr 8
-; CHECK-NEXT:    ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x28, 0
-; CHECK-NEXT:    add sp, sp, #32
-; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x28, 32
 ; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 0
 ; CHECK-NEXT:    addvl sp, sp, #1
@@ -1572,10 +1540,8 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) {
 ; CHECK-NEXT:    .seh_stackalloc 16
 ; CHECK-NEXT:    ldr x30, [sp, #8] // 8-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_reg x30, 8
-; CHECK-NEXT:    ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT:    .seh_save_reg x28, 0
-; CHECK-NEXT:    add sp, sp, #16
-; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .seh_save_reg_x x28, 16
 ; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    .seh_save_zreg z8, 0
 ; CHECK-NEXT:    addvl sp, sp, #1

>From 7e05ecdf5fdc73cee84b9d8f9a05170026ac2ade Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 2 Sep 2025 13:44:58 +0000
Subject: [PATCH 2/3] Update codeview test

---
 llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
index 446a84dc0294c..ffdc80a350a24 100644
--- a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
+++ b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
@@ -101,7 +101,7 @@
 ; CHECK-NEXT:      LocalVariableAddrRange {
 ; CHECK-NEXT:        OffsetStart: .text+0x0
 ; CHECK-NEXT:        ISectStart: 0x0
-; CHECK-NEXT:        Range: 0xBC
+; CHECK-NEXT:        Range: 0xB8
 ; CHECK-NEXT:      }
 ; CHECK-NEXT:    }
 ; CHECK-NEXT:    ProcEnd {

>From a1b6990d3df0f7625fa0a90a6b754d946bcc570f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Thu, 25 Sep 2025 15:16:06 +0000
Subject: [PATCH 3/3] Add swift async test

---
 .../AArch64/AArch64PrologueEpilogue.cpp       | 19 ++++---
 llvm/test/CodeGen/AArch64/win-sve.ll          | 50 +++++++++++++++++++
 2 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index 193c5b3cf9e04..a9ac852c4c98a 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -1435,6 +1435,17 @@ void AArch64EpilogueEmitter::emitEpilogue() {
     --SEHEpilogueStartI;
   }
 
+  // Determine the ranges of SVE callee-saves. This is done before emitting any
+  // code at the end of the epilogue (for Swift async), which can get in the way
+  // of finding SVE callee-saves with CalleeSavesAboveFrameRecord.
+  auto [PPR, ZPR] = getSVEStackFrameSizes();
+  auto [PPRRange, ZPRRange] = partitionSVECS(
+      MBB,
+      SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
+          ? MBB.getFirstTerminator()
+          : FirstGPRRestoreI,
+      PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
+
   if (HasFP && AFI->hasSwiftAsyncContext())
     emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
 
@@ -1457,14 +1468,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
   NumBytes -= PrologueSaveSize;
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
-  auto [PPR, ZPR] = getSVEStackFrameSizes();
-  auto [PPRRange, ZPRRange] = partitionSVECS(
-      MBB,
-      SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
-          ? MBB.getFirstTerminator()
-          : FirstGPRRestoreI,
-      PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
-
   StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
   StackOffset SVEStackSize =
       SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
diff --git a/llvm/test/CodeGen/AArch64/win-sve.ll b/llvm/test/CodeGen/AArch64/win-sve.ll
index 8446de1a59cf9..8997ba10fd10c 100644
--- a/llvm/test/CodeGen/AArch64/win-sve.ll
+++ b/llvm/test/CodeGen/AArch64/win-sve.ll
@@ -1560,3 +1560,53 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) {
   store i32 %i, ptr %a
   ret void
 }
+
+declare ptr @llvm.swift.async.context.addr()
+
+define void @f16(ptr swiftasync %ctx, <vscale x 2 x i64> %foo) {
+; CHECK-LABEL: f16:
+; CHECK:       .seh_proc f16
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    orr x29, x29, #0x1000000000000000
+; CHECK-NEXT:    .seh_nop
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .seh_allocz 1
+; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    .seh_save_zreg z8, 0
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    stp x29, x30, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT:    .seh_save_fplr 8
+; CHECK-NEXT:    str x22, [sp]
+; CHECK-NEXT:    .seh_nop
+; CHECK-NEXT:    add x29, sp, #8
+; CHECK-NEXT:    .seh_add_fp 8
+; CHECK-NEXT:    .seh_endprologue
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr x8, [x22]
+; CHECK-NEXT:    stur x8, [x29, #-8]
+; CHECK-NEXT:    .seh_startepilogue
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    .seh_stackalloc 16
+; CHECK-NEXT:    ldp x29, x30, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
+; CHECK-NEXT:    .seh_stackalloc 32
+; CHECK-NEXT:    .seh_save_fplr 8
+; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    .seh_save_zreg z8, 0
+; CHECK-NEXT:    and x29, x29, #0xefffffffffffffff
+; CHECK-NEXT:    .seh_nop
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .seh_allocz 1
+; CHECK-NEXT:    .seh_endepilogue
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .seh_endfunclet
+; CHECK-NEXT:    .seh_endproc
+  tail call void asm sideeffect "", "~{z8}"()
+  %1 = load ptr, ptr %ctx, align 8
+  %2 = tail call ptr @llvm.swift.async.context.addr()
+  store ptr %1, ptr %2, align 8
+  ret void
+}


