[flang-commits] [compiler-rt] [flang] [llvm] [AArch64] fix trampoline implementation: use X15 (PR #126743)

Tue Feb 11 07:32:25 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-aarch64

Author: Jameson Nash (vtjnash)

<details>
<summary>Changes</summary>

AAPCS64 reserves any of X9-X15 for a compiler to choose to use for this purpose, and says not to use X16 or X18 like GCC (and the previous implementation) chose to use. The X18 register may need to get used by the kernel in some circumstances, as specified by the platform ABI, so it is generally an unwise choice. Simply choosing a different register fixes the problem of this being broken on any platform that actually follows the platform ABI (which is all of them except EABI, if I am reading this linux kernel bug correctly https://lkml2.uits.iu.edu/hypermail/linux/kernel/2001.2/01502.html). As a side benefit, also generate slightly better code and avoids needing the compiler-rt to be present. I did that by following the XCore implementation instead of PPC (although in hindsight, following the RISCV might have been slightly more readable). That X18 is wrong to use for this purpose has been known for many years (e.g. https://www.mail-archive.com/gcc@gcc.gnu.org/msg76934.html) and also known that fixing this to use one of the correct registers is not an ABI break, since this only appears inside of a translation unit. Some of the other temporary registers (e.g. X9) are already reserved inside llvm for internal use as a generic temporary register in the prologue before saving registers, while X15 was already used in rare cases as a scratch register in the prologue as well, so I felt that seemed the most logical choice to choose here.

@ceseo Is there any tests you'd recommend I run to show that this works correctly?

@vchuravy @llvm/issue-subscribers-julialang

---

Patch is 43.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126743.diff


13 Files Affected:

- (modified) compiler-rt/lib/builtins/README.txt (-5) 
- (modified) compiler-rt/lib/builtins/trampoline_setup.c (-42) 
- (modified) compiler-rt/test/builtins/Unit/trampoline_setup_test.c (+1-1) 
- (modified) flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp (+2-2) 
- (modified) llvm/lib/Target/AArch64/AArch64CallingConvention.td (+23-13) 
- (modified) llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (+26) 
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+46-39) 
- (modified) llvm/test/CodeGen/AArch64/nest-register.ll (+10-6) 
- (modified) llvm/test/CodeGen/AArch64/preserve_nonecc_call.ll (+64-52) 
- (modified) llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/trampoline.ll (+245-12) 
- (modified) llvm/test/CodeGen/AArch64/win64cc-x18.ll (+9-18) 
- (modified) llvm/test/CodeGen/AArch64/zero-call-used-regs.ll (+8-8) 


``````````diff

diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt
index 19f26c92a0f94f..2d213d95f333af 100644
--- a/compiler-rt/lib/builtins/README.txt
+++ b/compiler-rt/lib/builtins/README.txt
@@ -272,11 +272,6 @@ switch32
 switch8
 switchu8
 
-// This function generates a custom trampoline function with the specific
-// realFunc and localsPtr values.
-void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
-                        const void* realFunc, void* localsPtr);
-
 // There is no C interface to the *_vfp_d8_d15_regs functions.  There are
 // called in the prolog and epilog of Thumb1 functions.  When the C++ ABI use
 // SJLJ for exceptions, each function with a catch clause or destructors needs
diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c
index 830e25e4c0303a..844eb279441428 100644
--- a/compiler-rt/lib/builtins/trampoline_setup.c
+++ b/compiler-rt/lib/builtins/trampoline_setup.c
@@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
   __clear_cache(trampOnStack, &trampOnStack[10]);
 }
 #endif // __powerpc__ && !defined(__powerpc64__)
-
-// The AArch64 compiler generates calls to __trampoline_setup() when creating
-// trampoline functions on the stack for use with nested functions.
-// This function creates a custom 36-byte trampoline function on the stack
-// which loads x18 with a pointer to the outer function's locals
-// and then jumps to the target nested function.
-// Note: x18 is a reserved platform register on Windows and macOS.
-
-#if defined(__aarch64__) && defined(__ELF__)
-COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
-                                        int trampSizeAllocated,
-                                        const void *realFunc, void *localsPtr) {
-  // This should never happen, but if compiler did not allocate
-  // enough space on stack for the trampoline, abort.
-  if (trampSizeAllocated < 36)
-    compilerrt_abort();
-
-  // create trampoline
-  // Load realFunc into x17. mov/movk 16 bits at a time.
-  trampOnStack[0] =
-      0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
-  trampOnStack[1] =
-      0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
-  trampOnStack[2] =
-      0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
-  trampOnStack[3] =
-      0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
-  // Load localsPtr into x18
-  trampOnStack[4] =
-      0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
-  trampOnStack[5] =
-      0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
-  trampOnStack[6] =
-      0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
-  trampOnStack[7] =
-      0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
-  trampOnStack[8] = 0xd61f0220; // br x17
-
-  // Clear instruction cache.
-  __clear_cache(trampOnStack, &trampOnStack[9]);
-}
-#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64)
diff --git a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
index d51d35acaa02f1..da115fe7642718 100644
--- a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
+++ b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c
@@ -7,7 +7,7 @@
 
 /*
  * Tests nested functions
- * The ppc and aarch64 compilers generates a call to __trampoline_setup
+ * The ppc compiler generates a call to __trampoline_setup
  * The i386 and x86_64 compilers generate a call to ___enable_execute_stack
  */
 
diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
index 26f4aee21d8bda..f402404121da08 100644
--- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
+++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp
@@ -274,10 +274,10 @@ class BoxedProcedurePass
             auto loc = embox.getLoc();
             mlir::Type i8Ty = builder.getI8Type();
             mlir::Type i8Ptr = builder.getRefType(i8Ty);
-            // For AArch64, PPC32 and PPC64, the thunk is populated by a call to
+            // For PPC32 and PPC64, the thunk is populated by a call to
             // __trampoline_setup, which is defined in
             // compiler-rt/lib/builtins/trampoline_setup.c and requires the
-            // thunk size greater than 32 bytes.  For RISCV and x86_64, the
+            // thunk size greater than 32 bytes.  For Aarch64, RISCV and x86_64, the
             // thunk setup doesn't go through __trampoline_setup and fits in 32
             // bytes.
             fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 7cca6d9bc6b9c3..8355463dea94ea 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -28,6 +28,12 @@ class CCIfSubtarget<string F, CCAction A>
 //===----------------------------------------------------------------------===//
 
 defvar AArch64_Common = [
+  // The 'nest' parameter, if any, is passed in X15.
+  // The previous register used here (X18) is also defined to be unavailable
+  // for this purpose, while all of X9-X15 were defined to be free for LLVM to
+  // use for this, so use X15 (which LLVM often already clobbers anyways).
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
@@ -117,16 +123,12 @@ defvar AArch64_Common = [
 ];
 
 let Entry = 1 in
-def CC_AArch64_AAPCS : CallingConv<!listconcat(
-  // The 'nest' parameter, if any, is passed in X18.
-  // Darwin and Windows use X18 as the platform register and hence 'nest' isn't
-  // currently supported there.
-  [CCIfNest<CCAssignToReg<[X18]>>],
-  AArch64_Common
-)>;
+def CC_AArch64_AAPCS : CallingConv<AArch64_Common>;
 
 let Entry = 1 in
 def RetCC_AArch64_AAPCS : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
@@ -177,6 +179,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
 // a stack layout compatible with the x64 calling convention.
 let Entry = 1 in
 def CC_AArch64_Arm64EC_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   // Convert small floating-point values to integer.
   CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -295,6 +299,8 @@ def CC_AArch64_Arm64EC_Thunk_Native : CallingConv<[
 
 let Entry = 1 in
 def RetCC_AArch64_Arm64EC_Thunk : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   // The X86-Win64 calling convention always returns __m64 values in RAX.
   CCIfType<[x86mmx], CCBitConvertToType<i64>>,
 
@@ -353,6 +359,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
 //     + Stack slots are sized as needed rather than being at least 64-bit.
 let Entry = 1 in
 def CC_AArch64_DarwinPCS : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -427,6 +435,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
 
 let Entry = 1 in
 def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -450,6 +460,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
 // same as the normal Darwin VarArgs handling.
 let Entry = 1 in
 def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
 
@@ -494,6 +506,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
 
 let Entry = 1 in
 def CC_AArch64_GHC : CallingConv<[
+  CCIfNest<CCAssignToReg<[X15]>>,
+
   CCIfType<[iPTR], CCBitConvertToType<i64>>,
 
   // Handle all vector types as either f64 or v2f64.
@@ -523,6 +537,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
   // We can pass arguments in all general registers, except:
   // - X8, used for sret
   // - X16/X17, used by the linker as IP0/IP1
+  // - X15, the nest register and used by Windows for stack allocation
   // - X18, the platform register
   // - X19, the base pointer
   // - X29, the frame pointer
@@ -533,6 +548,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
   // normal functions without saving and reloading arguments.
   // X9 is assigned last as it is used in FrameLowering as the first
   // choice for a scratch register.
+  CCIfNest<CCAssignToReg<[X15]>>,
   CCIfType<[i32], CCAssignToReg<[W20, W21, W22, W23,
                                  W24, W25, W26, W27, W28,
                                  W0, W1, W2, W3, W4, W5,
@@ -544,12 +560,6 @@ def CC_AArch64_Preserve_None : CallingConv<[
                                  X6, X7, X10, X11,
                                  X12, X13, X14, X9]>>,
 
-  // Windows uses X15 for stack allocation
-  CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
-    CCIfType<[i32], CCAssignToReg<[W15]>>>,
-  CCIf<"!State.getMachineFunction().getSubtarget<AArch64Subtarget>().isTargetWindows()",
-    CCIfType<[i64], CCAssignToReg<[X15]>>>,
-
   CCDelegateTo<CC_AArch64_AAPCS>
 ]>;
 
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index d3abd79b85a75f..ced3ff7b742ad1 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -2044,6 +2044,25 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           : 0;
 
   if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
+    // Find an available register to store value of VG to.
+    unsigned X15Scratch = AArch64::NoRegister;
+    if (LiveRegs.contains(AArch64::X15)) {
+        // if (llvm::any_of(
+        //         MBB.liveins(),
+        //         [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
+        //           return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
+        //               AArch64::X15, LiveIn.PhysReg);
+        //         }))
+      X15Scratch = findScratchNonCalleeSaveRegister(&MBB);
+      assert(X15Scratch != AArch64::NoRegister);
+      LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::X15, RegState::Undef)
+          .addReg(AArch64::X15, RegState::Implicit)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+
     uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
     if (NeedsWinCFI) {
       HasWinCFI = true;
@@ -2166,6 +2185,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // we've set a frame pointer and already finished the SEH prologue.
       assert(!NeedsWinCFI);
     }
+    if (X15Scratch != AArch64::NoRegister) {
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
+          .addReg(AArch64::XZR)
+          .addReg(X15Scratch, RegState::Undef)
+          .addReg(X15Scratch, RegState::Implicit)
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
   }
 
   StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0d1608a97bfd30..1404077446420d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7290,59 +7290,66 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
 
 SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                       SelectionDAG &DAG) const {
-  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
-  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
-    report_fatal_error(
-        "ADJUST_TRAMPOLINE operation is only supported on Linux.");
-
   return Op.getOperand(0);
 }
 
 SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                     SelectionDAG &DAG) const {
-
-  // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
-  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
-    report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
-
   SDValue Chain = Op.getOperand(0);
-  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue Trmp = Op.getOperand(1); // trampoline, 36 bytes
   SDValue FPtr = Op.getOperand(2); // nested function
   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
-  SDLoc dl(Op);
 
-  EVT PtrVT = getPointerTy(DAG.getDataLayout());
-  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
-
-  TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
 
-  Entry.Ty = IntPtrTy;
-  Entry.Node = Trmp;
-  Args.push_back(Entry);
+  // ldr x15, .+16
+  // ldr x17, .+20
+  // br x17
+  // 0
+  // .nest: .qword nest
+  // .fptr: .qword fptr
+  SDValue OutChains[5];
 
-  if (auto *FI = dyn_cast<FrameIndexSDNode>(Trmp.getNode())) {
-    MachineFunction &MF = DAG.getMachineFunction();
-    MachineFrameInfo &MFI = MF.getFrameInfo();
-    Entry.Node =
-        DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64);
-  } else
-    Entry.Node = DAG.getConstant(36, dl, MVT::i64);
+  const char X15 = 0x0f;
+  const char X17 = 0x11;
 
-  Args.push_back(Entry);
-  Entry.Node = FPtr;
-  Args.push_back(Entry);
-  Entry.Node = Nest;
-  Args.push_back(Entry);
+  SDValue Addr = Trmp;
 
-  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
-  TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
-      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
+  SDLoc dl(Op);
+  OutChains[0] =
+      DAG.getStore(Chain, dl, DAG.getConstant(0x58000080u | X15, dl, MVT::i32), Addr,
+                   MachinePointerInfo(TrmpAddr));
 
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
-  return CallResult.second;
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(4, dl, MVT::i64));
+  OutChains[1] =
+      DAG.getStore(Chain, dl, DAG.getConstant(0x580000b0u | X17, dl, MVT::i32), Addr,
+                   MachinePointerInfo(TrmpAddr, 4));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(8, dl, MVT::i64));
+  OutChains[2] =
+      DAG.getStore(Chain, dl, DAG.getConstant(0xd61f0220u, dl, MVT::i32), Addr,
+                   MachinePointerInfo(TrmpAddr, 8));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(16, dl, MVT::i64));
+  OutChains[3] =
+      DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
+
+  Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(24, dl, MVT::i64));
+  OutChains[4] =
+      DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
+
+  SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+
+  SDValue EndOfTrmp = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                     DAG.getConstant(12, dl, MVT::i64));
+
+  // Call clear cache on the trampoline instructions.
+  return DAG.getNode(ISD::CLEAR_CACHE, dl, MVT::Other, StoreToken,
+                              Trmp, EndOfTrmp);
 }
 
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
diff --git a/llvm/test/CodeGen/AArch64/nest-register.ll b/llvm/test/CodeGen/AArch64/nest-register.ll
index 1e1c1b044bab65..2e94dfba1fa523 100644
--- a/llvm/test/CodeGen/AArch64/nest-register.ll
+++ b/llvm/test/CodeGen/AArch64/nest-register.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 ; Tests that the 'nest' parameter attribute causes the relevant parameter to be
@@ -5,18 +6,21 @@
 
 define ptr @nest_receiver(ptr nest %arg) nounwind {
 ; CHECK-LABEL: nest_receiver:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: mov x0, x18
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, x15
+; CHECK-NEXT:    ret
 
   ret ptr %arg
 }
 
 define ptr @nest_caller(ptr %arg) nounwind {
 ; CHECK-LABEL: nest_caller:
-; CHECK: mov x18, x0
-; CHECK-NEXT: bl nest_receiver
-; CHECK: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    mov x15, x0
+; CHECK-NEXT:    bl nest_receiver
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
 
   %result = call ptr @nest_receiver(ptr nest %arg)
   ret ptr %result
diff --git a/llvm/test/CodeGen/AArch64/preserve_nonecc_call.ll b/llvm/test/CodeGen/AArch64/preserve_nonecc_call.ll
index 9b9717c19321e7..e0d7b5abe7bea2 100644
--- a/llvm/test/CodeGen/AArch64/preserve_nonecc_call.ll
+++ b/llvm/test/CodeGen/AArch64/preserve_nonecc_call.ll
@@ -184,10 +184,11 @@ declare preserve_nonecc i64 @callee_with_many_param2(i64 %a1, i64 %a2, i64 %a3,
 define preserve_nonecc i64 @callee_with_many_param(i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, i64 %a7, i64 %a8, i64 %a9, i64 %a10, i64 %a11, i64 %a12, i64 %a13, i64 %a14, i64 %a15, i64 %a16, i64 %a17, i64 %a18, i64 %a19, i64 %a20, i64 %a21, i64 %a22, i64 %a23, i64 %a24) {
 ; CHECK-LABEL: callee_with_many_param:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov x8, x15
+; CHECK-NEXT:    ldr x8, [sp, #32]
 ; CHECK-NEXT:    mov x15, x20
 ; CHECK-NEXT:    mov x20, x21
 ; CHECK-NEXT:    mov x21, x22
@@ -212,17 +213,20 @@ define preserve_nonecc i64 @callee_with_many_param(i64 %a1, i64 %a2, i64 %a3, i6
 ; CHECK-NEXT:    mov x13, x14
 ; CHECK-NEXT:    mov x14, x9
 ; CHECK-NEXT:    mov x9, x8
+; CHECK-NEXT:    str x15, [sp]
 ; CHECK-NEXT:    bl callee_with_many_param2
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
 ;
 ; DARWIN-LABEL: callee_with_many_param:
 ; DARWIN:       ; %bb.0:
-; DARWIN-NEXT:    stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
-; DARWIN-NEXT:    .cfi_def_cfa_offset 16
+; DARWIN-NEXT:    sub sp, sp, #32
+; DARWIN-NEXT:    stp x29, x30, [sp, #16] ; 16-byte Folded Spill
+; DARWIN-NEXT:    .cfi_def_cfa_offset 32
 ; DARWIN-NEXT:    .cfi_offset w30, -8
 ; DARWIN-NEXT:    .cfi_offset w29, -16
-; DARWIN-NEXT:    mov x8, x15
+; DARWIN-NEXT:    ldr x8, [sp, #32]
 ; DARWIN-NEXT:    mov x15, x20
 ; DARWIN-NEXT:    mov x20, x21
 ; DARWIN-NEXT:    mov x21, x22
@@ -247,8 +251,10 @@ define preserve_nonecc i64 @callee_with_many_param(i64 %a1, i64 %a2, i64 %a3, i6
 ; DARWIN-NEXT:    mov x13, x14
 ; DARWIN-NEXT:    mov x14, x9
 ; DARWIN-NEXT:    mov x9, x8
+; DARWIN-NEXT:    str x15, [sp]
 ; DARWIN-NEXT:    bl _callee_with_many_param2
-; DARWIN-NEXT:    ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
+; DARWIN-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; DARWIN-NEXT:    add sp, sp, #32
 ; DARWIN-NEXT:    ret
 ;
 ; WIN-LABEL: callee_with_many_param:
@@ -302,17 +308,18 @@ define preserve_nonecc i64 @callee_with_many_param(i64 %a1, i64 %a2, i64 %a3, i6
 define i64 @caller3() {
 ; CHECK-LABEL: caller3:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp d15, d14, [sp, #-160]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Fold...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/126743