[llvm] [clang] [AArch64] Stack probing for dynamic allocas in SelectionDAG (PR #66525)

Fri Nov 17 04:05:43 PST 2023

https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/66525

>From b42de31d5584cddb90c22c94e9d971feaaf0b624 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Wed, 11 Oct 2023 17:22:51 +0100
Subject: [PATCH 1/9] [clang][AArch64] Pass down stack clash protection options
 to LLVM/Backend

---
 clang/lib/CodeGen/CodeGenModule.cpp         | 12 +++++++++++-
 clang/lib/Driver/ToolChains/Clang.cpp       |  2 +-
 clang/test/CodeGen/stack-clash-protection.c | 16 ++++++++++++----
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index f1b900be74b2cdf..bbde22fb5d82ee3 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1085,6 +1085,16 @@ void CodeGenModule::Release() {
                                 "sign-return-address-with-bkey", 1);
   }
 
+  if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) {
+    auto *InlineAsm = llvm::MDString::get(TheModule.getContext(), "inline-asm");
+    if (CodeGenOpts.StackClashProtector)
+      getModule().addModuleFlag(llvm::Module::Override, "probe-stack",
+                                InlineAsm);
+    if (CodeGenOpts.StackProbeSize && CodeGenOpts.StackProbeSize != 4096)
+      getModule().addModuleFlag(llvm::Module::Min, "stack-probe-size",
+                                CodeGenOpts.StackProbeSize);
+  }
+
   if (!CodeGenOpts.MemoryProfileOutput.empty()) {
     llvm::LLVMContext &Ctx = TheModule.getContext();
     getModule().addModuleFlag(
@@ -2296,7 +2306,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
   if ((!D || !D->hasAttr<NoUwtableAttr>()) && CodeGenOpts.UnwindTables)
     B.addUWTableAttr(llvm::UWTableKind(CodeGenOpts.UnwindTables));
 
-  if (CodeGenOpts.StackClashProtector)
+  if (CodeGenOpts.StackClashProtector && !getTarget().getTriple().isAArch64())
     B.addAttribute("probe-stack", "inline-asm");
 
   if (!hasUnwindExceptions(LangOpts))
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index b462f5a44057d94..7add64ac6f2dfd5 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3507,7 +3507,7 @@ static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args,
     return;
 
   if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() &&
-      !EffectiveTriple.isPPC64())
+      !EffectiveTriple.isPPC64() && !EffectiveTriple.isAArch64())
     return;
 
   Args.addOptInFlag(CmdArgs, options::OPT_fstack_clash_protection,
diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c
index 67571f5cdb2c14c..2f502ef453d42f4 100644
--- a/clang/test/CodeGen/stack-clash-protection.c
+++ b/clang/test/CodeGen/stack-clash-protection.c
@@ -1,10 +1,12 @@
 // Check the correct function attributes are generated
-// RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
-// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
-// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
-// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s --check-prefixes CHECK-AARCH64
 
 // CHECK: define{{.*}} void @large_stack() #[[A:.*]] {
+// CHECK-AARCH64: define{{.*}} void @large_stack() #[[A:.*]] {
 void large_stack(void) {
   volatile int stack[20000], i;
   for (i = 0; i < sizeof(stack) / sizeof(int); ++i)
@@ -12,14 +14,20 @@ void large_stack(void) {
 }
 
 // CHECK: define{{.*}} void @vla({{.*}}) #[[A:.*]] {
+// CHECK-AARCH64: define{{.*}} void @vla({{.*}}) #[[A:.*]] {
 void vla(int n) {
   volatile int vla[n];
   __builtin_memset(&vla[0], 0, 1);
 }
 
 // CHECK: define{{.*}} void @builtin_alloca({{.*}}) #[[A:.*]] {
+// CHECK-AARCH64: define{{.*}} void @builtin_alloca({{.*}}) #[[A:.*]] {
 void builtin_alloca(int n) {
   volatile void *mem = __builtin_alloca(n);
 }
 
 // CHECK: attributes #[[A]] = {{.*}} "probe-stack"="inline-asm"
+// CHECK-AARCH64-NOT: attributes #[[A]] = {{.*}} "probe-stack"
+
+// CHECK-AARCH64: !{i32 4, !"probe-stack", !"inline-asm"}
+// CHECK-AARCH64: !{i32 8, !"stack-probe-size", i32 8192}

>From c2b0cc3b824d79d29bdbcd5f284c13f11a56d6ef Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 15 Sep 2023 12:46:44 +0100
Subject: [PATCH 2/9] [AArch64] Stack probing for function prologues

This adds code to AArch64 function prologues to protect against stack
clash attacks by probing (writing to) the stack at regular enough
intervals to ensure that the guard page cannot be skipped over.

The patch depends on and maintains the following invariants:

* Upon function entry the caller guarantees that it has probed the stack
  (e.g. performed a store) at some address [sp, #N],
  where`0 <= N <= 1024`. This invariant comes from a requirement for
  compatibility with GCC.

* Any address range in the allocated stack, no smaller than
  stack-probe-size bytes contains at least one probe

* At any time the stack pointer is above or in the guard page

* Probes are performed in descreasing address order

The `stack-probe-size` is a function attribute that can be set by a
platform to correspond to the guard page size. By default, the stack
probe size is 4KiB, which is a safe default as this is the smallest
possible page size for AArch64. Linux uses a 64KiB guard for AArch64, so
this can be overridden by the `stack-probe-size` function attribute.

For small frames without a frame pointer (<= 240 bytes), no probes are
needed.

For larger frame sizes, LLVM always stores `x29` to the stack. This serves
as an implicit stack probe. Thus, while allocating stack objects the
compiler assumes that the stack has been probed at `[sp]`.

There are multiple probing sequences that can be emitted, depending on
the size of the stack allocation:

* A straight-line sequence of subtracts and stores, used when the
  allocation size is smaller than 5 guard pages.

* A loop allocating and probing one page size per iteration, plus at
  most a single probe to deal with the remainder, used when the
  allocation size is larger but still known at compile time.

* A loop which moves the SP down to the target value held in a register,
  used when the allocation size is not known at compile-time, such as
  when allocating space for SVE values, or when over-aligning the stack.
  This is emitted in AArch64InstrInfo because it will also be used for
  dynamic allocas in a future patch.

* A single probe where the amount of stack adjustment is unknown, but is
  known to be less than or equal to the page size.

Change-Id: Ib31bc23d2806835bc6da822599f2bd2ecc18e4bf
Co-authored-by: Oliver Stannard <oliver.stannard at linaro.org>
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 344 +++++++++-
 .../lib/Target/AArch64/AArch64FrameLowering.h |  25 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   6 +
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  10 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  91 ++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   7 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  25 +-
 .../AArch64/AArch64MachineFunctionInfo.cpp    |  43 +-
 .../AArch64/AArch64MachineFunctionInfo.h      |   6 +
 .../test/CodeGen/AArch64/stack-probing-64k.ll | 392 +++++++++++
 .../test/CodeGen/AArch64/stack-probing-sve.ll | 644 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/stack-probing.ll    | 539 +++++++++++++++
 12 files changed, 2094 insertions(+), 38 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-64k.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-sve.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index b036f7582323700..61847d2e7a6007e 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -689,9 +689,16 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores(
   emitCalleeSavedRestores(MBB, MBBI, true);
 }
 
+// Return the maximum possible number of bytes for `Size` due to the
+// architectural limit on the size of a SVE register.
+static int64_t upperBound(StackOffset Size) {
+  static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
+  return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed();
+}
+
 void AArch64FrameLowering::allocateStackSpace(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-    bool NeedsRealignment, StackOffset AllocSize, bool NeedsWinCFI,
+    int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
     bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const {
 
   if (!AllocSize)
@@ -704,27 +711,126 @@ void AArch64FrameLowering::allocateStackSpace(
   AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
-  Register TargetReg =
-      NeedsRealignment ? findScratchNonCalleeSaveRegister(&MBB) : AArch64::SP;
-  // SUB Xd/SP, SP, AllocSize
+  const int64_t MaxAlign = MFI.getMaxAlign().value();
+  const uint64_t AndMask = ~(MaxAlign - 1);
+
+  if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) {
+    Register TargetReg = RealignmentPadding
+                             ? findScratchNonCalleeSaveRegister(&MBB)
+                             : AArch64::SP;
+    // SUB Xd/SP, SP, AllocSize
+    emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
+                    EmitCFI, InitialOffset);
+
+    if (RealignmentPadding) {
+      // AND SP, X9, 0b11111...0000
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+          .addReg(TargetReg, RegState::Kill)
+          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+          .setMIFlags(MachineInstr::FrameSetup);
+      AFI.setStackRealigned(true);
+
+      // No need for SEH instructions here; if we're realigning the stack,
+      // we've set a frame pointer and already finished the SEH prologue.
+      assert(!NeedsWinCFI);
+    }
+    return;
+  }
+
+  //
+  // Stack probing allocation.
+  //
+
+  // Fixed length allocation. If we don't need to re-align the stack and don't
+  // have SVE objects, we can use a more efficient sequence for stack probing.
+  if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) {
+    Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+    assert(ScratchReg != AArch64::NoRegister);
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC))
+        .addDef(ScratchReg)
+        .addImm(AllocSize.getFixed())
+        .addImm(InitialOffset.getFixed())
+        .addImm(InitialOffset.getScalable());
+    // The fixed allocation may leave unprobed bytes at the top of the
+    // stack. If we have variable-sized objects, we need to issue an extra
+    // probe, so their allocations starts in a known state.
+    if (MFI.hasVarSizedObjects()) {
+      // STR XZR, [SP]
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+
+    return;
+  }
+
+  // Variable length allocation.
+
+  // If the (unknown) allocation size cannot exceed the probe size, decrement
+  // the stack pointer right away.
+  int64_t ProbeSize = AFI.getStackProbeSize();
+  if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) {
+    Register ScratchReg = RealignmentPadding
+                              ? findScratchNonCalleeSaveRegister(&MBB)
+                              : AArch64::SP;
+    assert(ScratchReg != AArch64::NoRegister);
+    // SUB Xd, SP, AllocSize
+    emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII,
+                    MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
+                    EmitCFI, InitialOffset);
+    if (RealignmentPadding) {
+      // AND SP, Xn, 0b11111...0000
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+          .addReg(ScratchReg, RegState::Kill)
+          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+          .setMIFlags(MachineInstr::FrameSetup);
+      if (MFI.hasVarSizedObjects() ||
+          upperBound(AllocSize) + RealignmentPadding >
+              AArch64::StackProbeMaxUnprobedStack) {
+        // STR XZR, [SP]
+        BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+            .addReg(AArch64::XZR)
+            .addReg(AArch64::SP)
+            .addImm(0)
+            .setMIFlags(MachineInstr::FrameSetup);
+      }
+      AFI.setStackRealigned(true);
+    }
+    return;
+  }
+
+// Emit a variable-length allocation probing loop.
+  Register TargetReg = findScratchNonCalleeSaveRegister(&MBB);
+  assert(TargetReg != AArch64::NoRegister);
+  // SUB Xd, SP, AllocSize
   emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
                   MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
                   EmitCFI, InitialOffset);
-
-  if (NeedsRealignment) {
-    const int64_t MaxAlign = MFI.getMaxAlign().value();
-    const uint64_t AndMask = ~(MaxAlign - 1);
-    // AND SP, Xd, 0b11111...0000
-    BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
+  if (RealignmentPadding) {
+    // AND Xn, Xn, 0b11111...0000
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg)
         .addReg(TargetReg, RegState::Kill)
         .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
         .setMIFlags(MachineInstr::FrameSetup);
-    AFI.setStackRealigned(true);
+  }
 
-    // No need for SEH instructions here; if we're realigning the stack,
-    // we've set a frame pointer and already finished the SEH prologue.
-    assert(!NeedsWinCFI);
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
+      .addReg(TargetReg);
+  if (EmitCFI) {
+    // Set the CFA register back to SP.
+    unsigned Reg =
+        Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
   }
+  if (RealignmentPadding)
+    AFI.setStackRealigned(true);
 }
 
 static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
@@ -905,9 +1011,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
 
-  // Don't need a scratch register if we're not going to re-align the stack.
-  if (!RegInfo->hasStackRealignment(*MF))
+  // Don't need a scratch register if we're not going to re-align the stack or
+  // emit stack probes.
+  if (!RegInfo->hasStackRealignment(*MF) && TLI->hasInlineStackProbe(*MF))
     return true;
   // Otherwise, we can use any block as long as it has a scratch register
   // available.
@@ -917,15 +1025,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
 static bool windowsRequiresStackProbe(MachineFunction &MF,
                                       uint64_t StackSizeInBytes) {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  if (!Subtarget.isTargetWindows())
-    return false;
-  const Function &F = MF.getFunction();
+  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
   // TODO: When implementing stack protectors, take that into account
   // for the probe threshold.
-  unsigned StackProbeSize =
-      F.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
-  return (StackSizeInBytes >= StackProbeSize) &&
-         !F.hasFnAttribute("no-stack-arg-probe");
+  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
+         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
 }
 
 static bool needsWinCFI(const MachineFunction &MF) {
@@ -1683,7 +1787,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // Alignment is required for the parent frame, not the funclet
   const bool NeedsRealignment =
       NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
-  int64_t RealignmentPadding =
+  const int64_t RealignmentPadding =
       (NeedsRealignment && MFI.getMaxAlign() > Align(16))
           ? MFI.getMaxAlign().value() - 16
           : 0;
@@ -1819,6 +1923,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // Process the SVE callee-saves to determine what space needs to be
   // allocated.
   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
+    LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
+                      << "\n");
     // Find callee save instructions in frame.
     CalleeSavesBegin = MBBI;
     assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
@@ -1833,7 +1939,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // Allocate space for the callee saves (if any).
   StackOffset CFAOffset =
       StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
-  allocateStackSpace(MBB, CalleeSavesBegin, false, SVECalleeSavesSize, false,
+  allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
                      nullptr, EmitAsyncCFI && !HasFP, CFAOffset);
   CFAOffset += SVECalleeSavesSize;
 
@@ -1848,7 +1954,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
     // the correct value here, as NumBytes also includes padding bytes,
     // which shouldn't be counted here.
-    allocateStackSpace(MBB, CalleeSavesEnd, NeedsRealignment,
+    allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
                        SVELocalsSize + StackOffset::getFixed(NumBytes),
                        NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
                        CFAOffset);
@@ -4061,3 +4167,189 @@ void AArch64FrameLowering::orderFrameObjects(
     dbgs() << "\n";
   });
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+/// least every ProbeSize bytes. Returns an iterator of the first instruction
+/// after the loop. The difference between SP and TargetReg must be an exact
+/// multiple of ProbeSize.
+MachineBasicBlock::iterator
+AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
+    MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
+    Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
+  // in SUB).
+  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
+                  StackOffset::getFixed(-ProbeSize), TII,
+                  MachineInstr::FrameSetup);
+  // STR XZR, [SP]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup);
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+  // B.CC Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::NE)
+      .addMBB(LoopMBB)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
+    MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
+    StackOffset CFAOffset) const {
+  MachineBasicBlock *MBB = MBBI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
+  bool HasFP = hasFP(MF);
+
+  DebugLoc DL;
+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
+  int64_t NumBlocks = FrameSize / ProbeSize;
+  int64_t ResidualSize = FrameSize % ProbeSize;
+
+  LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
+                    << NumBlocks << " blocks of " << ProbeSize
+                    << " bytes, plus " << ResidualSize << " bytes\n");
+
+  // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or
+  // ordinary loop.
+  if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
+      // encodable in a SUB).
+      emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                      StackOffset::getFixed(-ProbeSize), TII,
+                      MachineInstr::FrameSetup, false, false, nullptr,
+                      EmitAsyncCFI && !HasFP, CFAOffset);
+      CFAOffset += StackOffset::getFixed(ProbeSize);
+      // STR XZR, [SP]
+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  } else if (NumBlocks != 0) {
+    // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not
+    // encodable in ADD). ScrathReg may temporarily become the CFA register.
+    emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP,
+                    StackOffset::getFixed(-ProbeSize * NumBlocks), TII,
+                    MachineInstr::FrameSetup, false, false, nullptr,
+                    EmitAsyncCFI && !HasFP, CFAOffset);
+    CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks);
+    MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg);
+    MBB = MBBI->getParent();
+    if (EmitAsyncCFI && !HasFP) {
+      // Set the CFA register back to SP.
+      const AArch64RegisterInfo &RegInfo =
+          *MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+      unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (ResidualSize != 0) {
+    // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable
+    // in SUB).
+    emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(-ResidualSize), TII,
+                    MachineInstr::FrameSetup, false, false, nullptr,
+                    EmitAsyncCFI && !HasFP, CFAOffset);
+    if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
+      // STR XZR, [SP]
+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  MachineBasicBlock::iterator Next = std::next(MBBI);
+  return Next;
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
+    MachineBasicBlock::iterator MBBI) const {
+
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  int64_t FrameSize = MBBI->getOperand(1).getImm();
+  StackOffset CFAOffset = StackOffset::get(MBBI->getOperand(2).getImm(),
+                                           MBBI->getOperand(3).getImm());
+
+  MachineBasicBlock::iterator NextInst =
+      inlineStackProbeFixed(MBBI, ScratchReg, FrameSize, CFAOffset);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar(
+    MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register TargetReg = MBBI->getOperand(0).getReg();
+
+  MachineBasicBlock::iterator NextInst =
+      TII->probedStackAlloc(MBBI, TargetReg, true);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                            MachineBasicBlock &MBB) const {
+  for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
+    if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC) {
+      MBBI = inlineStackProbeFixed(MBBI);
+      E = MBBI->getParent()->end();
+    } else if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR) {
+      MBBI = inlineStackProbeVar(MBBI);
+      E = MBBI->getParent()->end();
+    } else {
+      ++MBBI;
+    }
+  }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index f3313f3b53fffe0..0ff06abca363ac3 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -152,13 +152,32 @@ class AArch64FrameLowering : public TargetFrameLowering {
                                   MachineBasicBlock::iterator MBBI) const;
   void allocateStackSpace(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
-                          bool NeedsRealignment, StackOffset AllocSize,
-                          bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
-                          StackOffset InitialOffset) const;
+                          int64_t RealignmentPadding, StackOffset AllocSize,
+                          bool NeedsWinCFI, bool *HasWinCFI,
+                          bool EmitCFI, StackOffset InitialOffset) const;
 
   /// Emit target zero call-used regs.
   void emitZeroCallUsedRegs(BitVector RegsToZero,
                             MachineBasicBlock &MBB) const override;
+
+  /// Replace a StackProbe stub (if any) with the actual probe code inline
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeFixed(MachineBasicBlock::iterator MBBI, Register ScratchReg,
+                        int64_t FrameSize, StackOffset CFAOffset) const;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeFixed(MachineBasicBlock::iterator MBBI) const;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeVar(MachineBasicBlock::iterator MBBI) const;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
+                                    int64_t NegProbeSize,
+                                    Register TargetReg) const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e21d5da5a2357c1..53b32513444d069 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26587,3 +26587,9 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
 unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
   return Subtarget->getMinimumJumpTableEntries();
 }
+
+bool AArch64TargetLowering::hasInlineStackProbe(
+    const MachineFunction &MF) const {
+  return !Subtarget->isTargetWindows() &&
+         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f7d004fa3cbcc3a..5d7f315d84928af 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -506,6 +506,13 @@ const unsigned RoundingBitsPos = 22;
 ArrayRef<MCPhysReg> getGPRArgRegs();
 ArrayRef<MCPhysReg> getFPRArgRegs();
 
+/// Maximum allowed number of unprobed bytes above SP at an ABI
+/// boundary.
+const unsigned StackProbeMaxUnprobedStack = 1024;
+
+/// Maximum number of iterations to unroll for a constant size probing loop.
+const unsigned StackProbeMaxLoopUnroll = 4;
+
 } // namespace AArch64
 
 class AArch64Subtarget;
@@ -946,6 +953,9 @@ class AArch64TargetLowering : public TargetLowering {
   // used for 64bit and 128bit vectors as well.
   bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
 
+  /// True if stack clash protection is enabled for this functions.
+  bool hasInlineStackProbe(const MachineFunction &MF) const override;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 6fdf5363bae2928..34da52d5a017e34 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AArch64ExpandImm.h"
 #include "AArch64InstrInfo.h"
+#include "AArch64ExpandImm.h"
 #include "AArch64FrameLowering.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64PointerAuth.h"
@@ -21,6 +21,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -9461,6 +9462,94 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
   return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
 }
 
+MachineBasicBlock::iterator
+AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
+                                   Register TargetReg, bool FrameSetup) const {
+  assert(TargetReg != AArch64::SP && "New top of stack cannot aleady be in SP");
+
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+  MachineInstr::MIFlag Flags =
+      FrameSetup ? MachineInstr::FrameSetup : MachineInstr::NoFlags;
+
+  // LoopTest:
+  //   SUB SP, SP, #ProbeSize
+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
+                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII, Flags);
+
+  //   CMP SP, TargetReg
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(Flags);
+
+  //   B.<Cond> LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::LE)
+      .addMBB(ExitMBB)
+      .setMIFlags(Flags);
+
+  //   STR XZR, [SP]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(Flags);
+
+  //   B loop
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
+      .addMBB(LoopTestMBB)
+      .setMIFlags(Flags);
+
+  // LoopExit:
+  //   MOV SP, TargetReg
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(0)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+      .setMIFlags(Flags);
+
+  //   STR XZR, [SP]
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(Flags);
+
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopBodyMBB);
+  LoopBodyMBB->addSuccessor(LoopTestMBB);
+  MBB.addSuccessor(LoopTestMBB);
+
+  // Update liveins.
+  if (MF.getRegInfo().reservedRegsFrozen()) {
+    recomputeLiveIns(*LoopTestMBB);
+    recomputeLiveIns(*LoopBodyMBB);
+    recomputeLiveIns(*ExitMBB);
+  }
+
+  return ExitMBB->begin();
+}
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index a934103c90cbf92..b973b47bde344ad 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -382,6 +382,13 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                              unsigned Scale) const;
 
+  // Decrement the SP, issuing probes along the way. `TargetReg` is the new top
+  // of the stack. `FrameSetup` is passed as true, if the allocation is a part
+  // of constructing the activation frame of a function.
+  MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI,
+                                               Register TargetReg,
+                                               bool FrameSetup) const;
+
 #define GET_INSTRINFO_HELPER_DECLS
 #include "AArch64GenInstrInfo.inc"
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 290c79f7bacdb8f..7100ba3d9c061f4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -932,7 +932,8 @@ include "SMEInstrFormats.td"
 // Miscellaneous instructions.
 //===----------------------------------------------------------------------===//
 
-let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+let Defs = [SP], Uses = [SP] in {
 // We set Sched to empty list because we expect these instructions to simply get
 // removed in most cases.
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -941,7 +942,27 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
 def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                             [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
                             Sched<[]>;
-} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+
+}
+
+let Defs = [SP, NZCV], Uses = [SP] in {
+// Probed stack allocation of a constant size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch),
+                               (ins i64imm:$stacksize, i64imm:$fixed_offset,
+                                i64imm:$scalable_offset),
+                               []>,
+                               Sched<[]>;
+
+// Probed stack allocation of a variable size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC_VAR : Pseudo<(outs),
+                                   (ins GPR64sp:$target),
+                                   []>,
+                                   Sched<[]>;
+
+} // Defs = [SP, NZCV], Uses = [SP] in 
+} // hasSideEffects = 1, isCodeGenOnly = 1
 
 let isReMaterializable = 1, isCodeGenOnly = 1 in {
 // FIXME: The following pseudo instructions are only needed because remat
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 7bb5041b8ba9481..f8b73ba6dda3c2a 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -97,14 +97,45 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
     if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
             F.getParent()->getModuleFlag("branch-target-enforcement")))
       BranchTargetEnforcement = BTE->getZExtValue();
-    return;
+  } else {
+    const StringRef BTIEnable =
+        F.getFnAttribute("branch-target-enforcement").getValueAsString();
+    assert(BTIEnable.equals_insensitive("true") ||
+           BTIEnable.equals_insensitive("false"));
+    BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
   }
 
-  const StringRef BTIEnable =
-      F.getFnAttribute("branch-target-enforcement").getValueAsString();
-  assert(BTIEnable.equals_insensitive("true") ||
-         BTIEnable.equals_insensitive("false"));
-  BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  uint64_t ProbeSize = 4096;
+  if (F.hasFnAttribute("stack-probe-size"))
+    ProbeSize = F.getFnAttributeAsParsedInteger("stack-probe-size");
+  else if (const auto *PS = mdconst::extract_or_null<ConstantInt>(
+               F.getParent()->getModuleFlag("stack-probe-size")))
+    ProbeSize = PS->getZExtValue();
+  assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size");
+
+  if (STI->isTargetWindows()) {
+    if (!F.hasFnAttribute("no-stack-arg-probe"))
+      StackProbeSize = ProbeSize;
+  } else {
+    // Round down to the stack alignment.
+    uint64_t StackAlign =
+        STI->getFrameLowering()->getTransientStackAlign().value();
+    ProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U));
+    StringRef ProbeKind;
+    if (F.hasFnAttribute("probe-stack"))
+      ProbeKind = F.getFnAttribute("probe-stack").getValueAsString();
+    else if (const auto *PS = dyn_cast_or_null<MDString>(
+                 F.getParent()->getModuleFlag("probe-stack")))
+      ProbeKind = PS->getString();
+    if (ProbeKind.size()) {
+      if (ProbeKind != "inline-asm")
+        report_fatal_error("Unsupported stack probing method");
+      StackProbeSize = ProbeSize;
+    }
+  }
 }
 
 MachineFunctionInfo *AArch64FunctionInfo::clone(
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 0b8bfb04a572c77..219f83cfd32e0ee 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -194,6 +194,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// True if the function need asynchronous unwind information.
   mutable std::optional<bool> NeedsAsyncDwarfUnwindInfo;
 
+  int64_t StackProbeSize = 0;
+
 public:
   AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
 
@@ -456,6 +458,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
     HasStreamingModeChanges = HasChanges;
   }
 
+  bool hasStackProbing() const { return StackProbeSize != 0; }
+
+  int64_t getStackProbeSize() const { return StackProbeSize; }
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
new file mode 100644
index 000000000000000..945c271d375001b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
@@ -0,0 +1,392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+
+; Tests for prolog sequences for stack probing, when using a 64KiB stack guard.
+
+; 64k bytes is the largest frame we can probe in one go.
+define void @static_65536(ptr %out) #0 {
+; CHECK-LABEL: static_65536:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 65536, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+16 bytes, still needs just one probe.
+define void @static_65552(ptr %out) #0 {
+; CHECK-LABEL: static_65552:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp], #-16
+; CHECK-NEXT:    .cfi_def_cfa_offset 65568
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 65552, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+1024 bytes, the largest frame which needs just one probe.
+define void @static_66560(ptr %out) #0 {
+; CHECK-LABEL: static_66560:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 66576
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 66560, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+1024+16 bytes, the smallest frame which needs two probes.
+define void @static_66576(ptr %out) #0 {
+; CHECK-LABEL: static_66576:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 66592
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 66576, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 2*64k+1024, the largest frame needing two probes.
+define void @static_132096(ptr %out) #0 {
+; CHECK-LABEL: static_132096:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 131088
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 132112
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #32, lsl #12 // =131072
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 132096, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k-16, the largest frame probed without a loop.
+define void @static_327664(ptr %out) #0 {
+; CHECK-LABEL: static_327664:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 131088
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 196624
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 262160
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #15, lsl #12 // =61440
+; CHECK-NEXT:    .cfi_def_cfa_offset 323600
+; CHECK-NEXT:    sub sp, sp, #4080
+; CHECK-NEXT:    .cfi_def_cfa_offset 327680
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #79, lsl #12 // =323584
+; CHECK-NEXT:    .cfi_def_cfa_offset 4096
+; CHECK-NEXT:    add sp, sp, #4080
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 327664, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k, smallest frame probed with a loop.
+define void @static_327680(ptr %out) #0 {
+; CHECK-LABEL: static_327680:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa w9, 327696
+; CHECK-NEXT:  .LBB6_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB6_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 327680, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB
+; so has a reminder, but no extra probe.
+define void @static_328704(ptr %out) #0 {
+; CHECK-LABEL: static_328704:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa w9, 327696
+; CHECK-NEXT:  .LBB7_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB7_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 328720
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 328704, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k+1040, large enough to use a loop, has a reminder and
+; an extra probe.
+define void @static_328720(ptr %out) #0 {
+; CHECK-LABEL: static_328720:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa w9, 327696
+; CHECK-NEXT:  .LBB8_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB8_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 328736
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 328720, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement. We do this
+; by moving SP far enough that a sufficiently-aligned block will exist
+; somewhere in the stack frame, so must probe the whole of that larger SP move.
+define void @static_16_align_131072(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_131072:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #31, lsl #12 // =126976
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffe0000
+; CHECK-NEXT:  .LBB9_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB9_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB9_1
+; CHECK-NEXT:  .LBB9_3: // %entry
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 16, align 131072
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement which
+; is nevertheless small enough as to not need a loop.
+define void @static_16_align_8192(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_8192:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffe000
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 16, align 8192
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A large allocation with a very large alignment requirement which
+; is nevertheless small enough as to not need a loop.
+define void @static_32752_align_32k(ptr %out) #0 {
+; CHECK-LABEL: static_32752_align_32k:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #7, lsl #12 // =28672
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and sp, x9, #0xffffffffffff8000
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 32752, align 32768
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
new file mode 100644
index 000000000000000..28e442439d8860b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -0,0 +1,644 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
+
+; Test prolog sequences for stack probing when SVE objects are involved.
+
+; The space for SVE objects needs probing in the general case, because
+; the stack adjustment may happen to be too big (i.e. greater than the
+; probe size) to allocate with a single `addvl`.
+; When we do know that the stack adjustment cannot exceed the probe size
+; we can avoid emitting a probe loop and emit a simple `addvl; str`
+; sequence instead.
+
+define void @sve_1_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 4 SVE vectors of stack space.
+define void @sve_4_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_4_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 16 SVE vectors of stack space.
+; The stack adjustment is less than or equal to 16 x 256 = 4096, so
+; we can allocate the locals at once.
+define void @sve_16_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_16_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT:    addvl sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 17 SVE vectors of stack space. Now we need
+; a probing loops since stack adjustment may be greater than
+; the probe size (17 x 256 = 4354 bytes)
+define void @sve_17_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_17_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x9, sp, #-17
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
+; CHECK-NEXT:  .LBB3_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB3_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_1
+; CHECK-NEXT:  .LBB3_3: // %entry
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    addvl sp, sp, #17
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  %vec17 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; Space for callee-saved SVE register is allocated similarly to allocating
+; space for SVE locals. When we know the stack adjustment cannot exceed the
+; probe size we can skip the explict probe, since saving SVE registers serves
+; as an implicit probe.
+define void @sve_1v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_1v_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{z8}" ()
+  ret void
+}
+
+define void @sve_4v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_4v_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    .cfi_restore z11
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" ()
+  ret void
+}
+
+define void @sve_16v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_16v_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT:    str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    .cfi_restore z11
+; CHECK-NEXT:    .cfi_restore z12
+; CHECK-NEXT:    .cfi_restore z13
+; CHECK-NEXT:    .cfi_restore z14
+; CHECK-NEXT:    .cfi_restore z15
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
+  ret void
+}
+
+define void @sve_1p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_1p_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{p8}" ()
+  ret void
+}
+
+define void @sve_4p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_4p_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str p11, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" ()
+  ret void
+}
+
+define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_16v_1p_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x9, sp, #-17
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
+; CHECK-NEXT:  .LBB9_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB9_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB9_1
+; CHECK-NEXT:  .LBB9_3: // %entry
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #17
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    .cfi_restore z11
+; CHECK-NEXT:    .cfi_restore z12
+; CHECK-NEXT:    .cfi_restore z13
+; CHECK-NEXT:    .cfi_restore z14
+; CHECK-NEXT:    .cfi_restore z15
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
+  ret void
+}
+
+; A SVE vector and a 16-byte fixed size object.
+define void @sve_1_vector_16_arr(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_16_arr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  %arr = alloca i8, i64 16, align 1
+  ret void
+}
+
+; A large SVE stack object and a large stack slot, both of which need probing.
+; TODO: This could be optimised by combining the fixed-size offset into the
+; loop.
+define void @sve_1_vector_4096_arr(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_4096_arr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #3, lsl #12 // =12288
+; CHECK-NEXT:    .cfi_def_cfa w9, 12304
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG
+; CHECK-NEXT:  .LBB11_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB11_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB11_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB11_1
+; CHECK-NEXT:  .LBB11_3: // %entry
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    .cfi_def_cfa wsp, 12304
+; CHECK-NEXT:    add sp, sp, #3, lsl #12 // =12288
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 256 x float>, align 16
+  %arr = alloca i8, i64 12288, align 1
+  ret void
+}
+
+; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently
+; supported even without stack-probing.
+
+; An SVE vector, and a 16-byte fixed size object, which
+; has a large alignment requirement.
+define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_16_arr_align_8192:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    addvl x9, x9, #-1
+; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB12_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB12_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB12_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB12_1
+; CHECK-NEXT:  .LBB12_3: // %entry
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  %arr = alloca i8, i64 16, align 8192
+  ret void
+}
+
+; With 64k guard pages, we can allocate bigger SVE space without a probing loop.
+define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: sve_1024_64k_guard:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT:    addvl sp, sp, #8
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 1024 x float>, align 16
+  ret void
+}
+
+define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: sve_1028_64k_guard:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x9, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG
+; CHECK-NEXT:    addvl x9, x9, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG
+; CHECK-NEXT:  .LBB14_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB14_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB14_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB14_1
+; CHECK-NEXT:  .LBB14_3: // %entry
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 1024 x float>, align 16
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
new file mode 100644
index 000000000000000..5c5d9321a56e581
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+
+; Tests for prolog sequences for stack probing, when using a 4KiB stack guard.
+
+; The stack probing parameters in function attributes take precedence over
+; ones in the module flags.
+
+; Small stack frame, no probing required.
+define void @static_64(ptr %out) #0 {
+; CHECK-LABEL: static_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 64, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 256 bytes we start to always create a frame pointer. No frame smaller then
+; this needs a probe, so we can use the saving of at least one CSR as a probe
+; at the top of our frame.
+define void @static_256(ptr %out) #0 {
+; CHECK-LABEL: static_256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #272
+; CHECK-NEXT:    .cfi_def_cfa_offset 272
+; CHECK-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #272
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 256, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 1024 bytes, this is the largest frame which doesn't need probing.
+define void @static_1024(ptr %out) #0 {
+; CHECK-LABEL: static_1024:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 1024, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 1024+16 bytes, this is the smallest frame which needs probing.
+define void @static_1040(ptr %out) #0 {
+; CHECK-LABEL: static_1040:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 1040, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k bytes is the largest frame we can probe in one go.
+define void @static_4096(ptr %out) #0 {
+; CHECK-LABEL: static_4096:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 4096, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+16 bytes, still needs just one probe.
+define void @static_4112(ptr %out) #0 {
+; CHECK-LABEL: static_4112:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp], #-16
+; CHECK-NEXT:    .cfi_def_cfa_offset 4128
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 4112, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+1024 bytes, the largest frame which needs just one probe.
+define void @static_5120(ptr %out) #0 {
+; CHECK-LABEL: static_5120:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 5136
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 5120, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+1024+16, the smallest frame which needs two probes.
+define void @static_5136(ptr %out) #0 {
+; CHECK-LABEL: static_5136:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 5152
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 5136, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 2*4k+1024, the largest frame needing two probes
+define void @static_9216(ptr %out) #0 {
+; CHECK-LABEL: static_9216:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 8208
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 9232
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 9216, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k-16, the largest frame probed without a loop
+define void @static_20464(ptr %out) #0 {
+; CHECK-LABEL: static_20464:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 8208
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 12304
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 16400
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #4080
+; CHECK-NEXT:    .cfi_def_cfa_offset 20480
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #4, lsl #12 // =16384
+; CHECK-NEXT:    .cfi_def_cfa_offset 4096
+; CHECK-NEXT:    add sp, sp, #4080
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 20464, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k, the smallest frame probed with a loop
+define void @static_20480(ptr %out) #0 {
+; CHECK-LABEL: static_20480:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa w9, 20496
+; CHECK-NEXT:  .LBB10_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB10_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 20480, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB
+; so has a reminder, but no extra probe.
+define void @static_21504(ptr %out) #0 {
+; CHECK-LABEL: static_21504:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa w9, 20496
+; CHECK-NEXT:  .LBB11_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB11_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 21520
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 21504, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k+1040, large enough to use a loop, has a reminder and
+; an extra probe.
+define void @static_21520(ptr %out) #0 {
+; CHECK-LABEL: static_21520:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa w9, 20496
+; CHECK-NEXT:  .LBB12_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB12_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 21536
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 21520, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement. We do this
+; by moving SP far enough that a sufficiently-aligned block will exist
+; somewhere in the stack frame, so must probe the whole of that larger SP move.
+define void @static_16_align_8192(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_8192:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB13_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB13_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB13_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB13_1
+; CHECK-NEXT:  .LBB13_3: // %entry
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 16, align 8192
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation with a very large alignment requirement, but
+; nevertheless small enough as to not need a loop.
+define void @static_16_align_2048(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_2048:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #2032
+; CHECK-NEXT:    and sp, x9, #0xfffffffffffff800
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 16, align 2048
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A large(-ish) allocation with a very large alignment requirement, but
+; nevertheless small enough as to not need a loop.
+define void @static_2032_align_2048(ptr %out) #0 {
+; CHECK-LABEL: static_2032_align_2048:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #2032
+; CHECK-NEXT:    and sp, x9, #0xfffffffffffff800
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 2032, align 2048
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Test stack probing is enabled by module flags
+define void @static_9232(ptr %out) uwtable(async) {
+; CHECK-LABEL: static_9232:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT:    .cfi_def_cfa_offset 8208
+; CHECK-NEXT:    sub sp, sp, #800
+; CHECK-NEXT:    .cfi_def_cfa_offset 9008
+; CHECK-NEXT:    str xzr, [sp], #-240
+; CHECK-NEXT:    .cfi_def_cfa_offset 9248
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 9232, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Test for a tight upper bound on the amount of stack adjustment
+; due to stack realignment. No probes should appear.
+define void @static_1008(ptr %out) #0 {
+; CHECK-LABEL: static_1008:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #1008
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i32 1008, align 32
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 4, !"probe-stack", !"inline-asm"}
+!1 = !{i32 8, !"stack-probe-size", i32 9000}

>From efefa3b52ac61f736210770eebfdc17bd1c5d667 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 13 Nov 2023 14:46:08 +0000
Subject: [PATCH 3/9] Fix a failure to issue a probe when there can be more
 than 1024 unprobed bytes at top of stack

---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 19 ++++++------
 .../test/CodeGen/AArch64/stack-probing-sve.ll | 30 +++++++++++++++++++
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 61847d2e7a6007e..e2d687570f5349b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -787,18 +787,17 @@ void AArch64FrameLowering::allocateStackSpace(
           .addReg(ScratchReg, RegState::Kill)
           .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
           .setMIFlags(MachineInstr::FrameSetup);
-      if (MFI.hasVarSizedObjects() ||
-          upperBound(AllocSize) + RealignmentPadding >
-              AArch64::StackProbeMaxUnprobedStack) {
-        // STR XZR, [SP]
-        BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
-            .addReg(AArch64::XZR)
-            .addReg(AArch64::SP)
-            .addImm(0)
-            .setMIFlags(MachineInstr::FrameSetup);
-      }
       AFI.setStackRealigned(true);
     }
+    if (MFI.hasVarSizedObjects() || upperBound(AllocSize) + RealignmentPadding >
+                                        AArch64::StackProbeMaxUnprobedStack) {
+      // STR XZR, [SP]
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
     return;
   }
 
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 28e442439d8860b..3e7f5e599bcedf8 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -64,6 +64,7 @@ define void @sve_16_vector(ptr %out) #0 {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    addvl sp, sp, #-16
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #16
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
@@ -215,6 +216,7 @@ define void @sve_16v_csr(<vscale x 4 x float> %a) #0 {
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    addvl sp, sp, #-16
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str z23, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z22, [sp, #1, mul vl] // 16-byte Folded Spill
 ; CHECK-NEXT:    str z21, [sp, #2, mul vl] // 16-byte Folded Spill
@@ -549,6 +551,7 @@ define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG
 ; CHECK-NEXT:    addvl sp, sp, #-32
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #31
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG
 ; CHECK-NEXT:    addvl sp, sp, #31
@@ -641,4 +644,31 @@ entry:
   ret void
 }
 
+; With 5 SVE vectors of stack space the unprobed area
+; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280),
+; hence we need to issue a probe.
+define void @sve_5_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_5_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-5
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    addvl sp, sp, #5
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
 attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }

>From c051ef19d54d4fd8e4abdef2cbdbd1a8eddda077 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 14 Nov 2023 13:33:53 +0000
Subject: [PATCH 4/9] Fix for a missing probe to the SVE callee-saved registers
 area

In some cases a probe may be elided, resulting in more than 1024
unprobed area at the top of the stack or decrement of the SP by more
than a guard area size.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 20 ++++----
 .../lib/Target/AArch64/AArch64FrameLowering.h |  4 +-
 .../test/CodeGen/AArch64/stack-probing-sve.ll | 48 +++++++++++++++++++
 3 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index e2d687570f5349b..0cb8c7d1b53b652 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -699,7 +699,8 @@ static int64_t upperBound(StackOffset Size) {
 void AArch64FrameLowering::allocateStackSpace(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
-    bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset) const {
+    bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset,
+    bool FollowupAllocs) const {
 
   if (!AllocSize)
     return;
@@ -753,9 +754,10 @@ void AArch64FrameLowering::allocateStackSpace(
         .addImm(InitialOffset.getFixed())
         .addImm(InitialOffset.getScalable());
     // The fixed allocation may leave unprobed bytes at the top of the
-    // stack. If we have variable-sized objects, we need to issue an extra
-    // probe, so their allocations starts in a known state.
-    if (MFI.hasVarSizedObjects()) {
+    // stack. If we have subsequent alocation (e.g. if we have variable-sized
+    // objects), we need to issue an extra probe, so these allocations start in
+    // a known state.
+    if (FollowupAllocs) {
       // STR XZR, [SP]
       BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
           .addReg(AArch64::XZR)
@@ -789,8 +791,8 @@ void AArch64FrameLowering::allocateStackSpace(
           .setMIFlags(MachineInstr::FrameSetup);
       AFI.setStackRealigned(true);
     }
-    if (MFI.hasVarSizedObjects() || upperBound(AllocSize) + RealignmentPadding >
-                                        AArch64::StackProbeMaxUnprobedStack) {
+    if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding >
+                              AArch64::StackProbeMaxUnprobedStack) {
       // STR XZR, [SP]
       BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
           .addReg(AArch64::XZR)
@@ -1938,8 +1940,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // Allocate space for the callee saves (if any).
   StackOffset CFAOffset =
       StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
+  StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
   allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
-                     nullptr, EmitAsyncCFI && !HasFP, CFAOffset);
+                     nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
+                     MFI.hasVarSizedObjects() || LocalsSize);
   CFAOffset += SVECalleeSavesSize;
 
   if (EmitAsyncCFI)
@@ -1956,7 +1960,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
                        SVELocalsSize + StackOffset::getFixed(NumBytes),
                        NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
-                       CFAOffset);
+                       CFAOffset, MFI.hasVarSizedObjects());
   }
 
   // If we need a base pointer, set it up here. It's whatever the value of the
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 0ff06abca363ac3..f0c3106cb7017be 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -153,8 +153,8 @@ class AArch64FrameLowering : public TargetFrameLowering {
   void allocateStackSpace(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI,
                           int64_t RealignmentPadding, StackOffset AllocSize,
-                          bool NeedsWinCFI, bool *HasWinCFI,
-                          bool EmitCFI, StackOffset InitialOffset) const;
+                          bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
+                          StackOffset InitialOffset, bool FollowupAllocs) const;
 
   /// Emit target zero call-used regs.
   void emitZeroCallUsedRegs(BitVector RegsToZero,
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 3e7f5e599bcedf8..f49755e709ae68f 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -671,4 +671,52 @@ entry:
   ret void
 }
 
+; Test with a 14 scalable bytes (so up to 14 * 16 = 224) of unprobed
+; are bellow the save location of `p9`.
+define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
+; CHECK-LABEL: sve_unprobed_area:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str p9, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()
+
+  %v0 = alloca <vscale x 4 x float>, align 16
+  %v1 = alloca <vscale x 4 x float>, align 16
+  %v2 = alloca <vscale x 4 x float>, align 16
+  %v3 = alloca <vscale x 4 x float>, align 16
+
+  ret void
+}
+
 attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }

>From 60e918b1ee25be38043b2e2b99fca997b6f3c993 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 17 Nov 2023 11:15:02 +0000
Subject: [PATCH 5/9] Fix a crash when the probing instruction to replace is
 last in block

---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  69 +++------
 .../lib/Target/AArch64/AArch64FrameLowering.h |  12 +-
 .../AArch64/stack-probing-last-in-block.mir   | 146 ++++++++++++++++++
 3 files changed, 174 insertions(+), 53 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0cb8c7d1b53b652..9c710ebd85862a1 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4228,7 +4228,7 @@ AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
   return ExitMBB->begin();
 }
 
-MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
+void AArch64FrameLowering::inlineStackProbeFixed(
     MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
     StackOffset CFAOffset) const {
   MachineBasicBlock *MBB = MBBI->getParent();
@@ -4305,54 +4305,35 @@ MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
           .setMIFlags(MachineInstr::FrameSetup);
     }
   }
-
-  MachineBasicBlock::iterator Next = std::next(MBBI);
-  return Next;
-}
-
-MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
-    MachineBasicBlock::iterator MBBI) const {
-
-  Register ScratchReg = MBBI->getOperand(0).getReg();
-  int64_t FrameSize = MBBI->getOperand(1).getImm();
-  StackOffset CFAOffset = StackOffset::get(MBBI->getOperand(2).getImm(),
-                                           MBBI->getOperand(3).getImm());
-
-  MachineBasicBlock::iterator NextInst =
-      inlineStackProbeFixed(MBBI, ScratchReg, FrameSize, CFAOffset);
-
-  MBBI->eraseFromParent();
-  return NextInst;
-}
-
-MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar(
-    MachineBasicBlock::iterator MBBI) const {
-  MachineBasicBlock &MBB = *MBBI->getParent();
-  MachineFunction &MF = *MBB.getParent();
-  const AArch64InstrInfo *TII =
-      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
-
-  DebugLoc DL = MBB.findDebugLoc(MBBI);
-  Register TargetReg = MBBI->getOperand(0).getReg();
-
-  MachineBasicBlock::iterator NextInst =
-      TII->probedStackAlloc(MBBI, TargetReg, true);
-
-  MBBI->eraseFromParent();
-  return NextInst;
 }
 
 void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
                                             MachineBasicBlock &MBB) const {
-  for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
-    if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC) {
-      MBBI = inlineStackProbeFixed(MBBI);
-      E = MBBI->getParent()->end();
-    } else if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR) {
-      MBBI = inlineStackProbeVar(MBBI);
-      E = MBBI->getParent()->end();
+  // Get the instructions that need to be replaced. We emit at most two of
+  // these. Remember them in order to avoid complications coming from the need
+  // to traverse the block while potentially creating more blocks.
+  SmallVector<MachineInstr *, 4> ToReplace;
+  for (MachineInstr &MI : MBB)
+    if (MI.getOpcode() == AArch64::PROBED_STACKALLOC ||
+        MI.getOpcode() == AArch64::PROBED_STACKALLOC_VAR)
+      ToReplace.push_back(&MI);
+
+  for (MachineInstr *MI : ToReplace) {
+    if (MI->getOpcode() == AArch64::PROBED_STACKALLOC) {
+      Register ScratchReg = MI->getOperand(0).getReg();
+      int64_t FrameSize = MI->getOperand(1).getImm();
+      StackOffset CFAOffset = StackOffset::get(MI->getOperand(2).getImm(),
+                                               MI->getOperand(3).getImm());
+      inlineStackProbeFixed(MI->getIterator(), ScratchReg, FrameSize,
+                            CFAOffset);
     } else {
-      ++MBBI;
+      assert(MI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR &&
+             "Stack probe pseudo-instruction expected");
+      const AArch64InstrInfo *TII =
+          MI->getMF()->getSubtarget<AArch64Subtarget>().getInstrInfo();
+      Register TargetReg = MI->getOperand(0).getReg();
+      (void)TII->probedStackAlloc(MI->getIterator(), TargetReg, true);
     }
+    MI->eraseFromParent();
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index f0c3106cb7017be..941af03a78b7387 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -164,15 +164,9 @@ class AArch64FrameLowering : public TargetFrameLowering {
   void inlineStackProbe(MachineFunction &MF,
                         MachineBasicBlock &PrologueMBB) const override;
 
-  MachineBasicBlock::iterator
-  inlineStackProbeFixed(MachineBasicBlock::iterator MBBI, Register ScratchReg,
-                        int64_t FrameSize, StackOffset CFAOffset) const;
-
-  MachineBasicBlock::iterator
-  inlineStackProbeFixed(MachineBasicBlock::iterator MBBI) const;
-
-  MachineBasicBlock::iterator
-  inlineStackProbeVar(MachineBasicBlock::iterator MBBI) const;
+  void inlineStackProbeFixed(MachineBasicBlock::iterator MBBI,
+                             Register ScratchReg, int64_t FrameSize,
+                             StackOffset CFAOffset) const;
 
   MachineBasicBlock::iterator
   inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
new file mode 100644
index 000000000000000..6c8ec7e4c4fa924
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-last-in-block.mir
@@ -0,0 +1,146 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -run-pass=prologepilog %s -o - | FileCheck %s
+# Regression test for a crash when the probing instruction
+# to replace is last in the block.
+--- |
+  source_filename = "tt.ll"
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-linux"
+
+  declare i1 @g(ptr)
+
+  define void @f(ptr %out) #0 {
+  entry:
+    %p = alloca i32, i32 50000, align 4
+    br label %loop
+
+  loop:                                             ; preds = %loop, %entry
+    %c = call i1 @g(ptr %p)
+    br i1 %c, label %loop, label %exit
+
+  exit:                                             ; preds = %loop
+    ret void
+  }
+
+  attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+sve" }
+
+...
+---
+name:            f
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: true
+registers:       []
+liveins:         []
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  200000
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: p, type: default, offset: 0, size: 200000, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -200000, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo: {}
+body:             |
+  ; CHECK-LABEL: name: f
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.3(0x80000000)
+  ; CHECK-NEXT:   liveins: $lr, $fp
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1)
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $w30, -8
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $w29, -16
+  ; CHECK-NEXT:   $x9 = frame-setup SUBXri $sp, 48, 12
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa $w9, 196624
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.entry:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT:   liveins: $x9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $sp = frame-setup SUBXri $sp, 1, 12
+  ; CHECK-NEXT:   frame-setup STRXui $xzr, $sp, 0
+  ; CHECK-NEXT:   $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv
+  ; CHECK-NEXT:   frame-setup Bcc 1, %bb.3, implicit $nzcv
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_register $wsp
+  ; CHECK-NEXT:   $sp = frame-setup SUBXri $sp, 3392, 0
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 200016
+  ; CHECK-NEXT:   frame-setup STRXui $xzr, $sp, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.loop:
+  ; CHECK-NEXT:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $x0 = ADDXri $sp, 0, 0
+  ; CHECK-NEXT:   BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   TBNZW killed renamable $w0, 0, %bb.1
+  ; CHECK-NEXT:   B %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.exit:
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 48, 12
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 3408
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 3392, 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w30
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w29
+  ; CHECK-NEXT:   RET_ReallyLR
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+
+
+  bb.1.loop:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $x0 = ADDXri %stack.0.p, 0, 0
+    BL @g, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    TBNZW killed renamable $w0, 0, %bb.1
+    B %bb.2
+
+  bb.2.exit:
+    RET_ReallyLR
+
+...

>From a896a9581b3086879e7b1efec289e4c2230f6f48 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 15 Sep 2023 12:48:41 +0100
Subject: [PATCH 6/9] [AArch64] Stack probing for dynamic allocas in
 SelectionDAG

Change-Id: I1ef19ce40702a789d220c4bbfd5560220fa329f5
Co-authored-by: Oliver Stannard <oliver.stannard at linaro.org>
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  23 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 160 ++++++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  13 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +
 .../CodeGen/AArch64/stack-probing-dynamic.ll  | 320 ++++++++++++++++++
 5 files changed, 473 insertions(+), 57 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 9c710ebd85862a1..a9a0a5aea2cf8a1 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -487,6 +487,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
     MachineBasicBlock::iterator I) const {
   const AArch64InstrInfo *TII =
       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
   DebugLoc DL = I->getDebugLoc();
   unsigned Opc = I->getOpcode();
   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
@@ -513,8 +516,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
       // Most call frames will be allocated at the start of a function so
       // this is OK, but it is a limitation that needs dealing with.
       assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
-      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
-                      StackOffset::getFixed(Amount), TII);
+
+      if (TLI->hasInlineStackProbe(MF) &&
+          -Amount >= AArch64::StackProbeMaxUnprobedStack) {
+        // When stack probing is enabled, the decrement of SP may need to be
+        // probed. We only need to do this if the call site needs 1024 bytes of
+        // space or more, because a region smaller than that is allowed to be
+        // unprobed at an ABI boundary. We rely on the fact that SP has been
+        // probed exactly at this point, either by the prologue or most recent
+        // dynamic allocation.
+        assert(MFI.hasVarSizedObjects() &&
+               "non-reserved call frame without var sized objects?");
+        Register ScratchReg =
+            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+        inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
+      } else {
+        emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+                        StackOffset::getFixed(Amount), TII);
+      }
     }
   } else if (CalleePopAmount != 0) {
     // If the calling convention demands that the callee pops arguments from the
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 53b32513444d069..70a75578c25de1e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -568,10 +568,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FSHL, MVT::i32, Custom);
   setOperationAction(ISD::FSHL, MVT::i64, Custom);
 
-  if (Subtarget->isTargetWindows())
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
-  else
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
 
   // Constant pool entries
   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -2352,6 +2349,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CSINC)
     MAKE_CASE(AArch64ISD::THREAD_POINTER)
     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+    MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
     MAKE_CASE(AArch64ISD::ABDS_PRED)
     MAKE_CASE(AArch64ISD::ABDU_PRED)
     MAKE_CASE(AArch64ISD::HADDS_PRED)
@@ -2712,6 +2710,28 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
+                                              MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MBB->findDebugLoc(MBBI);
+  const AArch64InstrInfo &TII =
+      *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  Register TargetReg = MI.getOperand(0).getReg();
+  MachineBasicBlock::iterator NextInst =
+      TII.insertStackProbingLoop(MBBI, AArch64::SP, TargetReg);
+
+  // MOV SP, TargetReg
+  BuildMI(*NextInst->getParent(), std::next(NextInst), DL,
+          TII.get(AArch64::ADDXri), AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(0)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  MI.eraseFromParent();
+  return NextInst->getParent();
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                     MachineInstr &MI,
@@ -2840,6 +2860,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
 
   case AArch64::CATCHRET:
     return EmitLoweredCatchRet(MI, BB);
+
+  case AArch64::PROBED_STACKALLOC_DYN:
+    return EmitDynamicProbedAlloc(MI, BB);
+
   case AArch64::LD1_MXIPXX_H_PSEUDO_B:
     return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
   case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -13907,9 +13931,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                        AN->getMemOperand());
 }
 
-SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
-    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+
   SDLoc dl(Op);
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  EVT VT = Node->getValueType(0);
+
+  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+          "no-stack-arg-probe")) {
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+    Chain = SP.getValue(1);
+    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+    if (Align)
+      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+    SDValue Ops[2] = {SP, Chain};
+    return DAG.getMergeValues(Ops, dl);
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
                                                PtrVT, 0);
@@ -13933,7 +13982,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
 
   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
                      DAG.getConstant(4, dl, MVT::i64));
-  return Chain;
+
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  SDLoc dl(Op);
+  EVT VT = Node->getValueType(0);
+
+  // Construct the new SP value in a GPR.
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+
+  // Set the real SP to the new value with a probing loop.
+  Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  if (Subtarget->isTargetWindows())
+    return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
+  else if (hasInlineStackProbe(MF))
+    return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
+  else
+    return SDValue();
 }
 
 // When x and y are extended, lower:
@@ -13987,51 +14088,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
 }
 
-SDValue
-AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetWindows() &&
-         "Only Windows alloca probing supported");
-  SDLoc dl(Op);
-  // Get the inputs.
-  SDNode *Node = Op.getNode();
-  SDValue Chain = Op.getOperand(0);
-  SDValue Size = Op.getOperand(1);
-  MaybeAlign Align =
-      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
-  EVT VT = Node->getValueType(0);
-
-  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
-          "no-stack-arg-probe")) {
-    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
-    Chain = SP.getValue(1);
-    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
-    if (Align)
-      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
-                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
-    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-    SDValue Ops[2] = {SP, Chain};
-    return DAG.getMergeValues(Ops, dl);
-  }
-
-  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
-
-  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
-
-  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
-  Chain = SP.getValue(1);
-  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
-  if (Align)
-    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
-                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
-  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-
-  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
-
-  SDValue Ops[2] = {SP, Chain};
-  return DAG.getMergeValues(Ops, dl);
-}
-
 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
                                            SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 5d7f315d84928af..e8591860689d934 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -83,6 +83,10 @@ enum NodeType : unsigned {
   ADC,
   SBC, // adc, sbc instructions
 
+  // To avoid stack clash, allocation is performed by block and each block is
+  // probed.
+  PROBED_ALLOCA,
+
   // Predicated instructions where inactive lanes produce undefined results.
   ABDS_PRED,
   ABDU_PRED,
@@ -608,6 +612,9 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) const;
+
   MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                   MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
@@ -1116,10 +1123,10 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
-                                         SDValue &Size,
-                                         SelectionDAG &DAG) const;
+
   SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
 
   SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 7100ba3d9c061f4..74f10375ec8c015 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -861,6 +861,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain,
 def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
+
+def AArch64probedalloca
+    : SDNode<"AArch64ISD::PROBED_ALLOCA",
+             SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+             [SDNPHasChain]>;
+
 def AArch64mrs : SDNode<"AArch64ISD::MRS",
                         SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
                         [SDNPHasChain, SDNPOutGlue]>;
@@ -961,6 +967,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs),
                                    []>,
                                    Sched<[]>;
 
+// Probed stack allocations of a variable size, used for allocas of unknown size
+// when stack-clash protection is enabled.
+let usesCustomInserter = 1 in
+def PROBED_STACKALLOC_DYN : Pseudo<(outs),
+                                   (ins GPR64common:$target),
+                                   [(AArch64probedalloca GPR64common:$target)]>,
+                                   Sched<[]>;
+
 } // Defs = [SP, NZCV], Uses = [SP] in 
 } // hasSideEffects = 1, isCodeGenOnly = 1
 
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
new file mode 100644
index 000000000000000..561ab81059b9dd7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+
+; Dynamically-sized allocation, needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack, and also doesn't probe below the target SP value.
+define void @dynamic(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB0_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed size
+; slot isn't large enough that we would normally probe it, but we need to do so
+; here otherwise the gap between the CSR save and the first probe of the
+; dynamic allocation could be too far apart when the size of the dynamic
+; allocation is close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+; CHECK-LABEL: dynamic_fixed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    str xzr, [sp, #-64]!
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    sub x10, x29, #64
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    str x10, [x1]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB1_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB1_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x2]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v1 = alloca i8, i64 64, align 1
+  store ptr %v1, ptr %out1, align 8
+  %v2 = alloca i8, i64 %size, align 1
+  store ptr %v2, ptr %out2, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by ANDing the target SP with a constant to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the prolog,
+; which isn't actually needed because the only aligned allocations are dynamic,
+; this is done even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub x9, sp, #32
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffc0
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffffc0
+; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB2_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 64
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference to the dynamic allocation is the constant used for aligning
+; the target SP, the loop will probe the whole allocation without needing to
+; know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_8192:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4064
+; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB3_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB3_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_1
+; CHECK-NEXT:  .LBB3_3:
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB3_6
+; CHECK-NEXT:  // %bb.5: // in Loop: Header=BB3_4 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_4
+; CHECK-NEXT:  .LBB3_6:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 8192
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; For 64k guard pages, the only difference is the constant subtracted from SP
+; in the loop.
+define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: dynamic_64k_guard:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB4_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB4_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB4_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB4_1
+; CHECK-NEXT:  .LBB4_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame. We need to probe
+; that space when it is larger than the unprobed space allowed by the ABI (1024
+; bytes), so this needs a very large number of arguments.
+define void @no_reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: no_reserved_call_frame:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    lsl x9, x0, #2
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    add x9, x9, #15
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x0, x8, x9
+; CHECK-NEXT:  .LBB5_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x0
+; CHECK-NEXT:    b.le .LBB5_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB5_1
+; CHECK-NEXT:  .LBB5_3: // %entry
+; CHECK-NEXT:    str xzr, [x0]
+; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    sub sp, sp, #1104
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    bl callee_stack_args
+; CHECK-NEXT:    add sp, sp, #1104
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i32, i64 %n
+  call void @callee_stack_args(ptr %v, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef) #2
+  ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: reserved_call_frame:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w28, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub sp, sp, #1504
+; CHECK-NEXT:    add x0, sp, #1104
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    bl callee_stack_args
+; CHECK-NEXT:    add sp, sp, #1504
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i32, i64 100
+  call void @callee_stack_args(ptr %v,
+    i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef) #2
+  ret void
+}
+
+declare void @callee_stack_args(ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64)
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }

>From bf43746bf096d3fa62551d03befa1511fc6f7008 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Sat, 28 Oct 2023 16:12:46 +0100
Subject: [PATCH 7/9] Update after the bugfix about writing below SP

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  | 11 ++---------
 .../CodeGen/AArch64/stack-probing-dynamic.ll     | 16 ++++++++--------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 70a75578c25de1e..4e9d9c39388be2e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2719,15 +2719,8 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
   const AArch64InstrInfo &TII =
       *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   Register TargetReg = MI.getOperand(0).getReg();
-  MachineBasicBlock::iterator NextInst =
-      TII.insertStackProbingLoop(MBBI, AArch64::SP, TargetReg);
-
-  // MOV SP, TargetReg
-  BuildMI(*NextInst->getParent(), std::next(NextInst), DL,
-          TII.get(AArch64::ADDXri), AArch64::SP)
-      .addReg(TargetReg)
-      .addImm(0)
-      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  MachineBasicBlock::iterator NextInst = TII.probedStackAlloc(MBBI, TargetReg);
+
   MI.eraseFromParent();
   return NextInst->getParent();
 }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
index 561ab81059b9dd7..2c6e14adb0b06ac 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -26,8 +26,8 @@ define void @dynamic(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB0_1
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -70,8 +70,8 @@ define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB1_1
 ; CHECK-NEXT:  .LBB1_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x2]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -120,8 +120,8 @@ define void @dynamic_align_64(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB2_1
 ; CHECK-NEXT:  .LBB2_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 32
@@ -163,12 +163,12 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB3_1
 ; CHECK-NEXT:  .LBB3_3:
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    add x9, x0, #15
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x19, sp
 ; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
 ; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
@@ -179,8 +179,8 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB3_4
 ; CHECK-NEXT:  .LBB3_6:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 32
@@ -219,8 +219,8 @@ define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB4_1
 ; CHECK-NEXT:  .LBB4_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -263,8 +263,8 @@ define void @no_reserved_call_frame(i64 %n) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB5_1
 ; CHECK-NEXT:  .LBB5_3: // %entry
-; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    sub sp, sp, #1104
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    bl callee_stack_args

>From baeead480815638107540cb4ceb545244c8551bd Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 9 Nov 2023 14:33:42 +0000
Subject: [PATCH 8/9] Update after the change to not always set FrameSetup
 flags

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    |  2 +-
 .../stack-probing-dynamic-no-frame-setup.ll        | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4e9d9c39388be2e..e89c593ec8afab2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2719,7 +2719,7 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
   const AArch64InstrInfo &TII =
       *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   Register TargetReg = MI.getOperand(0).getReg();
-  MachineBasicBlock::iterator NextInst = TII.probedStackAlloc(MBBI, TargetReg);
+  MachineBasicBlock::iterator NextInst = TII.probedStackAlloc(MBBI, TargetReg, false);
 
   MI.eraseFromParent();
   return NextInst->getParent();
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
new file mode 100644
index 000000000000000..96f2f63d703c7de
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
@@ -0,0 +1,14 @@
+; RUN: llc --stop-after=finalize-isel -o - | FileCheck %s
+target triple = "aarch64-linux"
+
+; Check dynamic stack allocation and probing instructions do not have
+; the FrameSetup flag.
+
+; CHECK-NOT: frame-setup
+define void @no_frame_setup(i64 %size, ptr %out) #0 {
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }

>From ce36a309cfbbe426efa3806390a78937d951a0ed Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 22 Sep 2023 13:04:05 +0100
Subject: [PATCH 9/9] Add a test for stack probing of dynamically alocated
 space for SVE vector

Change-Id: I1f9645b7ca786259bb7276806a9cfa17ba543a5a
---
 .../CodeGen/AArch64/stack-probing-dynamic.ll  | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
index 2c6e14adb0b06ac..7cf677c904fe920 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -317,4 +317,47 @@ entry:
 
 declare void @callee_stack_args(ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64)
 
+; Dynamic allocation of SVE vectors
+define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
+; CHECK-LABEL: dynamic_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    mov x10, #15 // =0xf
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    madd x9, x0, x9, x10
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB7_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB7_1
+; CHECK-NEXT:  .LBB7_3:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca <vscale x 4 x float>, i64 %size, align 16
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
 attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }