[clang] [AArch64] Stack probing for dynamic allocas in SelectionDAG (PR #66525)

Sat Oct 28 11:29:55 PDT 2023

https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/66525

>From ed580b95157d7f423c5384fa2d51af00f1359a10 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 2 Oct 2023 14:46:27 +0100
Subject: [PATCH 01/11] [CFIFixup] Allow function prologues to span more than
 one basic block

The CFIFixup pass assumes a function prologue is contained in a single
basic block. This assumption is broken with upcoming support for stack
probing (`-fstack-clash-protection`) in AArch64 - the emitted probing
sequence in a prologue may contain loops, i.e. more than one basic
block. The generated CFG is not arbitrary though:
 * CFI instructions are outside of any loops
 * for any two CFI instructions of the function prologue one dominates
   and is post-dominated by the other

Thus, for the prologue CFI instructions, if one is
executed then all are executed, there is a total order of
executions, and the last instruction in that order can be considered
the end of the prologoue for the purpose of inserting the initial
`.cfi_remember_state` directive.

That last instruction is found by finding the first block in the
post-order traversal which contains prologue CFI instructions.
---
 llvm/lib/CodeGen/CFIFixup.cpp                 |  62 ++--
 .../cfi-fixup-multi-block-prologue.mir        | 308 ++++++++++++++++++
 2 files changed, 347 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/cfi-fixup-multi-block-prologue.mir

diff --git a/llvm/lib/CodeGen/CFIFixup.cpp b/llvm/lib/CodeGen/CFIFixup.cpp
index 837dbd77d07361a..964a8d56511fa1b 100644
--- a/llvm/lib/CodeGen/CFIFixup.cpp
+++ b/llvm/lib/CodeGen/CFIFixup.cpp
@@ -10,20 +10,25 @@
 // This pass inserts the necessary  instructions to adjust for the inconsistency
 // of the call-frame information caused by final machine basic block layout.
 // The pass relies in constraints LLVM imposes on the placement of
-// save/restore points (cf. ShrinkWrap):
-// * there is a single basic block, containing the function prologue
+// save/restore points (cf. ShrinkWrap) and has certain preconditions about
+// placement of CFI instructions:
+// * for any two CFI instructions of the function prologue one dominates
+//   and is post-dominated by the other
 // * possibly multiple epilogue blocks, where each epilogue block is
 //   complete and self-contained, i.e. CSR restore instructions (and the
 //   corresponding CFI instructions are not split across two or more blocks.
-// * prologue and epilogue blocks are outside of any loops
-// Thus, during execution, at the beginning and at the end of each basic block
-// the function can be in one of two states:
+// * CFI instructions are not contained in any loops
+// Thus, during execution, at the beginning and at the end of each basic block,
+// following the prologue, the function can be in one of two states:
 //  - "has a call frame", if the function has executed the prologue, and
 //    has not executed any epilogue
 //  - "does not have a call frame", if the function has not executed the
 //    prologue, or has executed an epilogue
 // which can be computed by a single RPO traversal.
 
+// The location of the prologue is determined by finding the first block in the
+// post-order traversal which contains CFI instructions.
+
 // In order to accommodate backends which do not generate unwind info in
 // epilogues we compute an additional property "strong no call frame on entry",
 // which is set for the entry point of the function and for every block
@@ -85,10 +90,6 @@ static bool isPrologueCFIInstruction(const MachineInstr &MI) {
          MI.getFlag(MachineInstr::FrameSetup);
 }
 
-static bool containsPrologue(const MachineBasicBlock &MBB) {
-  return llvm::any_of(MBB.instrs(), isPrologueCFIInstruction);
-}
-
 static bool containsEpilogue(const MachineBasicBlock &MBB) {
   return llvm::any_of(llvm::reverse(MBB), [](const auto &MI) {
     return MI.getOpcode() == TargetOpcode::CFI_INSTRUCTION &&
@@ -96,6 +97,25 @@ static bool containsEpilogue(const MachineBasicBlock &MBB) {
   });
 }
 
+static MachineBasicBlock *
+findPrologueEnd(MachineFunction &MF, MachineBasicBlock::iterator &PrologueEnd) {
+  MachineBasicBlock *PrologueBlock = nullptr;
+  for (auto It = po_begin(&MF.front()), End = po_end(&MF.front()); It != End;
+       ++It) {
+    MachineBasicBlock *MBB = *It;
+    llvm::for_each(MBB->instrs(), [&](MachineInstr &MI) {
+      if (isPrologueCFIInstruction(MI)) {
+        PrologueBlock = MBB;
+        PrologueEnd = std::next(MI.getIterator());
+      }
+    });
+    if (PrologueBlock)
+      return PrologueBlock;
+  }
+
+  return nullptr;
+}
+
 bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
   const TargetFrameLowering &TFL = *MF.getSubtarget().getFrameLowering();
   if (!TFL.enableCFIFixup(MF))
@@ -105,6 +125,14 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
   if (NumBlocks < 2)
     return false;
 
+  // Find the prologue and the point where we can issue the first
+  // `.cfi_remember_state`.
+
+  MachineBasicBlock::iterator PrologueEnd;
+  MachineBasicBlock *PrologueBlock = findPrologueEnd(MF, PrologueEnd);
+  if (PrologueBlock == nullptr)
+    return false;
+
   struct BlockFlags {
     bool Reachable : 1;
     bool StrongNoFrameOnEntry : 1;
@@ -116,21 +144,15 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
   BlockInfo[0].StrongNoFrameOnEntry = true;
 
   // Compute the presence/absence of frame at each basic block.
-  MachineBasicBlock *PrologueBlock = nullptr;
   ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
   for (MachineBasicBlock *MBB : RPOT) {
     BlockFlags &Info = BlockInfo[MBB->getNumber()];
 
     // Set to true if the current block contains the prologue or the epilogue,
     // respectively.
-    bool HasPrologue = false;
+    bool HasPrologue = MBB == PrologueBlock;
     bool HasEpilogue = false;
 
-    if (!PrologueBlock && !Info.HasFrameOnEntry && containsPrologue(*MBB)) {
-      PrologueBlock = MBB;
-      HasPrologue = true;
-    }
-
     if (Info.HasFrameOnEntry || HasPrologue)
       HasEpilogue = containsEpilogue(*MBB);
 
@@ -149,9 +171,6 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  if (!PrologueBlock)
-    return false;
-
   // Walk the blocks of the function in "physical" order.
   // Every block inherits the frame state (as recorded in the unwind tables)
   // of the previous block. If the intended frame state is different, insert
@@ -162,10 +181,7 @@ bool CFIFixup::runOnMachineFunction(MachineFunction &MF) {
   // insert a `.cfi_remember_state`, in the case that the current block needs a
   // `.cfi_restore_state`.
   MachineBasicBlock *InsertMBB = PrologueBlock;
-  MachineBasicBlock::iterator InsertPt = PrologueBlock->begin();
-  for (MachineInstr &MI : *PrologueBlock)
-    if (isPrologueCFIInstruction(MI))
-      InsertPt = std::next(MI.getIterator());
+  MachineBasicBlock::iterator InsertPt = PrologueEnd;
 
   assert(InsertPt != PrologueBlock->begin() &&
          "Inconsistent notion of \"prologue block\"");
diff --git a/llvm/test/CodeGen/AArch64/cfi-fixup-multi-block-prologue.mir b/llvm/test/CodeGen/AArch64/cfi-fixup-multi-block-prologue.mir
new file mode 100644
index 000000000000000..31fa3832367becc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cfi-fixup-multi-block-prologue.mir
@@ -0,0 +1,308 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -run-pass=cfi-fixup %s -o - | FileCheck %s
+--- |
+  source_filename = "cfi-fixup.ll"
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-linux"
+
+  define i32 @f(i32 %x) #0 {
+  entry:
+    %p = alloca i8, i32 30000, align 1
+    switch i32 %x, label %if.end7 [
+      i32 0, label %return
+      i32 1, label %if.then2
+      i32 2, label %if.then5
+    ]
+
+  if.then2:                                         ; preds = %entry
+    %call = tail call i32 @g1(i32 1)
+    %add = add nsw i32 %call, 1
+    br label %return
+
+  if.then5:                                         ; preds = %entry
+    %call6 = tail call i32 @g0(i32 2)
+    %sub = sub nsw i32 1, %call6
+    br label %return
+
+  if.end7:                                          ; preds = %entry
+    br label %return
+
+  return:                                           ; preds = %if.end7, %if.then5, %if.then2, %entry
+    %retval.0 = phi i32 [ %add, %if.then2 ], [ %sub, %if.then5 ], [ 0, %if.end7 ], [ 1, %entry ]
+    ret i32 %retval.0
+  }
+
+  declare i32 @g1(i32)
+
+  declare i32 @g0(i32)
+
+  attributes #0 = { uwtable "probe-stack"="inline-asm" }
+
+...
+---
+name:            f
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHCatchret:   false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       30016
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  localFrameSize:  30000
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: p, type: default, offset: -30016, size: 30000, alignment: 1,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -30000, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: default, callee-saved-register: '$fp', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  hasRedZone:      false
+body:             |
+  ; CHECK-LABEL: name: f
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $w0, $lr, $fp
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1)
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $w30, -8
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION offset $w29, -16
+  ; CHECK-NEXT:   $x9 = frame-setup SUBXri $sp, 7, 12
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa $w9, 28688
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $x9
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $sp = frame-setup SUBXri $sp, 1, 12
+  ; CHECK-NEXT:   $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv
+  ; CHECK-NEXT:   frame-setup STRXui $xzr, $sp, 0
+  ; CHECK-NEXT:   frame-setup Bcc 1, %bb.1, implicit killed $nzcv
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.entry:
+  ; CHECK-NEXT:   successors: %bb.6(0x20000000), %bb.3(0x60000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_register $wsp
+  ; CHECK-NEXT:   $sp = frame-setup SUBXri $sp, 1328, 0
+  ; CHECK-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 30016
+  ; CHECK-NEXT:   CFI_INSTRUCTION remember_state
+  ; CHECK-NEXT:   frame-setup STRXui $xzr, $sp, 0
+  ; CHECK-NEXT:   CBZW renamable $w0, %bb.6
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.entry:
+  ; CHECK-NEXT:   successors: %bb.7(0x2aaaaaab), %bb.4(0x55555555)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri renamable $w0, 2, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 0, %bb.7, implicit killed $nzcv
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.entry:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.8(0x40000000)
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead $wzr = SUBSWri renamable $w0, 1, 0, implicit-def $nzcv
+  ; CHECK-NEXT:   Bcc 1, %bb.8, implicit killed $nzcv
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5.if.then2:
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   BL @g1, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   renamable $w0 = nsw ADDWri killed renamable $w0, 1, 0
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 7, 12
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 1344
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 1328, 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w30
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w29
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   CFI_INSTRUCTION restore_state
+  ; CHECK-NEXT:   CFI_INSTRUCTION remember_state
+  ; CHECK-NEXT:   renamable $w0 = MOVZWi 1, 0
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 7, 12
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 1344
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 1328, 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w30
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w29
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7.if.then5:
+  ; CHECK-NEXT:   liveins: $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CFI_INSTRUCTION restore_state
+  ; CHECK-NEXT:   CFI_INSTRUCTION remember_state
+  ; CHECK-NEXT:   BL @g0, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0
+  ; CHECK-NEXT:   renamable $w8 = MOVZWi 1, 0
+  ; CHECK-NEXT:   $w0 = SUBWrs killed renamable $w8, killed renamable $w0, 0
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 7, 12
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 1344
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 1328, 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w30
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w29
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8.if.end7:
+  ; CHECK-NEXT:   CFI_INSTRUCTION restore_state
+  ; CHECK-NEXT:   $w0 = ORRWrs $wzr, $wzr, 0
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 7, 12
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 1344
+  ; CHECK-NEXT:   $sp = frame-destroy ADDXri $sp, 1328, 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+  ; CHECK-NEXT:   early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w30
+  ; CHECK-NEXT:   frame-destroy CFI_INSTRUCTION restore $w29
+  ; CHECK-NEXT:   RET undef $lr, implicit killed $w0
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $w0, $lr, $fp
+
+    early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.2), (store (s64) into %stack.1)
+    frame-setup CFI_INSTRUCTION def_cfa_offset 16
+    frame-setup CFI_INSTRUCTION offset $w30, -8
+    frame-setup CFI_INSTRUCTION offset $w29, -16
+    $x9 = frame-setup SUBXri $sp, 7, 12
+    frame-setup CFI_INSTRUCTION def_cfa $w9, 28688
+
+  bb.1.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $x9
+
+    $sp = frame-setup SUBXri $sp, 1, 12
+    $xzr = frame-setup SUBSXrx64 $sp, $x9, 24, implicit-def $nzcv
+    frame-setup STRXui $xzr, $sp, 0
+    frame-setup Bcc 1, %bb.1, implicit killed $nzcv
+
+  bb.2.entry:
+    successors: %bb.6(0x20000000), %bb.3(0x60000000)
+    liveins: $w0
+
+    frame-setup CFI_INSTRUCTION def_cfa_register $wsp
+    $sp = frame-setup SUBXri $sp, 1328, 0
+    frame-setup CFI_INSTRUCTION def_cfa_offset 30016
+    frame-setup STRXui $xzr, $sp, 0
+    CBZW renamable $w0, %bb.6
+
+  bb.3.entry:
+    successors: %bb.7(0x2aaaaaab), %bb.4(0x55555555)
+    liveins: $w0
+
+    dead $wzr = SUBSWri renamable $w0, 2, 0, implicit-def $nzcv
+    Bcc 0, %bb.7, implicit killed $nzcv
+
+  bb.4.entry:
+    successors: %bb.5(0x40000000), %bb.8(0x40000000)
+    liveins: $w0
+
+    dead $wzr = SUBSWri renamable $w0, 1, 0, implicit-def $nzcv
+    Bcc 1, %bb.8, implicit killed $nzcv
+
+  bb.5.if.then2:
+    liveins: $w0
+
+    BL @g1, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0
+    renamable $w0 = nsw ADDWri killed renamable $w0, 1, 0
+    $sp = frame-destroy ADDXri $sp, 7, 12
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 1344
+    $sp = frame-destroy ADDXri $sp, 1328, 0
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+    frame-destroy CFI_INSTRUCTION restore $w30
+    frame-destroy CFI_INSTRUCTION restore $w29
+    RET undef $lr, implicit killed $w0
+
+  bb.6:
+    renamable $w0 = MOVZWi 1, 0
+    $sp = frame-destroy ADDXri $sp, 7, 12
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 1344
+    $sp = frame-destroy ADDXri $sp, 1328, 0
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+    frame-destroy CFI_INSTRUCTION restore $w30
+    frame-destroy CFI_INSTRUCTION restore $w29
+    RET undef $lr, implicit killed $w0
+
+  bb.7.if.then5:
+    liveins: $w0
+
+    BL @g0, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit-def $sp, implicit-def $w0
+    renamable $w8 = MOVZWi 1, 0
+    $w0 = SUBWrs killed renamable $w8, killed renamable $w0, 0
+    $sp = frame-destroy ADDXri $sp, 7, 12
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 1344
+    $sp = frame-destroy ADDXri $sp, 1328, 0
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+    frame-destroy CFI_INSTRUCTION restore $w30
+    frame-destroy CFI_INSTRUCTION restore $w29
+    RET undef $lr, implicit killed $w0
+
+  bb.8.if.end7:
+    $w0 = ORRWrs $wzr, $wzr, 0
+    $sp = frame-destroy ADDXri $sp, 7, 12
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 1344
+    $sp = frame-destroy ADDXri $sp, 1328, 0
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+    early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.1)
+    frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+    frame-destroy CFI_INSTRUCTION restore $w30
+    frame-destroy CFI_INSTRUCTION restore $w29
+    RET undef $lr, implicit killed $w0
+
+...

>From c0c2083d623f56c3929c0101d8b1f1a85d0c888a Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Sat, 28 Oct 2023 13:33:28 +0100
Subject: [PATCH 02/11] Reverse iteration within a block when looking for
 prologue CFI insns

---
 llvm/lib/CodeGen/CFIFixup.cpp | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/CodeGen/CFIFixup.cpp b/llvm/lib/CodeGen/CFIFixup.cpp
index 964a8d56511fa1b..40a2a3a142e1758 100644
--- a/llvm/lib/CodeGen/CFIFixup.cpp
+++ b/llvm/lib/CodeGen/CFIFixup.cpp
@@ -99,20 +99,16 @@ static bool containsEpilogue(const MachineBasicBlock &MBB) {
 
 static MachineBasicBlock *
 findPrologueEnd(MachineFunction &MF, MachineBasicBlock::iterator &PrologueEnd) {
-  MachineBasicBlock *PrologueBlock = nullptr;
   for (auto It = po_begin(&MF.front()), End = po_end(&MF.front()); It != End;
        ++It) {
     MachineBasicBlock *MBB = *It;
-    llvm::for_each(MBB->instrs(), [&](MachineInstr &MI) {
-      if (isPrologueCFIInstruction(MI)) {
-        PrologueBlock = MBB;
-        PrologueEnd = std::next(MI.getIterator());
-      }
-    });
-    if (PrologueBlock)
-      return PrologueBlock;
+    for (MachineInstr &MI : reverse(MBB->instrs())) {
+      if (!isPrologueCFIInstruction(MI))
+        continue;
+      PrologueEnd = std::next(MI.getIterator());
+      return MBB;
+    }
   }
-
   return nullptr;
 }
 

>From 2440b603aa8307364a4897e6782a60c2738c5912 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Wed, 11 Oct 2023 17:22:51 +0100
Subject: [PATCH 03/11] [clang][AArch64] Pass down stack clash protection
 options to LLVM/Backend

---
 clang/lib/CodeGen/CodeGenModule.cpp         | 12 +++++++++++-
 clang/lib/Driver/ToolChains/Clang.cpp       |  2 +-
 clang/test/CodeGen/stack-clash-protection.c | 16 ++++++++++++----
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index b1a6683a66bd052..517021cb3c89bf1 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1076,6 +1076,16 @@ void CodeGenModule::Release() {
                                 "sign-return-address-with-bkey", 1);
   }
 
+  if (Arch == llvm::Triple::aarch64 || Arch == llvm::Triple::aarch64_be) {
+    auto *InlineAsm = llvm::MDString::get(TheModule.getContext(), "inline-asm");
+    if (CodeGenOpts.StackClashProtector)
+      getModule().addModuleFlag(llvm::Module::Override, "probe-stack",
+                                InlineAsm);
+    if (CodeGenOpts.StackProbeSize && CodeGenOpts.StackProbeSize != 4096)
+      getModule().addModuleFlag(llvm::Module::Min, "stack-probe-size",
+                                CodeGenOpts.StackProbeSize);
+  }
+
   if (!CodeGenOpts.MemoryProfileOutput.empty()) {
     llvm::LLVMContext &Ctx = TheModule.getContext();
     getModule().addModuleFlag(
@@ -2287,7 +2297,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
   if ((!D || !D->hasAttr<NoUwtableAttr>()) && CodeGenOpts.UnwindTables)
     B.addUWTableAttr(llvm::UWTableKind(CodeGenOpts.UnwindTables));
 
-  if (CodeGenOpts.StackClashProtector)
+  if (CodeGenOpts.StackClashProtector && !getTarget().getTriple().isAArch64())
     B.addAttribute("probe-stack", "inline-asm");
 
   if (!hasUnwindExceptions(LangOpts))
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 43a92adbef64ba8..83a6e679fa19a6d 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3507,7 +3507,7 @@ static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args,
     return;
 
   if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() &&
-      !EffectiveTriple.isPPC64())
+      !EffectiveTriple.isPPC64() && !EffectiveTriple.isAArch64())
     return;
 
   Args.addOptInFlag(CmdArgs, options::OPT_fstack_clash_protection,
diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c
index 67571f5cdb2c14c..2f502ef453d42f4 100644
--- a/clang/test/CodeGen/stack-clash-protection.c
+++ b/clang/test/CodeGen/stack-clash-protection.c
@@ -1,10 +1,12 @@
 // Check the correct function attributes are generated
-// RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
-// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
-// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
-// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple s390x-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -O0 -S -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s --check-prefixes CHECK-AARCH64
 
 // CHECK: define{{.*}} void @large_stack() #[[A:.*]] {
+// CHECK-AARCH64: define{{.*}} void @large_stack() #[[A:.*]] {
 void large_stack(void) {
   volatile int stack[20000], i;
   for (i = 0; i < sizeof(stack) / sizeof(int); ++i)
@@ -12,14 +14,20 @@ void large_stack(void) {
 }
 
 // CHECK: define{{.*}} void @vla({{.*}}) #[[A:.*]] {
+// CHECK-AARCH64: define{{.*}} void @vla({{.*}}) #[[A:.*]] {
 void vla(int n) {
   volatile int vla[n];
   __builtin_memset(&vla[0], 0, 1);
 }
 
 // CHECK: define{{.*}} void @builtin_alloca({{.*}}) #[[A:.*]] {
+// CHECK-AARCH64: define{{.*}} void @builtin_alloca({{.*}}) #[[A:.*]] {
 void builtin_alloca(int n) {
   volatile void *mem = __builtin_alloca(n);
 }
 
 // CHECK: attributes #[[A]] = {{.*}} "probe-stack"="inline-asm"
+// CHECK-AARCH64-NOT: attributes #[[A]] = {{.*}} "probe-stack"
+
+// CHECK-AARCH64: !{i32 4, !"probe-stack", !"inline-asm"}
+// CHECK-AARCH64: !{i32 8, !"stack-probe-size", i32 8192}

>From c053ef7155c05368982053667bdae3fa1adec850 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 15 Sep 2023 12:46:44 +0100
Subject: [PATCH 04/11] [AArch64] Stack probing for function prologues

This adds code to AArch64 function prologues to protect against stack
clash attacks by probing (writing to) the stack at regular enough
intervals to ensure that the guard page cannot be skipped over.

The patch depends on and maintains the following invariants:

* Upon function entry the caller guarantees that it has probed the stack
  (e.g. performed a store) at some address [sp, #N],
  where`0 <= N <= 1024`. This invariant comes from a requirement for
  compatibility with GCC.

* Any address range in the allocated stack, no smaller than
  stack-probe-size bytes contains at least one probe

* At any time the stack pointer is above or in the guard page

* Probes are performed in descreasing address order

The `stack-probe-size` is a function attribute that can be set by a
platform to correspond to the guard page size. By default, the stack
probe size is 4KiB, which is a safe default as this is the smallest
possible page size for AArch64. Linux uses a 64KiB guard for AArch64, so
this can be overridden by the `stack-probe-size` function attribute.

For small frames without a frame pointer (<= 240 bytes), no probes are
needed.

For larger frame sizes, LLVM always stores `x29` to the stack. This serves
as an implicit stack probe. Thus, while allocating stack objects the
compiler assumes that the stack has been probed at `[sp]`.

There are multiple probing sequences that can be emitted, depending on
the size of the stack allocation:

* A straight-line sequence of subtracts and stores, used when the
  allocation size is smaller than 5 guard pages.

* A loop allocating and probing one page size per iteration, plus at
  most a single probe to deal with the remainder, used when the
  allocation size is larger but still known at compile time.

* A loop which moves the SP down to the target value held in a register,
  used when the allocation size is not known at compile-time, such as
  when allocating space for SVE values, or when over-aligning the stack.
  This is emitted in AArch64InstrInfo because it will also be used for
  dynamic allocas in a future patch.

* A single probe where the amount of stack adjustment is unknown, but is
  known to be less than or equal to the page size.

Change-Id: Ib31bc23d2806835bc6da822599f2bd2ecc18e4bf
Co-authored-by: Oliver Stannard <oliver.stannard at linaro.org>
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 392 ++++++++++-
 .../lib/Target/AArch64/AArch64FrameLowering.h |  23 +
 .../Target/AArch64/AArch64ISelLowering.cpp    |  34 +
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  14 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  93 ++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   4 +
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  21 +-
 llvm/test/CodeGen/AArch64/framelayout-sve.mir |   4 +-
 .../AArch64/spill-stack-realignment.mir       |   2 +-
 .../test/CodeGen/AArch64/stack-probing-64k.ll | 392 +++++++++++
 .../test/CodeGen/AArch64/stack-probing-sve.ll | 661 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/stack-probing.ll    | 474 +++++++++++++
 12 files changed, 2077 insertions(+), 37 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-64k.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-sve.ll
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 880de7d0306a7e1..338fb28b447c158 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -296,6 +296,7 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
 static bool produceCompactUnwindFrame(MachineFunction &MF);
 static bool needsWinCFI(const MachineFunction &MF);
 static StackOffset getSVEStackSize(const MachineFunction &MF);
+static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB);
 
 /// Returns true if a homogeneous prolog or epilog code can be emitted
 /// for the size optimization. If possible, a frame helper call is injected.
@@ -688,6 +689,74 @@ void AArch64FrameLowering::emitCalleeSavedSVERestores(
   emitCalleeSavedRestores(MBB, MBBI, true);
 }
 
+void AArch64FrameLowering::allocateSVEStackSpace(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    StackOffset AllocSize, StackOffset InitialOffset, bool EmitCFI) const {
+  DebugLoc DL;
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
+  const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+
+  // If not probing the stack or the (uknown) allocation size is less than the
+  // probe size decrement the stack pointer right away. This avoids having to
+  // emit a probing loop when allocating space for up to 16 SVE registers when
+  // using 4k probes.
+
+  // The bit-length of SVE registers is architecturally limited.
+  const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
+  int64_t ProbeSize = TLI.getStackProbeSize(MF);
+  if (!TLI.hasInlineStackProbe(MF) ||
+      AllocSize.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE +
+              AllocSize.getFixed() <=
+          ProbeSize) {
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -AllocSize, &TII,
+                    MachineInstr::FrameSetup, false, false, nullptr, EmitCFI,
+                    InitialOffset);
+    if (TLI.hasInlineStackProbe(MF)) {
+      // Issue a probe at the top of the stack to prepare for subsequent
+      // allocations.
+      // STR XZR, [TargetReg]
+      BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+    return;
+  }
+
+  // If we can't be sure the allocation size if less than the probe size, we
+  // have to emit a stack probing loop.
+  Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+  assert(ScratchReg != AArch64::NoRegister);
+  // Get the new top of the stack into a scratch register.
+  emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII,
+                  MachineInstr::FrameSetup, false, false, nullptr, EmitCFI,
+                  InitialOffset);
+  // Arrange to emit a probing loop by decrementing SP until it reaches that
+  // new top of the stack.
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR), AArch64::SP)
+      .addReg(ScratchReg);
+  // Set SP to its new value.
+  // MOV SP, Xs
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::ADDXri), AArch64::SP)
+      .addReg(ScratchReg)
+      .addImm(0)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+  if (EmitCFI) {
+    // Set the CFA register back to SP.
+    unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+}
+
 static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
   switch (Reg.id()) {
   default:
@@ -866,9 +935,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
   MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
   const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
 
-  // Don't need a scratch register if we're not going to re-align the stack.
-  if (!RegInfo->hasStackRealignment(*MF))
+  // Don't need a scratch register if we're not going to re-align the stack or
+  // emit stack probes.
+  if (!RegInfo->hasStackRealignment(*MF) && TLI->hasInlineStackProbe(*MF))
     return true;
   // Otherwise, we can use any block as long as it has a scratch register
   // available.
@@ -1391,6 +1462,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   const Function &F = MF.getFunction();
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering();
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -1774,12 +1846,14 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     }
   }
 
-  StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
+  StackOffset SVECalleeSavedSize = {}, SVELocalsSize = SVEStackSize;
   MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
 
   // Process the SVE callee-saves to determine what space needs to be
   // allocated.
   if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
+    LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
+                      << "\n");
     // Find callee save instructions in frame.
     CalleeSavesBegin = MBBI;
     assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
@@ -1787,33 +1861,40 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       ++MBBI;
     CalleeSavesEnd = MBBI;
 
-    AllocateBefore = StackOffset::getScalable(CalleeSavedSize);
-    AllocateAfter = SVEStackSize - AllocateBefore;
+    SVECalleeSavedSize = StackOffset::getScalable(CalleeSavedSize);
+    SVELocalsSize = SVEStackSize - SVECalleeSavedSize;
+
+    // Allocate space for the SVE callee saves.
+    if (SVECalleeSavedSize) {
+      allocateSVEStackSpace(
+          MBB, CalleeSavesBegin, SVECalleeSavedSize,
+          StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes),
+          EmitAsyncCFI && !HasFP);
+      if (EmitAsyncCFI)
+        emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
+    }
   }
 
-  // Allocate space for the callee saves (if any).
-  emitFrameOffset(
-      MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, -AllocateBefore, TII,
-      MachineInstr::FrameSetup, false, false, nullptr,
-      EmitAsyncCFI && !HasFP && AllocateBefore,
-      StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
-
-  if (EmitAsyncCFI)
-    emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
-
-  // Finally allocate remaining SVE stack space.
-  emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
-                  -AllocateAfter, TII, MachineInstr::FrameSetup, false, false,
-                  nullptr, EmitAsyncCFI && !HasFP && AllocateAfter,
-                  AllocateBefore + StackOffset::getFixed(
-                                       (int64_t)MFI.getStackSize() - NumBytes));
+  // Allocate stack space for the local SVE objects.
+  if (SVELocalsSize)
+    allocateSVEStackSpace(
+        MBB, CalleeSavesEnd, SVELocalsSize,
+        SVECalleeSavedSize +
+            StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes),
+        EmitAsyncCFI && !HasFP);
 
   // Allocate space for the rest of the frame.
   if (NumBytes) {
     unsigned scratchSPReg = AArch64::SP;
+    bool NeedsStackProbe = TLI.hasInlineStackProbe(MF) &&
+                           (NumBytes > AArch64::StackProbeMaxUnprobedStack ||
+                            MFI.hasVarSizedObjects());
 
     if (NeedsRealignment) {
       scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
+      NeedsStackProbe |= TLI.hasInlineStackProbe(MF) &&
+                         (NumBytes + MFI.getMaxAlign().value()) >
+                             AArch64::StackProbeMaxUnprobedStack;
       assert(scratchSPReg != AArch64::NoRegister);
     }
 
@@ -1822,12 +1903,36 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
       // the correct value here, as NumBytes also includes padding bytes,
       // which shouldn't be counted here.
-      emitFrameOffset(
-          MBB, MBBI, DL, scratchSPReg, AArch64::SP,
-          StackOffset::getFixed(-NumBytes), TII, MachineInstr::FrameSetup,
-          false, NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
+      StackOffset CFAOffset =
           SVEStackSize +
-              StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes));
+          StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
+      if (NeedsStackProbe && !NeedsRealignment) {
+        // If we don't need to re-align the stack, we can use a more efficient
+        // sequence for stack probing.
+        Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
+        assert(ScratchReg != AArch64::NoRegister);
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC))
+            .addDef(ScratchReg)
+            .addImm(NumBytes)
+            .addImm(CFAOffset.getFixed())
+            .addImm(CFAOffset.getScalable());
+        // The fixed allocation may leave unprobed bytes at the top of the
+        // stack. If we have variable-sized objects, we need to issue an extra
+        // probe, so their allocations starts in a known state.
+        if (MFI.hasVarSizedObjects()) {
+          // STR XZR, [SP]
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui))
+              .addReg(AArch64::XZR)
+              .addReg(AArch64::SP)
+              .addImm(0)
+              .setMIFlags(MachineInstr::FrameSetup);
+        }
+      } else {
+        emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP,
+                        StackOffset::getFixed(-NumBytes), TII,
+                        MachineInstr::FrameSetup, false, NeedsWinCFI,
+                        &HasWinCFI, EmitAsyncCFI && !HasFP, CFAOffset);
+      }
     }
     if (NeedsRealignment) {
       assert(MFI.getMaxAlign() > Align(1));
@@ -1836,12 +1941,48 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // SUB X9, SP, NumBytes
       //   -- X9 is temporary register, so shouldn't contain any live data here,
       //   -- free to use. This is already produced by emitFrameOffset above.
-      // AND SP, X9, 0b11111...0000
-      uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
 
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
-          .addReg(scratchSPReg, RegState::Kill)
-          .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64));
+      const uint64_t MaxAlign = MFI.getMaxAlign().value();
+      const uint64_t AndMask = ~(MaxAlign - 1);
+
+      if (NeedsStackProbe) {
+        // If allocation size is known to not exceed the probe size, don't emit
+        // a probing loop.
+        if (NumBytes + MaxAlign - 1 <= TLI.getStackProbeSize(MF)) {
+          // AND SP, X9, 0b11111...0000
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+              .addReg(scratchSPReg, RegState::Kill)
+              .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+              .setMIFlags(MachineInstr::FrameSetup);
+          // STR XZR, [SP]
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::STRXui))
+              .addReg(AArch64::XZR)
+              .addReg(AArch64::SP)
+              .addImm(0)
+              .setMIFlags(MachineInstr::FrameSetup);
+        } else {
+          // AND X9, X9, 0b11111...0000
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), scratchSPReg)
+              .addReg(scratchSPReg, RegState::Kill)
+              .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+              .setMIFlags(MachineInstr::FrameSetup);
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR),
+                  AArch64::SP)
+              .addReg(scratchSPReg);
+          // MOV SP, X9
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::SP)
+              .addReg(scratchSPReg)
+              .addImm(0)
+              .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+              .setMIFlags(MachineInstr::FrameSetup);
+        }
+      } else {
+        // AND SP, X9, 0b11111...0000
+        BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
+            .addReg(scratchSPReg, RegState::Kill)
+            .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
+            .setMIFlags(MachineInstr::FrameSetup);
+      }
       AFI->setStackRealigned(true);
 
       // No need for SEH instructions here; if we're realigning the stack,
@@ -4038,3 +4179,192 @@ void AArch64FrameLowering::orderFrameObjects(
     dbgs() << "\n";
   });
 }
+
+/// Emit a loop to decrement SP until it is equal to TargetReg, with probes at
+/// least every ProbeSize bytes. Returns an iterator of the first instruction
+/// after the loop. The difference between SP and TargetReg must be an exact
+/// multiple of ProbeSize.
+MachineBasicBlock::iterator
+AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
+    MachineBasicBlock::iterator MBBI, int64_t ProbeSize,
+    Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not encodable
+  // in SUB).
+  emitFrameOffset(*LoopMBB, LoopMBB->end(), DL, AArch64::SP, AArch64::SP,
+                  StackOffset::getFixed(-ProbeSize), TII,
+                  MachineInstr::FrameSetup);
+  // STR XZR, [SP]
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup);
+  // CMP SP, TargetReg
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+  // B.CC Loop
+  BuildMI(*LoopMBB, LoopMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(AArch64CC::NE)
+      .addMBB(LoopMBB)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(LoopMBB);
+  // Synthesize the exit MBB.
+  ExitMBB->splice(ExitMBB->end(), &MBB, MBBI, MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  // Update liveins.
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*ExitMBB);
+
+  return ExitMBB->begin();
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
+    MachineBasicBlock::iterator MBBI, Register ScratchReg, int64_t FrameSize,
+    StackOffset CFAOffset) const {
+  MachineBasicBlock *MBB = MBBI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
+  bool HasFP = hasFP(MF);
+
+  DebugLoc DL;
+  int64_t ProbeSize = TLI->getStackProbeSize(MF);
+  int64_t NumBlocks = FrameSize / ProbeSize;
+  int64_t ResidualSize = FrameSize % ProbeSize;
+
+  LLVM_DEBUG(dbgs() << "Stack probing: total " << FrameSize << " bytes, "
+                    << NumBlocks << " blocks of " << ProbeSize
+                    << " bytes, plus " << ResidualSize << " bytes\n");
+
+  // Decrement SP by NumBlock * ProbeSize bytes, with either unrolled or
+  // ordinary loop.
+  if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
+    for (int i = 0; i < NumBlocks; ++i) {
+      // SUB SP, SP, #FrameSize (or equivalent if FrameSize is not
+      // encodable in a SUB).
+      emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                      StackOffset::getFixed(-ProbeSize), TII,
+                      MachineInstr::FrameSetup, false, false, nullptr,
+                      EmitAsyncCFI && !HasFP, CFAOffset);
+      CFAOffset += StackOffset::getFixed(ProbeSize);
+      // STR XZR, [SP]
+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  } else if (NumBlocks != 0) {
+    // SUB ScratchReg, SP, #FrameSize (or equivalent if FrameSize is not
+    // encodable in ADD). ScrathReg may temporarily become the CFA register.
+    emitFrameOffset(*MBB, MBBI, DL, ScratchReg, AArch64::SP,
+                    StackOffset::getFixed(-ProbeSize * NumBlocks), TII,
+                    MachineInstr::FrameSetup, false, false, nullptr,
+                    EmitAsyncCFI && !HasFP, CFAOffset);
+    CFAOffset += StackOffset::getFixed(ProbeSize * NumBlocks);
+    MBBI = inlineStackProbeLoopExactMultiple(MBBI, ProbeSize, ScratchReg);
+    MBB = MBBI->getParent();
+    if (EmitAsyncCFI && !HasFP) {
+      // Set the CFA register back to SP.
+      const AArch64RegisterInfo &RegInfo =
+          *MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+      unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
+      unsigned CFIIndex =
+          MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  if (ResidualSize != 0) {
+    // SUB SP, SP, #ResidualSize (or equivalent if ResidualSize is not encodable
+    // in SUB).
+    emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(-ResidualSize), TII,
+                    MachineInstr::FrameSetup, false, false, nullptr,
+                    EmitAsyncCFI && !HasFP, CFAOffset);
+    if (ResidualSize > AArch64::StackProbeMaxUnprobedStack) {
+      // STR XZR, [SP]
+      BuildMI(*MBB, MBBI, DL, TII->get(AArch64::STRXui))
+          .addReg(AArch64::XZR)
+          .addReg(AArch64::SP)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
+
+  MachineBasicBlock::iterator Next = std::next(MBBI);
+  return Next;
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
+    MachineBasicBlock::iterator MBBI) const {
+
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  int64_t FrameSize = MBBI->getOperand(1).getImm();
+  StackOffset CFAOffset = StackOffset::get(MBBI->getOperand(2).getImm(),
+                                           MBBI->getOperand(3).getImm());
+
+  MachineBasicBlock::iterator NextInst =
+      inlineStackProbeFixed(MBBI, ScratchReg, FrameSize, CFAOffset);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar(
+    MachineBasicBlock::iterator MBBI) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+  Register ScratchReg = MBBI->getOperand(0).getReg();
+  Register TargetReg = MBBI->getOperand(1).getReg();
+
+  MachineBasicBlock::iterator NextInst =
+      TII->insertStackProbingLoop(MBBI, ScratchReg, TargetReg);
+
+  MBBI->eraseFromParent();
+  return NextInst;
+}
+
+void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
+                                            MachineBasicBlock &MBB) const {
+  for (auto MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) {
+    if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC) {
+      MBBI = inlineStackProbeFixed(MBBI);
+      E = MBBI->getParent()->end();
+    } else if (MBBI->getOpcode() == AArch64::PROBED_STACKALLOC_VAR) {
+      MBBI = inlineStackProbeVar(MBBI);
+      E = MBBI->getParent()->end();
+    } else {
+      ++MBBI;
+    }
+  }
+}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 147b5c181be5e53..3f53ef3dd502c0f 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -150,10 +150,33 @@ class AArch64FrameLowering : public TargetFrameLowering {
                                   MachineBasicBlock::iterator MBBI) const;
   void emitCalleeSavedSVERestores(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI) const;
+  void allocateSVEStackSpace(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MBBI,
+                             StackOffset AllocSize, StackOffset InitialOffset,
+                             bool EmitCFI) const;
 
   /// Emit target zero call-used regs.
   void emitZeroCallUsedRegs(BitVector RegsToZero,
                             MachineBasicBlock &MBB) const override;
+
+  /// Replace a StackProbe stub (if any) with the actual probe code inline
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeFixed(MachineBasicBlock::iterator MBBI, Register ScratchReg,
+                        int64_t FrameSize, StackOffset CFAOffset) const;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeFixed(MachineBasicBlock::iterator MBBI) const;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeVar(MachineBasicBlock::iterator MBBI) const;
+
+  MachineBasicBlock::iterator
+  inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
+                                    int64_t NegProbeSize,
+                                    Register TargetReg) const;
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4daaf21d42b3ce4..3db288eb83787a0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26531,3 +26531,37 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
   }
   return true;
 }
+
+bool AArch64TargetLowering::hasInlineStackProbe(
+    const MachineFunction &MF) const {
+  // If the function specifically requests inline stack probes, emit them.
+  if (MF.getFunction().hasFnAttribute("probe-stack")) {
+    if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+        "inline-asm")
+      return true;
+    else
+      llvm_unreachable("Unsupported stack probing method");
+  }
+
+  return false;
+}
+
+unsigned
+AArch64TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = Subtarget->getFrameLowering();
+  unsigned StackAlign = TFI->getStackAlignment();
+  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+         "Unexpected stack alignment");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  unsigned StackProbeSize = 4096;
+  const Function &Fn = MF.getFunction();
+  if (Fn.hasFnAttribute("stack-probe-size"))
+    Fn.getFnAttribute("stack-probe-size")
+        .getValueAsString()
+        .getAsInteger(0, StackProbeSize);
+  // Round down to the stack alignment.
+  StackProbeSize &= ~(StackAlign - 1);
+  return StackProbeSize ? StackProbeSize : StackAlign;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 52e519cd8a0c93c..b33598c1a1ba0b9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -504,6 +504,13 @@ const unsigned RoundingBitsPos = 22;
 ArrayRef<MCPhysReg> getGPRArgRegs();
 ArrayRef<MCPhysReg> getFPRArgRegs();
 
+/// Maximum allowed number of unprobed bytes above SP at an ABI
+/// boundary.
+const unsigned StackProbeMaxUnprobedStack = 1024;
+
+/// Maximum number of iterations to unroll for a constant size probing loop.
+const unsigned StackProbeMaxLoopUnroll = 4;
+
 } // namespace AArch64
 
 class AArch64Subtarget;
@@ -942,6 +949,13 @@ class AArch64TargetLowering : public TargetLowering {
   // used for 64bit and 128bit vectors as well.
   bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON = false) const;
 
+  /// True if stack clash protection is enabled for this functions.
+  bool hasInlineStackProbe(const MachineFunction &MF) const override;
+
+  /// Get the interval between stack-clash probes, which is equal to the stack
+  /// guard size, in bytes.
+  unsigned getStackProbeSize(const MachineFunction &MF) const;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index e9b9f8013abeaf6..99410cf9da7564d 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AArch64ExpandImm.h"
 #include "AArch64InstrInfo.h"
+#include "AArch64ExpandImm.h"
 #include "AArch64FrameLowering.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64PointerAuth.h"
@@ -21,6 +21,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -9443,6 +9444,96 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
   return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
 }
 
+MachineBasicBlock::iterator
+AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
+                                         Register ScratchReg,
+                                         Register TargetReg) const {
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  const AArch64InstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  int64_t ProbeSize = (int64_t)TLI->getStackProbeSize(MF);
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *LoopBodyMBB =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopBodyMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+
+  // LoopTest:
+  //   SUB ScratchReg, ScratchReg, #ProbeSize
+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg,
+                  StackOffset::getFixed(-ProbeSize), TII,
+                  MachineInstr::FrameSetup);
+
+  //   CMP ScratchReg, TargetReg
+  AArch64CC::CondCode Cond = AArch64CC::LE;
+  Register Op1 = ScratchReg;
+  Register Op2 = TargetReg;
+  if (Op2 == AArch64::SP) {
+    assert(Op1 != AArch64::SP && "At most one of the registers can be SP");
+    // CMP TargetReg, ScratchReg
+    std::swap(Op1, Op2);
+    Cond = AArch64CC::GT;
+  }
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
+          AArch64::XZR)
+      .addReg(Op1)
+      .addReg(Op2)
+      .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  //   B.<Cond> LoopExit
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
+      .addImm(Cond)
+      .addMBB(ExitMBB)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  //   STR XZR, [ScratchReg]
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(ScratchReg)
+      .addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  //   B loop
+  BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::B))
+      .addMBB(LoopTestMBB)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  // LoopExit:
+  //   STR XZR, [TargetReg]
+  BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(TargetReg)
+      .addImm(0)
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopBodyMBB);
+  LoopBodyMBB->addSuccessor(LoopTestMBB);
+  MBB.addSuccessor(LoopTestMBB);
+
+  // Update liveins.
+  if (MF.getRegInfo().reservedRegsFrozen()) {
+    recomputeLiveIns(*LoopTestMBB);
+    recomputeLiveIns(*LoopBodyMBB);
+    recomputeLiveIns(*ExitMBB);
+  }
+
+  return ExitMBB->begin();
+}
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index a934103c90cbf92..6972002defc0a3e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -382,6 +382,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                              unsigned Scale) const;
 
+  MachineBasicBlock::iterator
+  insertStackProbingLoop(MachineBasicBlock::iterator MBBI, Register ScratchReg,
+                         Register TargetReg) const;
+
 #define GET_INSTRINFO_HELPER_DECLS
 #include "AArch64GenInstrInfo.inc"
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 97c5e89a29a9728..ba06e28ab2cf7b4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -900,7 +900,8 @@ include "SMEInstrFormats.td"
 // Miscellaneous instructions.
 //===----------------------------------------------------------------------===//
 
-let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+let Defs = [SP], Uses = [SP] in {
 // We set Sched to empty list because we expect these instructions to simply get
 // removed in most cases.
 def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
@@ -909,7 +910,23 @@ def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
 def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                             [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
                             Sched<[]>;
-} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+
+// Probed stack allocation of a constant size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch),
+                               (ins i64imm:$stacksize, i64imm:$fixed_offset,
+                                i64imm:$scalable_offset),
+                               []>,
+                               Sched<[]>;
+} // Defs = [SP], Uses = [SP]
+
+// Probed stack allocation of a variable size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC_VAR : Pseudo<(outs GPR64sp:$scratch),
+                                   (ins GPR64sp:$target),
+                                   []>,
+                                   Sched<[]>;
+} //hasSideEffects = 1, isCodeGenOnly = 1
 
 let isReMaterializable = 1, isCodeGenOnly = 1 in {
 // FIXME: The following pseudo instructions are only needed because remat
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 7c87587c6dc4e2c..74834516f641f5a 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -207,7 +207,7 @@ body:             |
 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2
 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0
-# CHECK-NEXT: $sp = ANDXri killed $[[TMP]]
+# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]]
 # CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
 # CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2
@@ -1033,7 +1033,7 @@ body:             |
 # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22
 # CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1
 # CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0
-# CHECK-NEXT: $sp = ANDXri killed $[[TMP]]
+# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]]
 
 # CHECK:      $sp = frame-destroy ADDVL_XXI $fp, -18
 # CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
diff --git a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir
index 1b9411d07f433ab..f6fc627ac2d3d87 100644
--- a/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir
+++ b/llvm/test/CodeGen/AArch64/spill-stack-realignment.mir
@@ -21,7 +21,7 @@ stack:
   - { id: 1, size: 4, alignment: 4, local-offset: -68 }
 
 # CHECK: body:
-# CHECK:   $sp = ANDXri killed ${{x[0-9]+}}, 7865
+# CHECK:   $sp = frame-setup ANDXri killed ${{x[0-9]+}}, 7865
 # CHECK:   STRSui $s0, $sp, 0
 # CHECK:   STRSui $s0, $fp, 7
 body:             |
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
new file mode 100644
index 000000000000000..912d0c273c66180
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
@@ -0,0 +1,392 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+
+; Tests for prolog sequences for stack probing, when using a 64KiB stack guard.
+
+; 64k bytes is the largest frame we can probe in one go.
+define void @static_65536(ptr %out) #0 {
+; CHECK-LABEL: static_65536:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 65536, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+16 bytes, still needs just one probe.
+define void @static_65552(ptr %out) #0 {
+; CHECK-LABEL: static_65552:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp], #-16
+; CHECK-NEXT:    .cfi_def_cfa_offset 65568
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 65552, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+1024 bytes, the largest frame which needs just one probe.
+define void @static_66560(ptr %out) #0 {
+; CHECK-LABEL: static_66560:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 66576
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 66560, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 64k+1024+16 bytes, the smallest frame which needs two probes.
+define void @static_66576(ptr %out) #0 {
+; CHECK-LABEL: static_66576:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 66592
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 66576, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 2*64k+1024, the largest frame needing two probes.
+define void @static_132096(ptr %out) #0 {
+; CHECK-LABEL: static_132096:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 131088
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 132112
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #32, lsl #12 // =131072
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 132096, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k-16, the largest frame probed without a loop.
+define void @static_327664(ptr %out) #0 {
+; CHECK-LABEL: static_327664:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 65552
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 131088
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 196624
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    .cfi_def_cfa_offset 262160
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #15, lsl #12 // =61440
+; CHECK-NEXT:    .cfi_def_cfa_offset 323600
+; CHECK-NEXT:    sub sp, sp, #4080
+; CHECK-NEXT:    .cfi_def_cfa_offset 327680
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #79, lsl #12 // =323584
+; CHECK-NEXT:    .cfi_def_cfa_offset 4096
+; CHECK-NEXT:    add sp, sp, #4080
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 327664, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k, smallest frame probed with a loop.
+define void @static_327680(ptr %out) #0 {
+; CHECK-LABEL: static_327680:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa w9, 327696
+; CHECK-NEXT:  .LBB6_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB6_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 327680, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k+1024, large enough to use a loop, but not a multiple of 64KiB
+; so has a reminder, but no extra probe.
+define void @static_328704(ptr %out) #0 {
+; CHECK-LABEL: static_328704:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa w9, 327696
+; CHECK-NEXT:  .LBB7_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB7_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 328720
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 328704, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; 5*64k+1040, large enough to use a loop, has a reminder and
+; an extra probe.
+define void @static_328720(ptr %out) #0 {
+; CHECK-LABEL: static_328720:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa w9, 327696
+; CHECK-NEXT:  .LBB8_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB8_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 328736
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #80, lsl #12 // =327680
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 328720, align 1
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement. We do this
+; by moving SP far enough that a sufficiently-aligned block will exist
+; somewhere in the stack frame, so must probe the whole of that larger SP move.
+define void @static_16_align_131072(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_131072:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #31, lsl #12 // =126976
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffe0000
+; CHECK-NEXT:  .LBB9_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB9_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB9_1
+; CHECK-NEXT:  .LBB9_3: // %entry
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 16, align 131072
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement which
+; is nevertheless small enough as to not need a loop.
+define void @static_16_align_8192(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_8192:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffe000
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 16, align 8192
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+; A large allocation with a very large alignment requirement which
+; is nevertheless small enough as to not need a loop.
+define void @static_32752_align_32k(ptr %out) #0 {
+; CHECK-LABEL: static_32752_align_32k:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #7, lsl #12 // =28672
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and sp, x9, #0xffffffffffff8000
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 32752, align 32768
+  store i8* %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="65536" "frame-pointer"="none" }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
new file mode 100644
index 000000000000000..6f7420d04d2830f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -0,0 +1,661 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
+
+; Test prolog sequences for stack probing when SVE objects are involved.
+
+; The space for SVE objects needs probing in the general case, because
+; the stack adjustment may happen to be too big (i.e. greater than the
+; probe size) to allocate with a single `addvl`.
+; When we do know that the stack adjustment cannot exceed the probe size
+; we can avoid emitting a probe loop and emit a simple `addvl; str`
+; sequence instead.
+
+define void @sve_1_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 4 SVE vectors of stack space.
+define void @sve_4_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_4_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 16 SVE vectors of stack space.
+; The stack adjustment is less than or equal to 16 x 256 = 4096, so
+; we can allocate the locals at once.
+define void @sve_16_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_16_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    addvl sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 17 SVE vectors of stack space. Now we need
+; a probing loops since stack adjustment may be greater than
+; the probe size (17 x 256 = 4354 bytes)
+define void @sve_17_vector(ptr %out) #0 {
+; CHECK-LABEL: sve_17_vector:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x9, sp, #-17
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
+; CHECK-NEXT:  .LBB3_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB3_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_1
+; CHECK-NEXT:  .LBB3_3: // %entry
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    addvl sp, sp, #17
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  %vec17 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; Space for callee-saved SVE register is allocated similarly to allocating
+; space for SVE locals. When we know the stack adjustment cannot exceed the
+; probe size we can skip the explict probe, since saving SVE registers serves
+; as an implicit probe.
+define void @sve_1v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_1v_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{z8}" ()
+  ret void
+}
+
+define void @sve_4v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_4v_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    .cfi_restore z11
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" ()
+  ret void
+}
+
+define void @sve_16v_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_16v_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    .cfi_restore z11
+; CHECK-NEXT:    .cfi_restore z12
+; CHECK-NEXT:    .cfi_restore z13
+; CHECK-NEXT:    .cfi_restore z14
+; CHECK-NEXT:    .cfi_restore z15
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
+  ret void
+}
+
+define void @sve_1p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_1p_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{p8}" ()
+  ret void
+}
+
+define void @sve_4p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_4p_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str p11, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p10, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p9, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" ()
+  ret void
+}
+
+define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
+; CHECK-LABEL: sve_16v_1p_csr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x9, sp, #-17
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
+; CHECK-NEXT:  .LBB9_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB9_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB9_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB9_1
+; CHECK-NEXT:  .LBB9_3: // %entry
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
+; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
+; CHECK-NEXT:    //APP
+; CHECK-NEXT:    //NO_APP
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #17
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    .cfi_restore z8
+; CHECK-NEXT:    .cfi_restore z9
+; CHECK-NEXT:    .cfi_restore z10
+; CHECK-NEXT:    .cfi_restore z11
+; CHECK-NEXT:    .cfi_restore z12
+; CHECK-NEXT:    .cfi_restore z13
+; CHECK-NEXT:    .cfi_restore z14
+; CHECK-NEXT:    .cfi_restore z15
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
+  ret void
+}
+
+; A SVE vector and a 16-byte fixed size object.
+define void @sve_1_vector_16_arr(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_16_arr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    str xzr, [sp], #-16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  %arr = alloca i8, i64 16, align 1
+  ret void
+}
+
+; A large SVE stack object and a large stack slot, both of which need probing.
+; TODO: This could be optimised by combining the fixed-size offset into the
+; loop.
+define void @sve_1_vector_4096_arr(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_4096_arr:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x9, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG
+; CHECK-NEXT:  .LBB11_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB11_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB11_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB11_1
+; CHECK-NEXT:  .LBB11_3: // %entry
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0x20, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 4112 + 512 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xc0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 8208 + 512 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 512 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    .cfi_def_cfa wsp, 12304
+; CHECK-NEXT:    add sp, sp, #3, lsl #12 // =12288
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 256 x float>, align 16
+  %arr = alloca i8, i64 12288, align 1
+  ret void
+}
+
+; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently
+; supported even without stack-probing.
+
+; An SVE vector, and a 16-byte fixed size object, which
+; has a large alignment requirement.
+define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 {
+; CHECK-LABEL: sve_1_vector_16_arr_align_8192:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB12_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB12_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB12_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB12_1
+; CHECK-NEXT:  .LBB12_3: // %entry
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  %arr = alloca i8, i64 16, align 8192
+  ret void
+}
+
+; With 64k guard pages, we can allocate bigger SVE space without a probing loop.
+define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: sve_1024_64k_guard:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG
+; CHECK-NEXT:    addvl sp, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
+; CHECK-NEXT:    addvl sp, sp, #8
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 1024 x float>, align 16
+  ret void
+}
+
+define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: sve_1028_64k_guard:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    addvl x9, sp, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG
+; CHECK-NEXT:    addvl x9, x9, #-32
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG
+; CHECK-NEXT:    addvl x9, x9, #-1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG
+; CHECK-NEXT:  .LBB14_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB14_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB14_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB14_1
+; CHECK-NEXT:  .LBB14_3: // %entry
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG
+; CHECK-NEXT:    addvl sp, sp, #31
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
+; CHECK-NEXT:    addvl sp, sp, #9
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 1024 x float>, align 16
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
new file mode 100644
index 000000000000000..fb4c4e00b719197
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
@@ -0,0 +1,474 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -enable-post-misched=false -global-isel | FileCheck %s
+
+; Tests for prolog sequences for stack probing, when using a 4KiB stack guard.
+
+; Small stack frame, no probing required.
+define void @static_64(ptr %out) #0 {
+; CHECK-LABEL: static_64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 64, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 256 bytes we start to always create a frame pointer. No frame smaller then
+; this needs a probe, so we can use the saving of at least one CSR as a probe
+; at the top of our frame.
+define void @static_256(ptr %out) #0 {
+; CHECK-LABEL: static_256:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #272
+; CHECK-NEXT:    .cfi_def_cfa_offset 272
+; CHECK-NEXT:    str x29, [sp, #256] // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #272
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 256, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 1024 bytes, this is the largest frame which doesn't need probing.
+define void @static_1024(ptr %out) #0 {
+; CHECK-LABEL: static_1024:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 1024, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; At 1024+16 bytes, this is the smallest frame which needs probing.
+define void @static_1040(ptr %out) #0 {
+; CHECK-LABEL: static_1040:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 1040, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k bytes is the largest frame we can probe in one go.
+define void @static_4096(ptr %out) #0 {
+; CHECK-LABEL: static_4096:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 4096, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+16 bytes, still needs just one probe.
+define void @static_4112(ptr %out) #0 {
+; CHECK-LABEL: static_4112:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp], #-16
+; CHECK-NEXT:    .cfi_def_cfa_offset 4128
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 4112, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+1024 bytes, the largest frame which needs just one probe.
+define void @static_5120(ptr %out) #0 {
+; CHECK-LABEL: static_5120:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 5136
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 5120, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 4k+1024+16, the smallest frame which needs two probes.
+define void @static_5136(ptr %out) #0 {
+; CHECK-LABEL: static_5136:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 5152
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 5136, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 2*4k+1024, the largest frame needing two probes
+define void @static_9216(ptr %out) #0 {
+; CHECK-LABEL: static_9216:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 8208
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 9232
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 9216, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k-16, the largest frame probed without a loop
+define void @static_20464(ptr %out) #0 {
+; CHECK-LABEL: static_20464:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 4112
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 8208
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 12304
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    .cfi_def_cfa_offset 16400
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    sub sp, sp, #4080
+; CHECK-NEXT:    .cfi_def_cfa_offset 20480
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #4, lsl #12 // =16384
+; CHECK-NEXT:    .cfi_def_cfa_offset 4096
+; CHECK-NEXT:    add sp, sp, #4080
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 20464, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k, the smallest frame probed with a loop
+define void @static_20480(ptr %out) #0 {
+; CHECK-LABEL: static_20480:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa w9, 20496
+; CHECK-NEXT:  .LBB10_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB10_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 20480, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k + 1024, large enough to use a loop, but not a multiple of 4KiB
+; so has a reminder, but no extra probe.
+define void @static_21504(ptr %out) #0 {
+; CHECK-LABEL: static_21504:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa w9, 20496
+; CHECK-NEXT:  .LBB11_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB11_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 21520
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa_offset 1040
+; CHECK-NEXT:    add sp, sp, #1024
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 21504, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; 5*4k+1040, large enough to use a loop, has a reminder and
+; an extra probe.
+define void @static_21520(ptr %out) #0 {
+; CHECK-LABEL: static_21520:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa w9, 20496
+; CHECK-NEXT:  .LBB12_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.ne .LBB12_1
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    .cfi_def_cfa_register wsp
+; CHECK-NEXT:    sub sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 21536
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #5, lsl #12 // =20480
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 21520, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation, but with a very large alignment requirement. We do this
+; by moving SP far enough that a sufficiently-aligned block will exist
+; somewhere in the stack frame, so must probe the whole of that larger SP move.
+define void @static_16_align_8192(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_8192:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4080
+; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB13_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB13_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB13_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB13_1
+; CHECK-NEXT:  .LBB13_3: // %entry
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 16, align 8192
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A small allocation with a very large alignment requirement, but
+; nevertheless small enough as to not need a loop.
+define void @static_16_align_2048(ptr %out) #0 {
+; CHECK-LABEL: static_16_align_2048:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #2032
+; CHECK-NEXT:    and sp, x9, #0xfffffffffffff800
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 16, align 2048
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; A large(-ish) allocation with a very large alignment requirement, but
+; nevertheless small enough as to not need a loop.
+define void @static_2032_align_2048(ptr %out) #0 {
+; CHECK-LABEL: static_2032_align_2048:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #2032
+; CHECK-NEXT:    and sp, x9, #0xfffffffffffff800
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 2032, align 2048
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }

>From 418805b04a1683940965fd17775f5ec4c21452c1 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Wed, 4 Oct 2023 14:58:29 +0100
Subject: [PATCH 05/11] Refactor determining stack probe size and decision to
 issue stack probes

Now it's more uniform across Windows and non-Windows.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 25 ++++++---------
 .../Target/AArch64/AArch64ISelLowering.cpp    | 32 ++-----------------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  4 ---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  4 +--
 .../AArch64/AArch64MachineFunctionInfo.cpp    | 29 +++++++++++++----
 .../AArch64/AArch64MachineFunctionInfo.h      |  6 ++++
 6 files changed, 42 insertions(+), 58 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 338fb28b447c158..c4509c9c483be8a 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -694,6 +694,7 @@ void AArch64FrameLowering::allocateSVEStackSpace(
     StackOffset AllocSize, StackOffset InitialOffset, bool EmitCFI) const {
   DebugLoc DL;
   MachineFunction &MF = *MBB.getParent();
+  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
   const AArch64TargetLowering &TLI = *Subtarget.getTargetLowering();
@@ -706,7 +707,7 @@ void AArch64FrameLowering::allocateSVEStackSpace(
 
   // The bit-length of SVE registers is architecturally limited.
   const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
-  int64_t ProbeSize = TLI.getStackProbeSize(MF);
+  int64_t ProbeSize = MFI.getStackProbeSize();
   if (!TLI.hasInlineStackProbe(MF) ||
       AllocSize.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE +
               AllocSize.getFixed() <=
@@ -727,7 +728,7 @@ void AArch64FrameLowering::allocateSVEStackSpace(
     return;
   }
 
-  // If we can't be sure the allocation size if less than the probe size, we
+  // If we can't be sure the allocation size is less than the probe size, we
   // have to emit a stack probing loop.
   Register ScratchReg = findScratchNonCalleeSaveRegister(&MBB);
   assert(ScratchReg != AArch64::NoRegister);
@@ -949,15 +950,11 @@ bool AArch64FrameLowering::canUseAsPrologue(
 static bool windowsRequiresStackProbe(MachineFunction &MF,
                                       uint64_t StackSizeInBytes) {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  if (!Subtarget.isTargetWindows())
-    return false;
-  const Function &F = MF.getFunction();
+  const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
   // TODO: When implementing stack protectors, take that into account
   // for the probe threshold.
-  unsigned StackProbeSize =
-      F.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
-  return (StackSizeInBytes >= StackProbeSize) &&
-         !F.hasFnAttribute("no-stack-arg-probe");
+  return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
+         StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
 }
 
 static bool needsWinCFI(const MachineFunction &MF) {
@@ -1942,13 +1939,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       //   -- X9 is temporary register, so shouldn't contain any live data here,
       //   -- free to use. This is already produced by emitFrameOffset above.
 
-      const uint64_t MaxAlign = MFI.getMaxAlign().value();
+      const int64_t MaxAlign = MFI.getMaxAlign().value();
       const uint64_t AndMask = ~(MaxAlign - 1);
 
       if (NeedsStackProbe) {
         // If allocation size is known to not exceed the probe size, don't emit
         // a probing loop.
-        if (NumBytes + MaxAlign - 1 <= TLI.getStackProbeSize(MF)) {
+        if (NumBytes + MaxAlign - 1 <= AFI->getStackProbeSize()) {
           // AND SP, X9, 0b11111...0000
           BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
               .addReg(scratchSPReg, RegState::Kill)
@@ -4242,8 +4239,6 @@ MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
     StackOffset CFAOffset) const {
   MachineBasicBlock *MBB = MBBI->getParent();
   MachineFunction &MF = *MBB->getParent();
-  const AArch64TargetLowering *TLI =
-      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
   const AArch64InstrInfo *TII =
       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -4251,7 +4246,7 @@ MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
   bool HasFP = hasFP(MF);
 
   DebugLoc DL;
-  int64_t ProbeSize = TLI->getStackProbeSize(MF);
+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
   int64_t NumBlocks = FrameSize / ProbeSize;
   int64_t ResidualSize = FrameSize % ProbeSize;
 
@@ -4263,7 +4258,7 @@ MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeFixed(
   // ordinary loop.
   if (NumBlocks <= AArch64::StackProbeMaxLoopUnroll) {
     for (int i = 0; i < NumBlocks; ++i) {
-      // SUB SP, SP, #FrameSize (or equivalent if FrameSize is not
+      // SUB SP, SP, #ProbeSize (or equivalent if ProbeSize is not
       // encodable in a SUB).
       emitFrameOffset(*MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                       StackOffset::getFixed(-ProbeSize), TII,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3db288eb83787a0..873b1d08c8e5f1a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26534,34 +26534,6 @@ bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
 
 bool AArch64TargetLowering::hasInlineStackProbe(
     const MachineFunction &MF) const {
-  // If the function specifically requests inline stack probes, emit them.
-  if (MF.getFunction().hasFnAttribute("probe-stack")) {
-    if (MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
-        "inline-asm")
-      return true;
-    else
-      llvm_unreachable("Unsupported stack probing method");
-  }
-
-  return false;
-}
-
-unsigned
-AArch64TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = Subtarget->getFrameLowering();
-  unsigned StackAlign = TFI->getStackAlignment();
-  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
-         "Unexpected stack alignment");
-  // The default stack probe size is 4096 if the function has no
-  // stack-probe-size attribute. This is a safe default because it is the
-  // smallest possible guard page size.
-  unsigned StackProbeSize = 4096;
-  const Function &Fn = MF.getFunction();
-  if (Fn.hasFnAttribute("stack-probe-size"))
-    Fn.getFnAttribute("stack-probe-size")
-        .getValueAsString()
-        .getAsInteger(0, StackProbeSize);
-  // Round down to the stack alignment.
-  StackProbeSize &= ~(StackAlign - 1);
-  return StackProbeSize ? StackProbeSize : StackAlign;
+  return !Subtarget->isTargetWindows() &&
+         MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index b33598c1a1ba0b9..3b47903c3a5011a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -952,10 +952,6 @@ class AArch64TargetLowering : public TargetLowering {
   /// True if stack clash protection is enabled for this functions.
   bool hasInlineStackProbe(const MachineFunction &MF) const override;
 
-  /// Get the interval between stack-clash probes, which is equal to the stack
-  /// guard size, in bytes.
-  unsigned getStackProbeSize(const MachineFunction &MF) const;
-
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 99410cf9da7564d..984644d70f18bf6 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9450,11 +9450,9 @@ AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
                                          Register TargetReg) const {
   MachineBasicBlock &MBB = *MBBI->getParent();
   MachineFunction &MF = *MBB.getParent();
-  const AArch64TargetLowering *TLI =
-      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
   const AArch64InstrInfo *TII =
       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
-  int64_t ProbeSize = (int64_t)TLI->getStackProbeSize(MF);
+  int64_t ProbeSize = MF.getInfo<AArch64FunctionInfo>()->getStackProbeSize();
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
   MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator());
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 7bb5041b8ba9481..ff83e590f3a3541 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -97,14 +97,31 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
     if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
             F.getParent()->getModuleFlag("branch-target-enforcement")))
       BranchTargetEnforcement = BTE->getZExtValue();
-    return;
+  } else {
+    const StringRef BTIEnable =
+        F.getFnAttribute("branch-target-enforcement").getValueAsString();
+    assert(BTIEnable.equals_insensitive("true") ||
+           BTIEnable.equals_insensitive("false"));
+    BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
   }
 
-  const StringRef BTIEnable =
-      F.getFnAttribute("branch-target-enforcement").getValueAsString();
-  assert(BTIEnable.equals_insensitive("true") ||
-         BTIEnable.equals_insensitive("false"));
-  BranchTargetEnforcement = BTIEnable.equals_insensitive("true");
+  // The default stack probe size is 4096 if the function has no
+  // stack-probe-size attribute. This is a safe default because it is the
+  // smallest possible guard page size.
+  uint64_t ProbeSize =
+      F.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
+  assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size");
+  if (STI->isTargetWindows()) {
+    if (!F.hasFnAttribute("no-stack-arg-probe"))
+      StackProbeSize = ProbeSize;
+  } else if (F.hasFnAttribute("probe-stack")) {
+    if (F.getFnAttribute("probe-stack").getValueAsString() != "inline-asm")
+      report_fatal_error("Unsupported stack probing method");
+    uint64_t StackAlign =
+        STI->getFrameLowering()->getTransientStackAlign().value();
+    // Round down to the stack alignment.
+    StackProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U));
+  }
 }
 
 MachineFunctionInfo *AArch64FunctionInfo::clone(
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 0b8bfb04a572c77..219f83cfd32e0ee 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -194,6 +194,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   /// True if the function need asynchronous unwind information.
   mutable std::optional<bool> NeedsAsyncDwarfUnwindInfo;
 
+  int64_t StackProbeSize = 0;
+
 public:
   AArch64FunctionInfo(const Function &F, const AArch64Subtarget *STI);
 
@@ -456,6 +458,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
     HasStreamingModeChanges = HasChanges;
   }
 
+  bool hasStackProbing() const { return StackProbeSize != 0; }
+
+  int64_t getStackProbeSize() const { return StackProbeSize; }
+
 private:
   // Hold the lists of LOHs.
   MILOHContainer LOHContainerSet;

>From c57a3378bf9895933e9fac7ce927c5b903856ca7 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 5 Oct 2023 18:07:12 +0100
Subject: [PATCH 06/11] Declare implicit defs/uses of PROBED_STACKALLOC_...
 pseudo-instructions

---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 10 ++++------
 llvm/lib/Target/AArch64/AArch64InstrInfo.td      | 10 +++++++---
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index c4509c9c483be8a..6a5b5a91a71a470 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -738,7 +738,7 @@ void AArch64FrameLowering::allocateSVEStackSpace(
                   InitialOffset);
   // Arrange to emit a probing loop by decrementing SP until it reaches that
   // new top of the stack.
-  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR), AArch64::SP)
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
       .addReg(ScratchReg);
   // Set SP to its new value.
   // MOV SP, Xs
@@ -1963,8 +1963,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
               .addReg(scratchSPReg, RegState::Kill)
               .addImm(AArch64_AM::encodeLogicalImmediate(AndMask, 64))
               .setMIFlags(MachineInstr::FrameSetup);
-          BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR),
-                  AArch64::SP)
+          BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR))
               .addReg(scratchSPReg);
           // MOV SP, X9
           BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::SP)
@@ -4339,11 +4338,10 @@ MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar(
       MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
 
   DebugLoc DL = MBB.findDebugLoc(MBBI);
-  Register ScratchReg = MBBI->getOperand(0).getReg();
-  Register TargetReg = MBBI->getOperand(1).getReg();
+  Register TargetReg = MBBI->getOperand(0).getReg();
 
   MachineBasicBlock::iterator NextInst =
-      TII->insertStackProbingLoop(MBBI, ScratchReg, TargetReg);
+      TII->insertStackProbingLoop(MBBI, AArch64::SP, TargetReg);
 
   MBBI->eraseFromParent();
   return NextInst;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ba06e28ab2cf7b4..6f108188536b4e0 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -911,6 +911,9 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
                             [(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
                             Sched<[]>;
 
+}
+
+let Defs = [SP, NZCV], Uses = [SP] in {
 // Probed stack allocation of a constant size, used in function prologues when
 // stack-clash protection is enabled.
 def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch),
@@ -918,15 +921,16 @@ def PROBED_STACKALLOC : Pseudo<(outs GPR64:$scratch),
                                 i64imm:$scalable_offset),
                                []>,
                                Sched<[]>;
-} // Defs = [SP], Uses = [SP]
 
 // Probed stack allocation of a variable size, used in function prologues when
 // stack-clash protection is enabled.
-def PROBED_STACKALLOC_VAR : Pseudo<(outs GPR64sp:$scratch),
+def PROBED_STACKALLOC_VAR : Pseudo<(outs),
                                    (ins GPR64sp:$target),
                                    []>,
                                    Sched<[]>;
-} //hasSideEffects = 1, isCodeGenOnly = 1
+
+} // Defs = [SP, NZCV], Uses = [SP] in 
+} // hasSideEffects = 1, isCodeGenOnly = 1
 
 let isReMaterializable = 1, isCodeGenOnly = 1 in {
 // FIXME: The following pseudo instructions are only needed because remat

>From 0ad71e0b02a6d29947b837dae1d3b1c606397fa3 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 12 Oct 2023 11:45:34 +0100
Subject: [PATCH 07/11] Use module-level attributes for stack probing

Stack probing and stack probe size can be set either by module or
function attributes, with function attributes taking precedence.
---
 .../AArch64/AArch64MachineFunctionInfo.cpp    | 28 +++++++++----
 llvm/test/CodeGen/AArch64/stack-probing.ll    | 39 ++++++++++++++++++-
 2 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index ff83e590f3a3541..f8b73ba6dda3c2a 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -108,19 +108,33 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F,
   // The default stack probe size is 4096 if the function has no
   // stack-probe-size attribute. This is a safe default because it is the
   // smallest possible guard page size.
-  uint64_t ProbeSize =
-      F.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
+  uint64_t ProbeSize = 4096;
+  if (F.hasFnAttribute("stack-probe-size"))
+    ProbeSize = F.getFnAttributeAsParsedInteger("stack-probe-size");
+  else if (const auto *PS = mdconst::extract_or_null<ConstantInt>(
+               F.getParent()->getModuleFlag("stack-probe-size")))
+    ProbeSize = PS->getZExtValue();
   assert(int64_t(ProbeSize) > 0 && "Invalid stack probe size");
+
   if (STI->isTargetWindows()) {
     if (!F.hasFnAttribute("no-stack-arg-probe"))
       StackProbeSize = ProbeSize;
-  } else if (F.hasFnAttribute("probe-stack")) {
-    if (F.getFnAttribute("probe-stack").getValueAsString() != "inline-asm")
-      report_fatal_error("Unsupported stack probing method");
+  } else {
+    // Round down to the stack alignment.
     uint64_t StackAlign =
         STI->getFrameLowering()->getTransientStackAlign().value();
-    // Round down to the stack alignment.
-    StackProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U));
+    ProbeSize = std::max(StackAlign, ProbeSize & ~(StackAlign - 1U));
+    StringRef ProbeKind;
+    if (F.hasFnAttribute("probe-stack"))
+      ProbeKind = F.getFnAttribute("probe-stack").getValueAsString();
+    else if (const auto *PS = dyn_cast_or_null<MDString>(
+                 F.getParent()->getModuleFlag("probe-stack")))
+      ProbeKind = PS->getString();
+    if (ProbeKind.size()) {
+      if (ProbeKind != "inline-asm")
+        report_fatal_error("Unsupported stack probing method");
+      StackProbeSize = ProbeSize;
+    }
   }
 }
 
diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
index fb4c4e00b719197..fc76650cf123688 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
@@ -4,6 +4,9 @@
 
 ; Tests for prolog sequences for stack probing, when using a 4KiB stack guard.
 
+; The stack probing parameters in function attributes take precedence over
+; ones in the module flags.
+
 ; Small stack frame, no probing required.
 define void @static_64(ptr %out) #0 {
 ; CHECK-LABEL: static_64:
@@ -471,4 +474,38 @@ entry:
   ret void
 }
 
-attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
+; Test stack probing is enabled by module flags
+define void @static_9232(ptr %out) uwtable(async) {
+; CHECK-LABEL: static_9232:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT:    .cfi_def_cfa_offset 8208
+; CHECK-NEXT:    sub sp, sp, #800
+; CHECK-NEXT:    .cfi_def_cfa_offset 9008
+; CHECK-NEXT:    str xzr, [sp], #-240
+; CHECK-NEXT:    .cfi_def_cfa_offset 9248
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    add sp, sp, #2, lsl #12 // =8192
+; CHECK-NEXT:    .cfi_def_cfa_offset 1056
+; CHECK-NEXT:    add sp, sp, #1040
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i64 9232, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 4, !"probe-stack", !"inline-asm"}
+!1 = !{i32 8, !"stack-probe-size", i32 9000}

>From 0137f84d75f1b8ff08401e6b57821426df6c7ce3 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Sat, 28 Oct 2023 15:52:42 +0100
Subject: [PATCH 08/11] Do not write below the curent SP

Fix a bug where a probe is issued to the new top of the stack
before the SP is decremented.
---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 16 +------
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 47 +++++++++----------
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |  7 +--
 .../test/CodeGen/AArch64/stack-probing-64k.ll |  2 +-
 .../test/CodeGen/AArch64/stack-probing-sve.ll | 10 ++--
 llvm/test/CodeGen/AArch64/stack-probing.ll    |  2 +-
 6 files changed, 35 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 6a5b5a91a71a470..ec1b6e92d6f92d6 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -740,13 +740,6 @@ void AArch64FrameLowering::allocateSVEStackSpace(
   // new top of the stack.
   BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
       .addReg(ScratchReg);
-  // Set SP to its new value.
-  // MOV SP, Xs
-  BuildMI(MBB, MBBI, DL, TII.get(AArch64::ADDXri), AArch64::SP)
-      .addReg(ScratchReg)
-      .addImm(0)
-      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
-      .setMIFlags(MachineInstr::FrameSetup);
   if (EmitCFI) {
     // Set the CFA register back to SP.
     unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
@@ -1965,12 +1958,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
               .setMIFlags(MachineInstr::FrameSetup);
           BuildMI(MBB, MBBI, DL, TII->get(AArch64::PROBED_STACKALLOC_VAR))
               .addReg(scratchSPReg);
-          // MOV SP, X9
-          BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::SP)
-              .addReg(scratchSPReg)
-              .addImm(0)
-              .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
-              .setMIFlags(MachineInstr::FrameSetup);
         }
       } else {
         // AND SP, X9, 0b11111...0000
@@ -4340,8 +4327,7 @@ MachineBasicBlock::iterator AArch64FrameLowering::inlineStackProbeVar(
   DebugLoc DL = MBB.findDebugLoc(MBBI);
   Register TargetReg = MBBI->getOperand(0).getReg();
 
-  MachineBasicBlock::iterator NextInst =
-      TII->insertStackProbingLoop(MBBI, AArch64::SP, TargetReg);
+  MachineBasicBlock::iterator NextInst = TII->probedStackAlloc(MBBI, TargetReg);
 
   MBBI->eraseFromParent();
   return NextInst;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 984644d70f18bf6..0c27109dd211f12 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9445,9 +9445,10 @@ bool AArch64InstrInfo::isReallyTriviallyReMaterializable(
 }
 
 MachineBasicBlock::iterator
-AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
-                                         Register ScratchReg,
-                                         Register TargetReg) const {
+AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
+                                   Register TargetReg) const {
+  assert(TargetReg != AArch64::SP && "New top of stack cannot aleady be in SP");
+
   MachineBasicBlock &MBB = *MBBI->getParent();
   MachineFunction &MF = *MBB.getParent();
   const AArch64InstrInfo *TII =
@@ -9466,38 +9467,29 @@ AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
   MF.insert(MBBInsertPoint, ExitMBB);
 
   // LoopTest:
-  //   SUB ScratchReg, ScratchReg, #ProbeSize
-  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, ScratchReg, ScratchReg,
-                  StackOffset::getFixed(-ProbeSize), TII,
+  //   SUB SP, SP, #ProbeSize
+  emitFrameOffset(*LoopTestMBB, LoopTestMBB->end(), DL, AArch64::SP,
+                  AArch64::SP, StackOffset::getFixed(-ProbeSize), TII,
                   MachineInstr::FrameSetup);
 
-  //   CMP ScratchReg, TargetReg
-  AArch64CC::CondCode Cond = AArch64CC::LE;
-  Register Op1 = ScratchReg;
-  Register Op2 = TargetReg;
-  if (Op2 == AArch64::SP) {
-    assert(Op1 != AArch64::SP && "At most one of the registers can be SP");
-    // CMP TargetReg, ScratchReg
-    std::swap(Op1, Op2);
-    Cond = AArch64CC::GT;
-  }
+  //   CMP SP, TargetReg
   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::SUBSXrx64),
           AArch64::XZR)
-      .addReg(Op1)
-      .addReg(Op2)
+      .addReg(AArch64::SP)
+      .addReg(TargetReg)
       .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0))
       .setMIFlags(MachineInstr::FrameSetup);
 
   //   B.<Cond> LoopExit
   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(AArch64::Bcc))
-      .addImm(Cond)
+      .addImm(AArch64CC::LE)
       .addMBB(ExitMBB)
       .setMIFlags(MachineInstr::FrameSetup);
 
-  //   STR XZR, [ScratchReg]
+  //   STR XZR, [SP]
   BuildMI(*LoopBodyMBB, LoopBodyMBB->end(), DL, TII->get(AArch64::STRXui))
       .addReg(AArch64::XZR)
-      .addReg(ScratchReg)
+      .addReg(AArch64::SP)
       .addImm(0)
       .setMIFlags(MachineInstr::FrameSetup);
 
@@ -9507,11 +9499,18 @@ AArch64InstrInfo::insertStackProbingLoop(MachineBasicBlock::iterator MBBI,
       .setMIFlags(MachineInstr::FrameSetup);
 
   // LoopExit:
-  //   STR XZR, [TargetReg]
-  BuildMI(*ExitMBB, ExitMBB->begin(), DL, TII->get(AArch64::STRXui))
-      .addReg(AArch64::XZR)
+  //   MOV SP, TargetReg
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::ADDXri), AArch64::SP)
       .addReg(TargetReg)
       .addImm(0)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+      .setMIFlags(MachineInstr::FrameSetup);
+
+  //   STR XZR, [SP]
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(AArch64::STRXui))
+      .addReg(AArch64::XZR)
+      .addReg(AArch64::SP)
+      .addImm(0)
       .setMIFlags(MachineInstr::FrameSetup);
 
   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 6972002defc0a3e..86a70fe0f5f4b9e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -382,9 +382,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset,
                              unsigned Scale) const;
 
-  MachineBasicBlock::iterator
-  insertStackProbingLoop(MachineBasicBlock::iterator MBBI, Register ScratchReg,
-                         Register TargetReg) const;
+  // Decrement the SP, issuing probes along the way. `TargetReg` is the new top
+  // of the stack.
+  MachineBasicBlock::iterator probedStackAlloc(MachineBasicBlock::iterator MBBI,
+                                               Register TargetReg) const;
 
 #define GET_INSTRINFO_HELPER_DECLS
 #include "AArch64GenInstrInfo.inc"
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
index 912d0c273c66180..945c271d375001b 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-64k.ll
@@ -312,8 +312,8 @@ define void @static_16_align_131072(ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB9_1
 ; CHECK-NEXT:  .LBB9_3: // %entry
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    mov sp, x29
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 6f7420d04d2830f..631bc7b26c1428f 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -114,8 +114,8 @@ define void @sve_17_vector(ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB3_1
 ; CHECK-NEXT:  .LBB3_3: // %entry
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    .cfi_def_cfa_register wsp
 ; CHECK-NEXT:    addvl sp, sp, #17
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -354,8 +354,8 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB9_1
 ; CHECK-NEXT:  .LBB9_3: // %entry
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    .cfi_def_cfa_register wsp
 ; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
@@ -468,8 +468,8 @@ define void @sve_1_vector_4096_arr(ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB11_1
 ; CHECK-NEXT:  .LBB11_3: // %entry
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    .cfi_def_cfa_register wsp
 ; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0x20, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 4112 + 512 * VG
@@ -527,8 +527,8 @@ define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB12_1
 ; CHECK-NEXT:  .LBB12_3: // %entry
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
@@ -627,8 +627,8 @@ define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB14_1
 ; CHECK-NEXT:  .LBB14_3: // %entry
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    .cfi_def_cfa_register wsp
 ; CHECK-NEXT:    addvl sp, sp, #31
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG
diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
index fc76650cf123688..601cdfe60d491fb 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
@@ -399,8 +399,8 @@ define void @static_16_align_8192(ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB13_1
 ; CHECK-NEXT:  .LBB13_3: // %entry
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    mov x8, sp
 ; CHECK-NEXT:    str x8, [x0]
 ; CHECK-NEXT:    mov sp, x29

>From 1ba57e1976bbc9782cb35b3b03f809a994dcb000 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Sat, 28 Oct 2023 18:13:51 +0100
Subject: [PATCH 09/11] Use a tighter bound on the amount of stack adjustment
 due to realignment

---
 .../Target/AArch64/AArch64FrameLowering.cpp   | 10 +++----
 llvm/test/CodeGen/AArch64/stack-probing.ll    | 28 +++++++++++++++++++
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ec1b6e92d6f92d6..89f72542ea5a2cc 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1706,7 +1706,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // Alignment is required for the parent frame, not the funclet
   const bool NeedsRealignment =
       NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
-  int64_t RealignmentPadding =
+  const int64_t RealignmentPadding =
       (NeedsRealignment && MFI.getMaxAlign() > Align(16))
           ? MFI.getMaxAlign().value() - 16
           : 0;
@@ -1882,9 +1882,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
 
     if (NeedsRealignment) {
       scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
-      NeedsStackProbe |= TLI.hasInlineStackProbe(MF) &&
-                         (NumBytes + MFI.getMaxAlign().value()) >
-                             AArch64::StackProbeMaxUnprobedStack;
+      NeedsStackProbe |=
+          TLI.hasInlineStackProbe(MF) &&
+          (NumBytes + RealignmentPadding) > AArch64::StackProbeMaxUnprobedStack;
       assert(scratchSPReg != AArch64::NoRegister);
     }
 
@@ -1938,7 +1938,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       if (NeedsStackProbe) {
         // If allocation size is known to not exceed the probe size, don't emit
         // a probing loop.
-        if (NumBytes + MaxAlign - 1 <= AFI->getStackProbeSize()) {
+        if (NumBytes + RealignmentPadding <= AFI->getStackProbeSize()) {
           // AND SP, X9, 0b11111...0000
           BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
               .addReg(scratchSPReg, RegState::Kill)
diff --git a/llvm/test/CodeGen/AArch64/stack-probing.ll b/llvm/test/CodeGen/AArch64/stack-probing.ll
index 601cdfe60d491fb..5c5d9321a56e581 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing.ll
@@ -503,6 +503,34 @@ entry:
   ret void
 }
 
+; Test for a tight upper bound on the amount of stack adjustment
+; due to stack realignment. No probes should appear.
+define void @static_1008(ptr %out) #0 {
+; CHECK-LABEL: static_1008:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    sub x9, sp, #1008
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i8, i32 1008, align 32
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
 attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" }
 
 !llvm.module.flags = !{!0, !1}

>From 47b6ade918d125bd9c60a97b6385d3fc752945f9 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 15 Sep 2023 12:48:41 +0100
Subject: [PATCH 10/11] [AArch64] Stack probing for dynamic allocas in
 SelectionDAG

Change-Id: I1ef19ce40702a789d220c4bbfd5560220fa329f5
Co-authored-by: Oliver Stannard <oliver.stannard at linaro.org>
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  23 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 160 ++++++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  13 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +
 .../CodeGen/AArch64/stack-probing-dynamic.ll  | 320 ++++++++++++++++++
 5 files changed, 473 insertions(+), 57 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 89f72542ea5a2cc..16d42785e13efc6 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -487,6 +487,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
     MachineBasicBlock::iterator I) const {
   const AArch64InstrInfo *TII =
       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
   DebugLoc DL = I->getDebugLoc();
   unsigned Opc = I->getOpcode();
   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
@@ -513,8 +516,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
       // Most call frames will be allocated at the start of a function so
       // this is OK, but it is a limitation that needs dealing with.
       assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
-      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
-                      StackOffset::getFixed(Amount), TII);
+
+      if (TLI->hasInlineStackProbe(MF) &&
+          -Amount >= AArch64::StackProbeMaxUnprobedStack) {
+        // When stack probing is enabled, the decrement of SP may need to be
+        // probed. We only need to do this if the call site needs 1024 bytes of
+        // space or more, because a region smaller than that is allowed to be
+        // unprobed at an ABI boundary. We rely on the fact that SP has been
+        // probed exactly at this point, either by the prologue or most recent
+        // dynamic allocation.
+        assert(MFI.hasVarSizedObjects() &&
+               "non-reserved call frame without var sized objects?");
+        Register ScratchReg =
+            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+        inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
+      } else {
+        emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+                        StackOffset::getFixed(Amount), TII);
+      }
     }
   } else if (CalleePopAmount != 0) {
     // If the calling convention demands that the callee pops arguments from the
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 873b1d08c8e5f1a..72ec12f7da3b0fa 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -568,10 +568,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FSHL, MVT::i32, Custom);
   setOperationAction(ISD::FSHL, MVT::i64, Custom);
 
-  if (Subtarget->isTargetWindows())
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
-  else
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
 
   // Constant pool entries
   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -2348,6 +2345,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CSINC)
     MAKE_CASE(AArch64ISD::THREAD_POINTER)
     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+    MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
     MAKE_CASE(AArch64ISD::ABDS_PRED)
     MAKE_CASE(AArch64ISD::ABDU_PRED)
     MAKE_CASE(AArch64ISD::HADDS_PRED)
@@ -2707,6 +2705,28 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
+                                              MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MBB->findDebugLoc(MBBI);
+  const AArch64InstrInfo &TII =
+      *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  Register TargetReg = MI.getOperand(0).getReg();
+  MachineBasicBlock::iterator NextInst =
+      TII.insertStackProbingLoop(MBBI, AArch64::SP, TargetReg);
+
+  // MOV SP, TargetReg
+  BuildMI(*NextInst->getParent(), std::next(NextInst), DL,
+          TII.get(AArch64::ADDXri), AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(0)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  MI.eraseFromParent();
+  return NextInst->getParent();
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                     MachineInstr &MI,
@@ -2835,6 +2855,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
 
   case AArch64::CATCHRET:
     return EmitLoweredCatchRet(MI, BB);
+
+  case AArch64::PROBED_STACKALLOC_DYN:
+    return EmitDynamicProbedAlloc(MI, BB);
+
   case AArch64::LD1_MXIPXX_H_PSEUDO_B:
     return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
   case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -13865,9 +13889,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                        AN->getMemOperand());
 }
 
-SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
-    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+
   SDLoc dl(Op);
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  EVT VT = Node->getValueType(0);
+
+  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+          "no-stack-arg-probe")) {
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+    Chain = SP.getValue(1);
+    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+    if (Align)
+      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+    SDValue Ops[2] = {SP, Chain};
+    return DAG.getMergeValues(Ops, dl);
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
                                                PtrVT, 0);
@@ -13891,7 +13940,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
 
   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
                      DAG.getConstant(4, dl, MVT::i64));
-  return Chain;
+
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  SDLoc dl(Op);
+  EVT VT = Node->getValueType(0);
+
+  // Construct the new SP value in a GPR.
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+
+  // Set the real SP to the new value with a probing loop.
+  Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  if (Subtarget->isTargetWindows())
+    return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
+  else if (hasInlineStackProbe(MF))
+    return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
+  else
+    return SDValue();
 }
 
 // When x and y are extended, lower:
@@ -13945,51 +14046,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
 }
 
-SDValue
-AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetWindows() &&
-         "Only Windows alloca probing supported");
-  SDLoc dl(Op);
-  // Get the inputs.
-  SDNode *Node = Op.getNode();
-  SDValue Chain = Op.getOperand(0);
-  SDValue Size = Op.getOperand(1);
-  MaybeAlign Align =
-      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
-  EVT VT = Node->getValueType(0);
-
-  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
-          "no-stack-arg-probe")) {
-    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
-    Chain = SP.getValue(1);
-    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
-    if (Align)
-      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
-                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
-    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-    SDValue Ops[2] = {SP, Chain};
-    return DAG.getMergeValues(Ops, dl);
-  }
-
-  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
-
-  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
-
-  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
-  Chain = SP.getValue(1);
-  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
-  if (Align)
-    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
-                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
-  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-
-  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
-
-  SDValue Ops[2] = {SP, Chain};
-  return DAG.getMergeValues(Ops, dl);
-}
-
 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
                                            SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3b47903c3a5011a..4d4fb71441362c1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -83,6 +83,10 @@ enum NodeType : unsigned {
   ADC,
   SBC, // adc, sbc instructions
 
+  // To avoid stack clash, allocation is performed by block and each block is
+  // probed.
+  PROBED_ALLOCA,
+
   // Predicated instructions where inactive lanes produce undefined results.
   ABDS_PRED,
   ABDU_PRED,
@@ -606,6 +610,9 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) const;
+
   MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                   MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
@@ -1112,10 +1119,10 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
-                                         SDValue &Size,
-                                         SelectionDAG &DAG) const;
+
   SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
 
   SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6f108188536b4e0..08e270d646f2451 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -832,6 +832,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain,
 def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
+
+def AArch64probedalloca
+    : SDNode<"AArch64ISD::PROBED_ALLOCA",
+             SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+             [SDNPHasChain]>;
+
 def AArch64mrs : SDNode<"AArch64ISD::MRS",
                         SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
                         [SDNPHasChain, SDNPOutGlue]>;
@@ -929,6 +935,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs),
                                    []>,
                                    Sched<[]>;
 
+// Probed stack allocations of a variable size, used for allocas of unknown size
+// when stack-clash protection is enabled.
+let usesCustomInserter = 1 in
+def PROBED_STACKALLOC_DYN : Pseudo<(outs),
+                                   (ins GPR64common:$target),
+                                   [(AArch64probedalloca GPR64common:$target)]>,
+                                   Sched<[]>;
+
 } // Defs = [SP, NZCV], Uses = [SP] in 
 } // hasSideEffects = 1, isCodeGenOnly = 1
 
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
new file mode 100644
index 000000000000000..561ab81059b9dd7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+
+; Dynamically-sized allocation, needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack, and also doesn't probe below the target SP value.
+define void @dynamic(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB0_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed size
+; slot isn't large enough that we would normally probe it, but we need to do so
+; here otherwise the gap between the CSR save and the first probe of the
+; dynamic allocation could be too far apart when the size of the dynamic
+; allocation is close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+; CHECK-LABEL: dynamic_fixed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    str xzr, [sp, #-64]!
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    sub x10, x29, #64
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    str x10, [x1]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB1_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB1_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x2]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v1 = alloca i8, i64 64, align 1
+  store ptr %v1, ptr %out1, align 8
+  %v2 = alloca i8, i64 %size, align 1
+  store ptr %v2, ptr %out2, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by ANDing the target SP with a constant to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the prolog,
+; which isn't actually needed because the only aligned allocations are dynamic,
+; this is done even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub x9, sp, #32
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffc0
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffffc0
+; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB2_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 64
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference to the dynamic allocation is the constant used for aligning
+; the target SP, the loop will probe the whole allocation without needing to
+; know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_8192:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4064
+; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB3_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB3_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_1
+; CHECK-NEXT:  .LBB3_3:
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB3_6
+; CHECK-NEXT:  // %bb.5: // in Loop: Header=BB3_4 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_4
+; CHECK-NEXT:  .LBB3_6:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 8192
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; For 64k guard pages, the only difference is the constant subtracted from SP
+; in the loop.
+define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: dynamic_64k_guard:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB4_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB4_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB4_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB4_1
+; CHECK-NEXT:  .LBB4_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame. We need to probe
+; that space when it is larger than the unprobed space allowed by the ABI (1024
+; bytes), so this needs a very large number of arguments.
+define void @no_reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: no_reserved_call_frame:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    lsl x9, x0, #2
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    add x9, x9, #15
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x0, x8, x9
+; CHECK-NEXT:  .LBB5_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x0
+; CHECK-NEXT:    b.le .LBB5_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB5_1
+; CHECK-NEXT:  .LBB5_3: // %entry
+; CHECK-NEXT:    str xzr, [x0]
+; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    sub sp, sp, #1104
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    bl callee_stack_args
+; CHECK-NEXT:    add sp, sp, #1104
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i32, i64 %n
+  call void @callee_stack_args(ptr %v, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef) #2
+  ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: reserved_call_frame:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w28, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub sp, sp, #1504
+; CHECK-NEXT:    add x0, sp, #1104
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    bl callee_stack_args
+; CHECK-NEXT:    add sp, sp, #1504
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i32, i64 100
+  call void @callee_stack_args(ptr %v,
+    i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef) #2
+  ret void
+}
+
+declare void @callee_stack_args(ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64)
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }

>From 227b5e02d07cf4b4f71fada8f6f5b37e0bc2cf1a Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Sat, 28 Oct 2023 16:12:46 +0100
Subject: [PATCH 11/11] Update after the bugfix about writing below SP

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  | 11 ++---------
 .../CodeGen/AArch64/stack-probing-dynamic.ll     | 16 ++++++++--------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 72ec12f7da3b0fa..642be7d2eafa343 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2714,15 +2714,8 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
   const AArch64InstrInfo &TII =
       *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   Register TargetReg = MI.getOperand(0).getReg();
-  MachineBasicBlock::iterator NextInst =
-      TII.insertStackProbingLoop(MBBI, AArch64::SP, TargetReg);
-
-  // MOV SP, TargetReg
-  BuildMI(*NextInst->getParent(), std::next(NextInst), DL,
-          TII.get(AArch64::ADDXri), AArch64::SP)
-      .addReg(TargetReg)
-      .addImm(0)
-      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  MachineBasicBlock::iterator NextInst = TII.probedStackAlloc(MBBI, TargetReg);
+
   MI.eraseFromParent();
   return NextInst->getParent();
 }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
index 561ab81059b9dd7..2c6e14adb0b06ac 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -26,8 +26,8 @@ define void @dynamic(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB0_1
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -70,8 +70,8 @@ define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB1_1
 ; CHECK-NEXT:  .LBB1_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x2]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -120,8 +120,8 @@ define void @dynamic_align_64(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB2_1
 ; CHECK-NEXT:  .LBB2_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 32
@@ -163,12 +163,12 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB3_1
 ; CHECK-NEXT:  .LBB3_3:
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    add x9, x0, #15
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x19, sp
 ; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
 ; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
@@ -179,8 +179,8 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB3_4
 ; CHECK-NEXT:  .LBB3_6:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 32
@@ -219,8 +219,8 @@ define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB4_1
 ; CHECK-NEXT:  .LBB4_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -263,8 +263,8 @@ define void @no_reserved_call_frame(i64 %n) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB5_1
 ; CHECK-NEXT:  .LBB5_3: // %entry
-; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    sub sp, sp, #1104
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    bl callee_stack_args