[llvm] [AArch64] Stack probing for dynamic allocas in GlobalISel (PR #67123)

Momchil Velikov via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 1 14:50:07 PST 2023


https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/67123

>From 9972bf6a2678d2b421ba68121eef978ef18e5d39 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 15 Sep 2023 12:48:41 +0100
Subject: [PATCH 1/9] [AArch64] Stack probing for dynamic allocas in
 SelectionDAG

Change-Id: I1ef19ce40702a789d220c4bbfd5560220fa329f5
Co-authored-by: Oliver Stannard <oliver.stannard at linaro.org>
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  23 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 160 ++++++---
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  13 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +
 .../CodeGen/AArch64/stack-probing-dynamic.ll  | 320 ++++++++++++++++++
 5 files changed, 473 insertions(+), 57 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 96702cb8b255af5..b9d611a22f4e4f8 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -487,6 +487,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
     MachineBasicBlock::iterator I) const {
   const AArch64InstrInfo *TII =
       static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const AArch64TargetLowering *TLI =
+      MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
   DebugLoc DL = I->getDebugLoc();
   unsigned Opc = I->getOpcode();
   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
@@ -513,8 +516,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
       // Most call frames will be allocated at the start of a function so
       // this is OK, but it is a limitation that needs dealing with.
       assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
-      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
-                      StackOffset::getFixed(Amount), TII);
+
+      if (TLI->hasInlineStackProbe(MF) &&
+          -Amount >= AArch64::StackProbeMaxUnprobedStack) {
+        // When stack probing is enabled, the decrement of SP may need to be
+        // probed. We only need to do this if the call site needs 1024 bytes of
+        // space or more, because a region smaller than that is allowed to be
+        // unprobed at an ABI boundary. We rely on the fact that SP has been
+        // probed exactly at this point, either by the prologue or the most
+        // recent dynamic allocation.
+        assert(MFI.hasVarSizedObjects() &&
+               "non-reserved call frame without var sized objects?");
+        Register ScratchReg =
+            MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+        inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
+      } else {
+        emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+                        StackOffset::getFixed(Amount), TII);
+      }
     }
   } else if (CalleePopAmount != 0) {
     // If the calling convention demands that the callee pops arguments from the
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 011aedeba1eb793..ca50b5a2b66b704 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -569,10 +569,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FSHL, MVT::i32, Custom);
   setOperationAction(ISD::FSHL, MVT::i64, Custom);
 
-  if (Subtarget->isTargetWindows())
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
-  else
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
 
   // Constant pool entries
   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -2353,6 +2350,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::CSINC)
     MAKE_CASE(AArch64ISD::THREAD_POINTER)
     MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+    MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
     MAKE_CASE(AArch64ISD::ABDS_PRED)
     MAKE_CASE(AArch64ISD::ABDU_PRED)
     MAKE_CASE(AArch64ISD::HADDS_PRED)
@@ -2719,6 +2717,28 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
+                                              MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MBB->findDebugLoc(MBBI);
+  const AArch64InstrInfo &TII =
+      *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  Register TargetReg = MI.getOperand(0).getReg();
+  MachineBasicBlock::iterator NextInst =
+      TII.insertStackProbingLoop(MBBI, AArch64::SP, TargetReg);
+
+  // MOV SP, TargetReg
+  BuildMI(*NextInst->getParent(), std::next(NextInst), DL,
+          TII.get(AArch64::ADDXri), AArch64::SP)
+      .addReg(TargetReg)
+      .addImm(0)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  MI.eraseFromParent();
+  return NextInst->getParent();
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                     MachineInstr &MI,
@@ -2863,6 +2883,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
 
   case AArch64::CATCHRET:
     return EmitLoweredCatchRet(MI, BB);
+
+  case AArch64::PROBED_STACKALLOC_DYN:
+    return EmitDynamicProbedAlloc(MI, BB);
+
   case AArch64::LD1_MXIPXX_H_PSEUDO_B:
     return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
   case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -14052,9 +14076,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                        AN->getMemOperand());
 }
 
-SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
-    SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+
   SDLoc dl(Op);
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  EVT VT = Node->getValueType(0);
+
+  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+          "no-stack-arg-probe")) {
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+    Chain = SP.getValue(1);
+    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+    if (Align)
+      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+    SDValue Ops[2] = {SP, Chain};
+    return DAG.getMergeValues(Ops, dl);
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
                                                PtrVT, 0);
@@ -14078,7 +14127,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
 
   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
                      DAG.getConstant(4, dl, MVT::i64));
-  return Chain;
+
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  SDLoc dl(Op);
+  EVT VT = Node->getValueType(0);
+
+  // Construct the new SP value in a GPR.
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+
+  // Set the real SP to the new value with a probing loop.
+  Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  if (Subtarget->isTargetWindows())
+    return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
+  else if (hasInlineStackProbe(MF))
+    return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
+  else
+    return SDValue();
 }
 
 // When x and y are extended, lower:
@@ -14132,51 +14233,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
 }
 
-SDValue
-AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetWindows() &&
-         "Only Windows alloca probing supported");
-  SDLoc dl(Op);
-  // Get the inputs.
-  SDNode *Node = Op.getNode();
-  SDValue Chain = Op.getOperand(0);
-  SDValue Size = Op.getOperand(1);
-  MaybeAlign Align =
-      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
-  EVT VT = Node->getValueType(0);
-
-  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
-          "no-stack-arg-probe")) {
-    SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
-    Chain = SP.getValue(1);
-    SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
-    if (Align)
-      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
-                       DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
-    Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-    SDValue Ops[2] = {SP, Chain};
-    return DAG.getMergeValues(Ops, dl);
-  }
-
-  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
-
-  Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
-
-  SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
-  Chain = SP.getValue(1);
-  SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
-  if (Align)
-    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
-                     DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
-  Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-
-  Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
-
-  SDValue Ops[2] = {SP, Chain};
-  return DAG.getMergeValues(Ops, dl);
-}
-
 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
                                            SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e6d62f1704726b1..3c8479e1f6e3c32 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -83,6 +83,10 @@ enum NodeType : unsigned {
   ADC,
   SBC, // adc, sbc instructions
 
+  // To avoid stack clash, allocation is performed block by block, and each
+  // block is probed.
+  PROBED_ALLOCA,
+
   // Predicated instructions where inactive lanes produce undefined results.
   ABDS_PRED,
   ABDU_PRED,
@@ -616,6 +620,9 @@ class AArch64TargetLowering : public TargetLowering {
   MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                            MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) const;
+
   MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                   MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
@@ -1141,10 +1148,10 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
-                                         SDValue &Size,
-                                         SelectionDAG &DAG) const;
+
   SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
 
   SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 60cd94f4ff8e12e..cb4ccb5b2f9aa9a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -865,6 +865,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain,
 def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
+
+def AArch64probedalloca
+    : SDNode<"AArch64ISD::PROBED_ALLOCA",
+             SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+             [SDNPHasChain]>;
+
 def AArch64mrs : SDNode<"AArch64ISD::MRS",
                         SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
                         [SDNPHasChain, SDNPOutGlue]>;
@@ -965,6 +971,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs),
                                    []>,
                                    Sched<[]>;
 
+// Probed stack allocations of a variable size, used for allocas of unknown size
+// when stack-clash protection is enabled.
+let usesCustomInserter = 1 in
+def PROBED_STACKALLOC_DYN : Pseudo<(outs),
+                                   (ins GPR64common:$target),
+                                   [(AArch64probedalloca GPR64common:$target)]>,
+                                   Sched<[]>;
+
 } // Defs = [SP, NZCV], Uses = [SP] in 
 } // hasSideEffects = 1, isCodeGenOnly = 1
 
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
new file mode 100644
index 000000000000000..561ab81059b9dd7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+
+; A dynamically-sized allocation needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack, and also doesn't probe below the target SP value.
+define void @dynamic(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB0_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed-size
+; slot isn't large enough that we would normally probe it, but we need to do
+; so here, because otherwise the gap between the CSR save and the first probe
+; of the dynamic allocation could be too large when the size of the dynamic
+; allocation is close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+; CHECK-LABEL: dynamic_fixed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    str xzr, [sp, #-64]!
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    sub x10, x29, #64
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    str x10, [x1]
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB1_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB1_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x2]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v1 = alloca i8, i64 64, align 1
+  store ptr %v1, ptr %out1, align 8
+  %v2 = alloca i8, i64 %size, align 1
+  store ptr %v2, ptr %out2, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by ANDing the target SP with a constant to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the
+; prologue, even though that isn't actually needed because the only aligned
+; allocations are dynamic; this happens even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub x9, sp, #32
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffffc0
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffffc0
+; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB2_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB2_1
+; CHECK-NEXT:  .LBB2_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 64
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference from the plain dynamic allocation is the constant used for
+; aligning the target SP; the loop probes the whole allocation without needing
+; to know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_8192:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    sub x9, x9, #4064
+; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB3_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x9
+; CHECK-NEXT:    b.le .LBB3_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_1
+; CHECK-NEXT:  .LBB3_3:
+; CHECK-NEXT:    str xzr, [x9]
+; CHECK-NEXT:    mov sp, x9
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
+; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB3_6
+; CHECK-NEXT:  // %bb.5: // in Loop: Header=BB3_4 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB3_4
+; CHECK-NEXT:  .LBB3_6:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 8192
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; For 64k guard pages, the only difference is the constant subtracted from SP
+; in the loop.
+define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: dynamic_64k_guard:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    add x9, x0, #15
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB4_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB4_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB4_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB4_1
+; CHECK-NEXT:  .LBB4_3:
+; CHECK-NEXT:    str xzr, [x8]
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame. We need to probe
+; that space when it is larger than the unprobed space allowed by the ABI (1024
+; bytes), so this needs a very large number of arguments.
+define void @no_reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: no_reserved_call_frame:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    lsl x9, x0, #2
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    add x9, x9, #15
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x0, x8, x9
+; CHECK-NEXT:  .LBB5_1: // %entry
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x0
+; CHECK-NEXT:    b.le .LBB5_3
+; CHECK-NEXT:  // %bb.2: // %entry
+; CHECK-NEXT:    // in Loop: Header=BB5_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB5_1
+; CHECK-NEXT:  .LBB5_3: // %entry
+; CHECK-NEXT:    str xzr, [x0]
+; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    sub sp, sp, #1104
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    bl callee_stack_args
+; CHECK-NEXT:    add sp, sp, #1104
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i32, i64 %n
+  call void @callee_stack_args(ptr %v, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef) #2
+  ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: reserved_call_frame:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w28, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    sub sp, sp, #1504
+; CHECK-NEXT:    add x0, sp, #1104
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    bl callee_stack_args
+; CHECK-NEXT:    add sp, sp, #1504
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+entry:
+  %v = alloca i32, i64 100
+  call void @callee_stack_args(ptr %v,
+    i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef) #2
+  ret void
+}
+
+declare void @callee_stack_args(ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64)
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
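
As a quick illustration of how the lowering above is driven, here is a minimal
standalone reproducer condensed from the test file (the function name @demo and
the file name demo.ll are made up for this sketch; the triple, llc flags, and
the "probe-stack" attribute are the ones the test itself uses):

    ; llc -mtriple aarch64-none-eabi -verify-machineinstrs demo.ll -o -
    define void @demo(i64 %n, ptr %out) "probe-stack"="inline-asm" {
      ; Unknown-size alloca: lowered via AArch64ISD::PROBED_ALLOCA to the
      ; PROBED_STACKALLOC_DYN pseudo and expanded into the compare-and-probe
      ; loop seen in the CHECK lines above.
      %p = alloca i8, i64 %n
      store ptr %p, ptr %out
      ret void
    }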

>From a8d8c34c13b025a2f76c81c94c7668ff24ee5fa3 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Sat, 28 Oct 2023 16:12:46 +0100
Subject: [PATCH 2/9] Update after the bugfix about writing below SP

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp  | 11 ++---------
 .../CodeGen/AArch64/stack-probing-dynamic.ll     | 16 ++++++++--------
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ca50b5a2b66b704..898f6540ec15ffd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2726,15 +2726,8 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
   const AArch64InstrInfo &TII =
       *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   Register TargetReg = MI.getOperand(0).getReg();
-  MachineBasicBlock::iterator NextInst =
-      TII.insertStackProbingLoop(MBBI, AArch64::SP, TargetReg);
-
-  // MOV SP, TargetReg
-  BuildMI(*NextInst->getParent(), std::next(NextInst), DL,
-          TII.get(AArch64::ADDXri), AArch64::SP)
-      .addReg(TargetReg)
-      .addImm(0)
-      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+  MachineBasicBlock::iterator NextInst = TII.probedStackAlloc(MBBI, TargetReg);
+
   MI.eraseFromParent();
   return NextInst->getParent();
 }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
index 561ab81059b9dd7..2c6e14adb0b06ac 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -26,8 +26,8 @@ define void @dynamic(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB0_1
 ; CHECK-NEXT:  .LBB0_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -70,8 +70,8 @@ define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB1_1
 ; CHECK-NEXT:  .LBB1_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x2]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -120,8 +120,8 @@ define void @dynamic_align_64(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB2_1
 ; CHECK-NEXT:  .LBB2_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 32
@@ -163,12 +163,12 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB3_1
 ; CHECK-NEXT:  .LBB3_3:
-; CHECK-NEXT:    str xzr, [x9]
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    add x9, x0, #15
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    mov x19, sp
 ; CHECK-NEXT:    sub x8, x8, x9
 ; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
 ; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
@@ -179,8 +179,8 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB3_4
 ; CHECK-NEXT:  .LBB3_6:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 32
@@ -219,8 +219,8 @@ define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB4_1
 ; CHECK-NEXT:  .LBB4_3:
-; CHECK-NEXT:    str xzr, [x8]
 ; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    str x8, [x1]
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 16
@@ -263,8 +263,8 @@ define void @no_reserved_call_frame(i64 %n) #0 {
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    b .LBB5_1
 ; CHECK-NEXT:  .LBB5_3: // %entry
-; CHECK-NEXT:    str xzr, [x0]
 ; CHECK-NEXT:    mov sp, x0
+; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    sub sp, sp, #1104
 ; CHECK-NEXT:    str xzr, [sp]
 ; CHECK-NEXT:    bl callee_stack_args

>From 69e74058c4d094ac663378dc2b1278c320b283a4 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 9 Nov 2023 14:33:42 +0000
Subject: [PATCH 3/9] Update after the change to not always set FrameSetup
 flags

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    |  2 +-
 .../stack-probing-dynamic-no-frame-setup.ll        | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 898f6540ec15ffd..a529337387c61cd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2726,7 +2726,7 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
   const AArch64InstrInfo &TII =
       *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   Register TargetReg = MI.getOperand(0).getReg();
-  MachineBasicBlock::iterator NextInst = TII.probedStackAlloc(MBBI, TargetReg);
+  MachineBasicBlock::iterator NextInst = TII.probedStackAlloc(MBBI, TargetReg, false);
 
   MI.eraseFromParent();
   return NextInst->getParent();
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
new file mode 100644
index 000000000000000..96f2f63d703c7de
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
@@ -0,0 +1,14 @@
+; RUN: llc --stop-after=finalize-isel -o - < %s | FileCheck %s
+target triple = "aarch64-linux"
+
+; Check that dynamic stack allocation and probing instructions do not have
+; the FrameSetup flag.
+
+; CHECK-NOT: frame-setup
+define void @no_frame_setup(i64 %size, ptr %out) #0 {
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }

>From 7fd38a28222a6191f575a0d396ae2df0598decda Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 22 Sep 2023 13:04:05 +0100
Subject: [PATCH 4/9] Add a test for stack probing of dynamically allocated
 space for SVE vectors

Change-Id: I1f9645b7ca786259bb7276806a9cfa17ba543a5a
---
 .../CodeGen/AArch64/stack-probing-dynamic.ll  | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
index 2c6e14adb0b06ac..7cf677c904fe920 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -317,4 +317,47 @@ entry:
 
 declare void @callee_stack_args(ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64)
 
+; Dynamic allocation of SVE vectors
+define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
+; CHECK-LABEL: dynamic_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    .cfi_offset w19, -16
+; CHECK-NEXT:    .cfi_offset w30, -24
+; CHECK-NEXT:    .cfi_offset w29, -32
+; CHECK-NEXT:    rdvl x9, #1
+; CHECK-NEXT:    mov x10, #15 // =0xf
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    madd x9, x0, x9, x10
+; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT:    sub x8, x8, x9
+; CHECK-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT:    cmp sp, x8
+; CHECK-NEXT:    b.le .LBB7_3
+; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    b .LBB7_1
+; CHECK-NEXT:  .LBB7_3:
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    str xzr, [sp]
+; CHECK-NEXT:    str x8, [x1]
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    .cfi_def_cfa wsp, 32
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w19
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    ret
+  %v = alloca <vscale x 4 x float>, i64 %size, align 16
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
 attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }

>From 0dbc50c913e65316fc32d30282ffedd3fa933c4e Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 23 Nov 2023 13:59:17 +0000
Subject: [PATCH 5/9] Shorten a couple of tests

---
 llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
index 7cf677c904fe920..78b83099c7ed9fb 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -278,7 +278,7 @@ define void @no_reserved_call_frame(i64 %n) #0 {
 ; CHECK-NEXT:    ret
 entry:
   %v = alloca i32, i64 %n
-  call void @callee_stack_args(ptr %v, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef) #2
+  call void @callee_stack_args(ptr %v, [138 x i64] undef)
   ret void
 }
 
@@ -310,12 +310,11 @@ define void @reserved_call_frame(i64 %n) #0 {
 ; CHECK-NEXT:    ret
 entry:
   %v = alloca i32, i64 100
-  call void @callee_stack_args(ptr %v,
-    i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef) #2
+  call void @callee_stack_args(ptr %v, [138 x i64] undef)
   ret void
 }
 
-declare void @callee_stack_args(ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64)
+declare void @callee_stack_args(ptr, [138 x i64])
 
 ; Dynamic allocation of SVE vectors
 define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {

>From a3072be504077448eb4a5225297b79c7fc0fc50f Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 23 Nov 2023 14:36:32 +0000
Subject: [PATCH 6/9] Mark the probed-alloca SDNode as writing to memory

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index cb4ccb5b2f9aa9a..bdb38f0c3789521 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -869,7 +869,7 @@ def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
 def AArch64probedalloca
     : SDNode<"AArch64ISD::PROBED_ALLOCA",
              SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
-             [SDNPHasChain]>;
+             [SDNPHasChain, SDNPMayStore]>;
 
 def AArch64mrs : SDNode<"AArch64ISD::MRS",
                         SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,

>From fecf2a21b94b95c7324b9ccc0dfe8f25afc57606 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 1 Dec 2023 22:01:15 +0000
Subject: [PATCH 7/9] [fixup] Add a comment on how we depend on the
 implementation of hasReservedCallFrame

---
 llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index b9d611a22f4e4f8..e4451a9a85f50fb 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -479,6 +479,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
 /// included as part of the stack frame.
 bool
 AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // The stack probing code for the dynamically allocated outgoing arguments
+  // area assumes that the stack is probed at the top - either by the prologue
+  // code, which issues a probe if `hasVarSizedObjects` returns true, or by
+  // the most recent variable-sized object allocation. Changing the condition
+  // here may need to be followed up by changes to the probe issuing logic.
   return !MF.getFrameInfo().hasVarSizedObjects();
 }
 

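To illustrate the invariant this comment documents, a condensed IR sketch of
the case it protects (adapted from the no_reserved_call_frame test in this
series; the names @f and @g are illustrative, and the [138 x i64] callee shape
is the same trick the shortened tests use to force >1024 bytes of stack
arguments):

    define void @f(i64 %n) "probe-stack"="inline-asm" {
      ; The variable-sized alloca makes hasReservedCallFrame return false,
      ; and the probing loop for it leaves SP probed at the top of the stack.
      %v = alloca i32, i64 %n
      ; The outgoing stack arguments are then allocated dynamically at the
      ; call site; since they exceed 1024 bytes, that SP decrement must be
      ; probed as well.
      call void @g(ptr %v, [138 x i64] undef)
      ret void
    }
    declare void @g(ptr, [138 x i64])
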
>From 3e7bdb5588ba50c1a973f1a663f93e7305de697b Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 1 Dec 2023 22:48:13 +0000
Subject: [PATCH 8/9] [fixup] Fix formatting

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a529337387c61cd..b6a16217dfae394 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2726,7 +2726,8 @@ AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
   const AArch64InstrInfo &TII =
       *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
   Register TargetReg = MI.getOperand(0).getReg();
-  MachineBasicBlock::iterator NextInst = TII.probedStackAlloc(MBBI, TargetReg, false);
+  MachineBasicBlock::iterator NextInst =
+      TII.probedStackAlloc(MBBI, TargetReg, false);
 
   MI.eraseFromParent();
   return NextInst->getParent();

>From bb62d6bebbdc373769864701372cfc05e0961236 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Wed, 20 Sep 2023 14:49:45 +0100
Subject: [PATCH 9/9] [AArch64] Stack probing for dynamic allocas in GlobalISel

This adds a stack probing instruction sequence for dynamic stack
allocations, to protect against stack clash attacks. The instruction
sequence used is the same one used for unknown-size allocations in
function prologues.
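
For illustration, the same kind of IR that drives the SelectionDAG tests
exercises this path when the GlobalISel pipeline is selected; a minimal
sketch (the function name @demo and file name demo.ll are made up, and
-global-isel is the standard llc flag forcing GlobalISel):

    ; llc -mtriple aarch64-linux -global-isel -verify-machineinstrs demo.ll -o -
    define void @demo(i64 %n, ptr %out) "probe-stack"="inline-asm" {
      ; G_DYN_STACKALLOC is custom-legalized to PROBED_STACKALLOC_DYN here,
      ; instead of the default lowerDynStackAlloc lowering.
      %p = alloca i8, i64 %n
      store ptr %p, ptr %out
      ret void
    }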

Change-Id: Iba5f94462d18bcf62d09bb9b4f0d7563f680c19c
Co-authored-by: Oliver Stannard <oliver.stannard at linaro.org>
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   2 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  37 ++-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  47 +++-
 .../AArch64/GISel/AArch64LegalizerInfo.h      |   1 +
 .../GlobalISel/legalize-dyn-alloca.mir        | 261 ++++++++++++++----
 .../GlobalISel/legalizer-info-validation.mir  |   7 +-
 .../CodeGen/AArch64/stack-probing-dynamic.ll  |   3 +-
 7 files changed, 286 insertions(+), 72 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 86d3cb2bedb95b6..365d2223a81c1d0 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -410,6 +410,8 @@ class LegalizerHelper {
   LegalizeResult lowerUnmergeValues(MachineInstr &MI);
   LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI);
   LegalizeResult lowerShuffleVector(MachineInstr &MI);
+  Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize,
+                                     Align Alignment, LLT PtrTy);
   LegalizeResult lowerDynStackAlloc(MachineInstr &MI);
   LegalizeResult lowerStackSave(MachineInstr &MI);
   LegalizeResult lowerStackRestore(MachineInstr &MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 3fa659eff652a8d..56118e5bef35cbd 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7042,21 +7042,12 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
   return Legalized;
 }
 
-LegalizerHelper::LegalizeResult
-LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
-  const auto &MF = *MI.getMF();
-  const auto &TFI = *MF.getSubtarget().getFrameLowering();
-  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
-    return UnableToLegalize;
-
-  Register Dst = MI.getOperand(0).getReg();
-  Register AllocSize = MI.getOperand(1).getReg();
-  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
-
-  LLT PtrTy = MRI.getType(Dst);
+Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
+                                                    Register AllocSize,
+                                                    Align Alignment,
+                                                    LLT PtrTy) {
   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
 
-  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
 
@@ -7071,7 +7062,25 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
   }
 
-  SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
+  return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
+  const auto &MF = *MI.getMF();
+  const auto &TFI = *MF.getSubtarget().getFrameLowering();
+  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
+    return UnableToLegalize;
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register AllocSize = MI.getOperand(1).getReg();
+  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
+
+  LLT PtrTy = MRI.getType(Dst);
+  Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
+  Register SPTmp =
+      getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
+
   MIRBuilder.buildCopy(SPReg, SPTmp);
   MIRBuilder.buildCopy(Dst, SPTmp);
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e665bf42a98de8a..21a412e9360dce3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -934,9 +934,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
 
   getActionDefinitionsBuilder(G_BRJT).legalFor({{p0, s64}});
 
-  getActionDefinitionsBuilder({G_DYN_STACKALLOC,
-                               G_STACKSAVE,
-                               G_STACKRESTORE}).lower();
+  getActionDefinitionsBuilder(G_DYN_STACKALLOC).custom();
+
+  getActionDefinitionsBuilder({G_STACKSAVE, G_STACKRESTORE}).lower();
 
   if (ST.hasMOPS()) {
     // G_BZERO is not supported. Currently it is only emitted by
@@ -1174,6 +1174,8 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
     return legalizeFCopySign(MI, Helper);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return legalizeExtractVectorElt(MI, MRI, Helper);
+  case TargetOpcode::G_DYN_STACKALLOC:
+    return legalizeDynStackAlloc(MI, Helper);
   }
 
   llvm_unreachable("expected switch to return");
@@ -1945,3 +1947,42 @@ bool AArch64LegalizerInfo::legalizeExtractVectorElt(
   return Helper.lowerExtractInsertVectorElt(MI) !=
          LegalizerHelper::LegalizeResult::UnableToLegalize;
 }
+
+bool AArch64LegalizerInfo::legalizeDynStackAlloc(
+    MachineInstr &MI, LegalizerHelper &Helper) const {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+
+  // If stack probing is not enabled for this function, use the default
+  // lowering.
+  if (!MF.getFunction().hasFnAttribute("probe-stack") ||
+      MF.getFunction().getFnAttribute("probe-stack").getValueAsString() !=
+          "inline-asm") {
+    Helper.lowerDynStackAlloc(MI);
+    return true;
+  }
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register AllocSize = MI.getOperand(1).getReg();
+  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
+
+  assert(MRI.getType(Dst) == LLT::pointer(0, 64) &&
+         "Unexpected type for dynamic alloca");
+  assert(MRI.getType(AllocSize) == LLT::scalar(64) &&
+         "Unexpected type for dynamic alloca");
+
+  LLT PtrTy = MRI.getType(Dst);
+  Register SPReg =
+      Helper.getTargetLowering().getStackPointerRegisterToSaveRestore();
+  Register SPTmp =
+      Helper.getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
+  auto NewMI =
+      MIRBuilder.buildInstr(AArch64::PROBED_STACKALLOC_DYN, {}, {SPTmp});
+  MRI.setRegClass(NewMI.getReg(0), &AArch64::GPR64commonRegClass);
+  MIRBuilder.setInsertPt(*NewMI->getParent(), NewMI);
+  MIRBuilder.buildCopy(Dst, SPTmp);
+
+  MI.eraseFromParent();
+  return true;
+}
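
The probed path is gated on the function attribute checked at the top of
legalizeDynStackAlloc; in IR terms the tests below opt in with
"probe-stack"="inline-asm". A hedged sketch of the same predicate as a
standalone helper (the name wantsInlineStackProbe is illustrative):

  #include "llvm/IR/Function.h"

  // Sketch: does this function request inline stack probes?
  // Mirrors the attribute test used to choose between the probed
  // lowering and Helper.lowerDynStackAlloc above.
  static bool wantsInlineStackProbe(const llvm::Function &F) {
    return F.hasFnAttribute("probe-stack") &&
           F.getFnAttribute("probe-stack").getValueAsString() == "inline-asm";
  }

When the predicate holds, the legalizer emits PROBED_STACKALLOC_DYN instead of
copying the target pointer to SP directly; the pseudo updates SP itself (note
the implicit-def $sp on the PROBED_STACKALLOC_DYN lines in the MIR tests below).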
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index e6c9182da912dba..6fd859d334cd81d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -63,6 +63,7 @@ class AArch64LegalizerInfo : public LegalizerInfo {
   bool legalizeFCopySign(MachineInstr &MI, LegalizerHelper &Helper) const;
   bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
                                 LegalizerHelper &Helper) const;
+  bool legalizeDynStackAlloc(MachineInstr &MI, LegalizerHelper &Helper) const;
   const AArch64Subtarget *ST;
 };
 } // End llvm namespace.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
index e9188fb89f699a0..c0a286b0a1ca0ce 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-dyn-alloca.mir
@@ -19,6 +19,21 @@
     ret i128* %addr
   }
 
+  define i8* @test_simple_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" {
+    %addr = alloca i8, i32 %numelts
+    ret i8* %addr
+  }
+
+  define i8* @test_aligned_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" {
+    %addr = alloca i8, i32 %numelts, align 32
+    ret i8* %addr
+  }
+
+  define i128* @test_natural_alloca_stack_probing(i32 %numelts) "probe-stack"="inline-asm" {
+    %addr = alloca i128, i32 %numelts
+    ret i128* %addr
+  }
+
 ...
 ---
 name:            test_simple_alloca
@@ -37,22 +52,23 @@ body:             |
 
     ; CHECK-LABEL: name: test_simple_alloca
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-    ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-    ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-    ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-    ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
-    ; CHECK: $sp = COPY [[INTTOPTR]](p0)
-    ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-    ; CHECK: $x0 = COPY [[COPY2]](p0)
-    ; CHECK: RET_ReallyLR implicit $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
+    ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
     %0:_(s32) = COPY $w0
     %3:_(s64) = G_CONSTANT i64 1
     %1:_(s64) = G_ZEXT %0(s32)
@@ -83,24 +99,25 @@ body:             |
 
     ; CHECK-LABEL: name: test_aligned_alloca
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-    ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-    ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-    ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-    ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
-    ; CHECK: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
-    ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64)
-    ; CHECK: $sp = COPY [[INTTOPTR]](p0)
-    ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-    ; CHECK: $x0 = COPY [[COPY2]](p0)
-    ; CHECK: RET_ReallyLR implicit $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[AND1]](s64)
+    ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
     %0:_(s32) = COPY $w0
     %3:_(s64) = G_CONSTANT i64 1
     %1:_(s64) = G_ZEXT %0(s32)
@@ -131,22 +148,23 @@ body:             |
 
     ; CHECK-LABEL: name: test_natural_alloca
     ; CHECK: liveins: $w0
-    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
-    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
-    ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
-    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
-    ; CHECK: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
-    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
-    ; CHECK: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
-    ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
-    ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
-    ; CHECK: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
-    ; CHECK: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
-    ; CHECK: $sp = COPY [[INTTOPTR]](p0)
-    ; CHECK: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
-    ; CHECK: $x0 = COPY [[COPY2]](p0)
-    ; CHECK: RET_ReallyLR implicit $x0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[MUL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[SUB]](s64)
+    ; CHECK-NEXT: $sp = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
     %0:_(s32) = COPY $w0
     %3:_(s64) = G_CONSTANT i64 16
     %1:_(s64) = G_ZEXT %0(s32)
@@ -160,3 +178,146 @@ body:             |
     RET_ReallyLR implicit $x0
 
 ...
+---
+name:            test_simple_alloca_stack_probing
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  maxAlignment:    1
+stack:
+  - { id: 0, name: addr, type: variable-sized, alignment: 1 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $w0
+
+    ; CHECK-LABEL: name: test_simple_alloca_stack_probing
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_ZEXT %0(s32)
+    %9:_(s64) = G_CONSTANT i64 0
+    %2:_(s64) = G_SHL %1, %9(s64)
+    %4:_(s64) = G_CONSTANT i64 15
+    %5:_(s64) = nuw G_ADD %2, %4
+    %6:_(s64) = G_CONSTANT i64 -16
+    %7:_(s64) = G_AND %5, %6
+    %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1
+    $x0 = COPY %8(p0)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            test_aligned_alloca_stack_probing
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  maxAlignment:    32
+stack:
+  - { id: 0, name: addr, type: variable-sized, alignment: 32 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $w0
+
+    ; CHECK-LABEL: name: test_aligned_alloca_stack_probing
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
+    ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 -32
+    ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[SUB]], [[C3]]
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[AND1]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_ZEXT %0(s32)
+    %9:_(s64) = G_CONSTANT i64 0
+    %2:_(s64) = G_SHL %1, %9(s64)
+    %4:_(s64) = G_CONSTANT i64 15
+    %5:_(s64) = nuw G_ADD %2, %4
+    %6:_(s64) = G_CONSTANT i64 -16
+    %7:_(s64) = G_AND %5, %6
+    %8:_(p0) = G_DYN_STACKALLOC %7(s64), 32
+    $x0 = COPY %8(p0)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name:            test_natural_alloca_stack_probing
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+frameInfo:
+  maxAlignment:    1
+stack:
+  - { id: 0, name: addr, type: variable-sized, alignment: 1 }
+machineFunctionInfo: {}
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: $w0
+
+    ; CHECK-LABEL: name: test_natural_alloca_stack_probing
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[ZEXT]], [[C]](s64)
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 15
+    ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = nuw G_ADD [[SHL]], [[C1]]
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -16
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ADD]], [[C2]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $sp
+    ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY1]](p0)
+    ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[PTRTOINT]], [[AND]]
+    ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:gpr64common(p0) = G_INTTOPTR [[SUB]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(p0) = COPY [[INTTOPTR]](p0)
+    ; CHECK-NEXT: PROBED_STACKALLOC_DYN [[INTTOPTR]](p0), implicit-def $sp, implicit-def $nzcv, implicit $sp
+    ; CHECK-NEXT: $x0 = COPY [[COPY2]](p0)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_ZEXT %0(s32)
+    %9:_(s64) = G_CONSTANT i64 4
+    %2:_(s64) = G_SHL %1, %9(s64)
+    %4:_(s64) = G_CONSTANT i64 15
+    %5:_(s64) = nuw G_ADD %2, %4
+    %6:_(s64) = G_CONSTANT i64 -16
+    %7:_(s64) = G_AND %5, %6
+    %8:_(p0) = G_DYN_STACKALLOC %7(s64), 1
+    $x0 = COPY %8(p0)
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 6996416a8243009..ae15e74a43277ab 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -665,15 +665,14 @@
 # DEBUG-NEXT: G_JUMP_TABLE (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 1, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
-# DEBUG-NEXT: G_DYN_STACKALLOC (opcode [[DYN_STACKALLOC:[0-9]+]]): 2 type indices, 0 imm indices
+# DEBUG-NEXT: G_DYN_STACKALLOC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: G_STACKSAVE (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]]
+# DEBUG-NEXT: G_STACKSAVE (opcode [[STACKSAVE:[0-9]+]]): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[DYN_STACKALLOC]]
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to [[STACKSAVE]]
 # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
 # DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
index 78b83099c7ed9fb..d247ed1b5997750 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs                                   | FileCheck %s
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
 
 ; Dynamically-sized allocation, needs a loop which can handle any size at
 ; runtime. The final iteration of the loop will temporarily put SP below the


