[llvm] [RISCV] Stack clash protection for dynamic alloca (PR #122508)

Raphael Moreira Zinsly via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 14 12:27:28 PST 2025


https://github.com/rzinsly updated https://github.com/llvm/llvm-project/pull/122508

From 600dbfbcf28ba8c7f174503b96f5728ba110eb9d Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly <rzinsly at ventanamicro.com>
Date: Wed, 13 Nov 2024 15:55:52 -0300
Subject: [PATCH 1/3] [RISCV] Stack clash protection for dynamic alloca

Create a probe loop for dynamic stack allocations and add the
corresponding SelectionDAG support needed to emit it.
---
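Notes for reviewers: the core of the change is an inline probe loop that
walks SP down in probe-size steps and stores to each newly exposed block, so
a variable-sized alloca can never jump past the guard page. Below is a
minimal reproducer (hypothetical names; the attribute spelling is the one
the new tests rely on), with the loop shape the custom inserter emits:

    ; Hypothetical reproducer: with inline stack probing enabled, a
    ; variable-sized alloca is now custom-lowered instead of expanded
    ; generically.
    define ptr @probe_me(i64 %n) "probe-stack"="inline-asm" {
      %p = alloca i8, i64 %n, align 16
      ret ptr %p
    }
    ; Expected rv64 loop (registers illustrative; see the autogenerated
    ; checks in the new tests):
    ;           lui   a2, 1            ; probe step, 4096 bytes by default
    ;   .LBB0_1:
    ;           sub   sp, sp, a2       ; drop SP by one probe-size block
    ;           sd    zero, 0(sp)      ; touch it so an overrun faults here
    ;           blt   a0, sp, .LBB0_1  ; a0 holds the target SP
    ;           mv    sp, a0           ; final adjustment to the exact target
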
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  |  69 ++-
 llvm/lib/Target/RISCV/RISCVFrameLowering.h    |   6 +
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 105 +++-
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   9 +
 llvm/lib/Target/RISCV/RISCVInstrInfo.td       |  10 +
 .../RISCV/rvv/stack-probing-dynamic.ll        | 552 ++++++++++++++++++
 .../CodeGen/RISCV/stack-clash-prologue.ll     | 125 ++++
 7 files changed, 868 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index ed3ec310280670..8cef3f845065f0 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -545,6 +545,16 @@ void RISCVFrameLowering::allocateAndProbeStackForRVV(
       .addReg(SPReg)
       .addReg(TargetReg)
       .setMIFlag(Flag);
+
+  // If we have a dynamic allocation later we need to probe any residuals.
+  MachineBasicBlock *NextMBB = MBBI->getParent()->getSingleSuccessor();
+  if (NextMBB != NULL && NextMBB->begin()->getFlag(MachineInstr::FrameSetup)) {
+    BuildMI(MBB, MBBI, DL, TII->get(STI.is64Bit() ? RISCV::SD : RISCV::SW))
+        .addReg(RISCV::X0)
+        .addReg(SPReg)
+        .addImm(0)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
 }
 
 static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
@@ -639,6 +649,15 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
   DebugLoc DL;
   const RISCVRegisterInfo *RI = STI.getRegisterInfo();
   const RISCVInstrInfo *TII = STI.getInstrInfo();
+  bool IsRV64 = STI.is64Bit();
+  bool dyn_alloca = false;
+
+  // If we have a dynamic allocation later we need to probe any residuals.
+  if (NeedProbe) {
+    MachineBasicBlock *NextMBB = MBBI->getParent()->getSingleSuccessor();
+    dyn_alloca = (NextMBB != NULL &&
+                  NextMBB->begin()->getFlag(MachineInstr::FrameSetup));
+  }
 
   // Simply allocate the stack if it's not big enough to require a probe.
   if (!NeedProbe || Offset <= ProbeSize) {
@@ -654,13 +673,21 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
           .setMIFlag(MachineInstr::FrameSetup);
     }
 
+    if (dyn_alloca) {
+      // s[d|w] zero, 0(sp)
+      BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+          .addReg(RISCV::X0)
+          .addReg(SPReg)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+
     return;
   }
 
   // Unroll the probe loop depending on the number of iterations.
   if (Offset < ProbeSize * 5) {
     uint64_t CurrentOffset = 0;
-    bool IsRV64 = STI.is64Bit();
     while (CurrentOffset + ProbeSize <= Offset) {
       RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
                     StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup,
@@ -696,6 +723,15 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
             .addCFIIndex(CFIIndex)
             .setMIFlag(MachineInstr::FrameSetup);
       }
+
+      if (dyn_alloca) {
+        // s[d|w] zero, 0(sp)
+        BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+            .addReg(RISCV::X0)
+            .addReg(SPReg)
+            .addImm(0)
+            .setMIFlags(MachineInstr::FrameSetup);
+      }
     }
 
     return;
@@ -736,9 +772,18 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
         .setMIFlags(MachineInstr::FrameSetup);
   }
 
-  if (Residual)
+  if (Residual) {
     RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual),
                   MachineInstr::FrameSetup, getStackAlign());
+    if (dyn_alloca) {
+      // s[d|w] zero, 0(sp)
+      BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+          .addReg(RISCV::X0)
+          .addReg(SPReg)
+          .addImm(0)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+  }
 
   if (EmitCFI) {
     // Emit ".cfi_def_cfa_offset Offset"
@@ -2084,9 +2129,10 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
 }
 
 // Synthesize the probe loop.
-static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MBBI, DebugLoc DL,
-                                 Register TargetReg, bool IsRVV) {
+MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
+    bool IsRVV) const {
   assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
 
   auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
@@ -2154,6 +2200,8 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
   MBB.addSuccessor(LoopTestMBB);
   // Update liveins.
   fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
+
+  return ExitMBB;
 }
 
 void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
@@ -2176,8 +2224,15 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
       MachineBasicBlock::iterator MBBI = MI->getIterator();
       DebugLoc DL = MBB.findDebugLoc(MBBI);
       Register TargetReg = MI->getOperand(1).getReg();
-      emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg,
-                           (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+      MachineBasicBlock *Succ = MBBI->getParent()->getSingleSuccessor();
+      MachineBasicBlock *Next = emitStackProbeInline(
+          MF, MBB, MBBI, DL, TargetReg,
+          (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+      // Update the successor info if we have a BB from a dynamic allocation.
+      if (Succ != NULL && Succ->begin()->getFlag(MachineInstr::FrameSetup)) {
+        MBBI->getParent()->removeSuccessor(Succ);
+        Next->addSuccessor(Succ);
+      }
       MBBI->eraseFromParent();
     }
   }
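
A note on the repeated "s[d|w] zero, 0(sp)" stores added above: when the
static prologue ends with a residual allocation smaller than the probe size,
those bytes are normally left unprobed. If a dynamic allocation follows, its
probe loop steps down from that SP, so the unprobed residual plus the loop's
first step could span more than one guard page; storing zero at the new SP
closes that gap. A reduced example of the pattern being guarded against
(hypothetical, mirroring the dynamic_fixed test added below):

    ; The 64-byte slot alone is too small to need a probe, but because a
    ; variable-sized alloca follows, the prologue now also probes the
    ; residual before the dynamic loop runs.
    define void @fixed_then_dynamic(i64 %n, ptr %out) "probe-stack"="inline-asm" {
      %slot = alloca [64 x i8]
      %buf = alloca i8, i64 %n
      store ptr %buf, ptr %out
      ret void
    }
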
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 26d2a26d681c35..1a2c6e0302623d 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -83,6 +83,12 @@ class RISCVFrameLowering : public TargetFrameLowering {
                      uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
                      uint64_t ProbeSize) const;
 
+  MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          DebugLoc DL, Register TargetReg,
+                                          bool IsRVV) const;
+
 protected:
   const RISCVSubtarget &STI;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 95f1deed8b6c02..512e0e8f098889 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -280,7 +280,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                    MVT::i1, Promote);
 
   // TODO: add all necessary setOperationAction calls.
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Custom);
 
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
   setOperationAction(ISD::BR_CC, XLenVT, Expand);
@@ -7684,6 +7684,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return emitFlushICache(DAG, Op.getOperand(0), Op.getOperand(1),
                            Op.getOperand(2), Flags, DL);
   }
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::INIT_TRAMPOLINE:
     return lowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::ADJUST_TRAMPOLINE:
@@ -19598,6 +19600,8 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case RISCV::PseudoFROUND_D_INX:
   case RISCV::PseudoFROUND_D_IN32X:
     return emitFROUND(MI, BB, Subtarget);
+  case RISCV::PROBED_STACKALLOC_DYN:
+    return EmitDynamicProbedAlloc(MI, BB);
   case TargetOpcode::STATEPOINT:
     // STATEPOINT is a pseudo instruction which has no implicit defs/uses
     // while jal call instruction (where statepoint will be lowered at the end)
@@ -20830,6 +20834,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(SF_VC_V_IVW_SE)
   NODE_NAME_CASE(SF_VC_V_VVW_SE)
   NODE_NAME_CASE(SF_VC_V_FVW_SE)
+  NODE_NAME_CASE(PROBED_ALLOCA)
   }
   // clang-format on
   return nullptr;
@@ -22559,3 +22564,101 @@ unsigned RISCVTargetLowering::getStackProbeSize(const MachineFunction &MF,
   StackProbeSize = alignDown(StackProbeSize, StackAlign.value());
   return StackProbeSize ? StackProbeSize : StackAlign.value();
 }
+
+SDValue RISCVTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  if (!hasInlineStackProbe(MF))
+    return SDValue();
+
+  MVT XLenVT = Subtarget.getXLenVT();
+  // Get the inputs.
+  SDNode *Node = Op.getNode();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+
+  MaybeAlign Align =
+      cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+  SDLoc dl(Op);
+  EVT VT = Node->getValueType(0);
+
+  // Construct the new SP value in a GPR.
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, RISCV::X2, XLenVT);
+  Chain = SP.getValue(1);
+  SP = DAG.getNode(ISD::SUB, dl, XLenVT, SP, Size);
+  if (Align)
+    SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                     DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT));
+
+  // Set the real SP to the new value with a probing loop.
+  Chain = DAG.getNode(RISCVISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+  SDValue Ops[2] = {SP, Chain};
+  return DAG.getMergeValues(Ops, dl);
+}
+
+MachineBasicBlock *
+RISCVTargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) const {
+  MachineFunction &MF = *MBB->getParent();
+  MachineBasicBlock::iterator MBBI = MI.getIterator();
+  DebugLoc DL = MBB->findDebugLoc(MBBI);
+  Register TargetReg = MI.getOperand(1).getReg();
+
+  auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
+  const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
+  bool IsRV64 = Subtarget.is64Bit();
+  Align StackAlign = Subtarget.getFrameLowering()->getStackAlign();
+  const RISCVTargetLowering *TLI = Subtarget.getTargetLowering();
+  uint64_t ProbeSize = TLI->getStackProbeSize(MF, StackAlign);
+
+  MachineFunction::iterator MBBInsertPoint = std::next(MBB->getIterator());
+  MachineBasicBlock *LoopTestMBB =
+      MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+  MF.insert(MBBInsertPoint, LoopTestMBB);
+  MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+  MF.insert(MBBInsertPoint, ExitMBB);
+  MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
+  Register SPReg = RISCV::X2;
+  Register ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
+
+  // ScratchReg = ProbeSize
+  TII->movImm(*MBB, MBBI, DL, ScratchReg, ProbeSize, Flags);
+
+  // LoopTest:
+  //   SUB SP, SP, ProbeSize
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB), SPReg)
+      .addReg(SPReg)
+      .addReg(ScratchReg)
+      .setMIFlags(Flags);
+
+  //   s[d|w] zero, 0(sp)
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL,
+          TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+      .addReg(RISCV::X0)
+      .addReg(SPReg)
+      .addImm(0)
+      .setMIFlags(Flags);
+
+  //  BLT TargetReg, SP, LoopTest
+  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BLT))
+      .addReg(TargetReg)
+      .addReg(SPReg)
+      .addMBB(LoopTestMBB)
+      .setMIFlags(Flags);
+
+  // Adjust with: MV SP, TargetReg.
+  BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(RISCV::ADDI), SPReg)
+      .addReg(TargetReg)
+      .addImm(0)
+      .setMIFlags(Flags);
+
+  ExitMBB->splice(ExitMBB->end(), MBB, std::next(MBBI), MBB->end());
+
+  LoopTestMBB->addSuccessor(ExitMBB);
+  LoopTestMBB->addSuccessor(LoopTestMBB);
+  MBB->addSuccessor(LoopTestMBB);
+
+  MI.eraseFromParent();
+  return ExitMBB->begin()->getParent();
+}
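
The DAG-level half above is deliberately small: LowerDYNAMIC_STACKALLOC only
computes the target SP and chains it through the new RISCVISD::PROBED_ALLOCA
node; EmitDynamicProbedAlloc then expands the matching pseudo into the loop
blocks after selection. The target-SP computation is equivalent to the
following IR (a sketch for illustration; in the patch it is built as SUB and
AND SelectionDAG nodes, and the AND is emitted only for overaligned allocas):

    ; new_sp = (sp - size) & -align, shown for a 64-byte-aligned alloca.
    define i64 @new_sp_equiv(i64 %sp, i64 %size) {
      %sub = sub i64 %sp, %size
      %aligned = and i64 %sub, -64
      ret i64 %aligned
    }
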
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ea077c7d2d23a5..752faa5778984e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -461,6 +461,10 @@ enum NodeType : unsigned {
   SF_VC_V_VVW_SE,
   SF_VC_V_FVW_SE,
 
+  // To avoid stack clash, allocation is performed by block and each block is
+  // probed.
+  PROBED_ALLOCA,
+
   // RISC-V vector tuple type version of INSERT_SUBVECTOR/EXTRACT_SUBVECTOR.
   TUPLE_INSERT,
   TUPLE_EXTRACT,
@@ -922,6 +926,9 @@ class RISCVTargetLowering : public TargetLowering {
 
   unsigned getStackProbeSize(const MachineFunction &MF, Align StackAlign) const;
 
+  MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) const;
+
 private:
   void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
                         const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
@@ -1015,6 +1022,8 @@ class RISCVTargetLowering : public TargetLowering {
 
   SDValue lowerVectorStrictFSetcc(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
   SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index ee86f53a5c8a8d..05ff9c70cacd43 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -100,6 +100,11 @@ def riscv_add_tprel : SDNode<"RISCVISD::ADD_TPREL",
                                                   SDTCisSameAs<0, 3>,
                                                   SDTCisInt<0>]>>;
 
+def riscv_probed_alloca : SDNode<"RISCVISD::PROBED_ALLOCA",
+                                  SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+                                                       SDTCisVT<0, i32>]>,
+                                  [SDNPHasChain, SDNPMayStore]>;
+
 //===----------------------------------------------------------------------===//
 // Operand and SDNode transformation definitions.
 //===----------------------------------------------------------------------===//
@@ -1428,6 +1433,11 @@ def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
                                (ins GPR:$scratch),
                                []>,
                                Sched<[]>;
+let usesCustomInserter = 1 in
+def PROBED_STACKALLOC_DYN : Pseudo<(outs GPR:$rd),
+                               (ins GPR:$scratch),
+                               [(set GPR:$rd, (riscv_probed_alloca GPR:$scratch))]>,
+                               Sched<[]>;
 }
 
 /// HI and ADD_LO address nodes.
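
The node and pseudo above complete the plumbing: ISD::DYNAMIC_STACKALLOC is
custom-lowered to riscv_probed_alloca, the pattern selects that node into
PROBED_STACKALLOC_DYN, and usesCustomInserter routes it to
EmitDynamicProbedAlloc for expansion. The same path covers scalable allocas,
which is presumably why the new test file sits under rvv/ and enables +v; a
minimal sketch mirroring its dynamic_vector test:

    ; A scalable alloca whose element count is only known at runtime goes
    ; through the same probe loop; the byte size is vlenb-scaled first.
    define ptr @probe_scalable(i64 %n) "probe-stack"="inline-asm" {
      %v = alloca <vscale x 4 x float>, i64 %n, align 16
      ret ptr %v
    }
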
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
new file mode 100644
index 00000000000000..6dda0c38a070ac
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -0,0 +1,552 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32I
+
+; Tests copied from AArch64.
+
+; Dynamically-sized allocation, needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack, and also doesn't probe below the target SP value.
+define void @dynamic(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    .cfi_def_cfa_offset 16
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    addi s0, sp, 16
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    addi a0, a0, 15
+; RV64I-NEXT:    andi a0, a0, -16
+; RV64I-NEXT:    sub a0, sp, a0
+; RV64I-NEXT:    lui a2, 1
+; RV64I-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    sub sp, sp, a2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    blt a0, sp, .LBB0_1
+; RV64I-NEXT:  # %bb.2:
+; RV64I-NEXT:    mv sp, a0
+; RV64I-NEXT:    sd a0, 0(a1)
+; RV64I-NEXT:    addi sp, s0, -16
+; RV64I-NEXT:    .cfi_def_cfa sp, 16
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: dynamic:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    addi s0, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    addi a0, a0, 15
+; RV32I-NEXT:    andi a0, a0, -16
+; RV32I-NEXT:    sub a0, sp, a0
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:  .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    blt a0, sp, .LBB0_1
+; RV32I-NEXT:  # %bb.2:
+; RV32I-NEXT:    mv sp, a0
+; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    addi sp, s0, -16
+; RV32I-NEXT:    .cfi_def_cfa sp, 16
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+  %v = alloca i8, i64 %size, align 1
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed size
+; slot isn't large enough that we would normally probe it, but we need to do so
+; here otherwise the gap between the CSR save and the first probe of the
+; dynamic allocation could be too far apart when the size of the dynamic
+; allocation is close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+; RV64I-LABEL: dynamic_fixed:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -80
+; RV64I-NEXT:    .cfi_def_cfa_offset 80
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    addi s0, sp, 80
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    addi a3, s0, -80
+; RV64I-NEXT:    addi a0, a0, 15
+; RV64I-NEXT:    sd a3, 0(a1)
+; RV64I-NEXT:    andi a0, a0, -16
+; RV64I-NEXT:    sub a0, sp, a0
+; RV64I-NEXT:    lui a1, 1
+; RV64I-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    sub sp, sp, a1
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    blt a0, sp, .LBB1_1
+; RV64I-NEXT:  # %bb.2:
+; RV64I-NEXT:    mv sp, a0
+; RV64I-NEXT:    sd a0, 0(a2)
+; RV64I-NEXT:    addi sp, s0, -80
+; RV64I-NEXT:    .cfi_def_cfa sp, 80
+; RV64I-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    addi sp, sp, 80
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: dynamic_fixed:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -80
+; RV32I-NEXT:    .cfi_def_cfa_offset 80
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    addi s0, sp, 80
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    addi a1, s0, -72
+; RV32I-NEXT:    addi a0, a0, 15
+; RV32I-NEXT:    sw a1, 0(a2)
+; RV32I-NEXT:    andi a0, a0, -16
+; RV32I-NEXT:    sub a0, sp, a0
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:  .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    blt a0, sp, .LBB1_1
+; RV32I-NEXT:  # %bb.2:
+; RV32I-NEXT:    mv sp, a0
+; RV32I-NEXT:    sw a0, 0(a3)
+; RV32I-NEXT:    addi sp, s0, -80
+; RV32I-NEXT:    .cfi_def_cfa sp, 80
+; RV32I-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    addi sp, sp, 80
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+  %v1 = alloca i8, i64 64, align 1
+  store ptr %v1, ptr %out1, align 8
+  %v2 = alloca i8, i64 %size, align 1
+  store ptr %v2, ptr %out2, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by ANDing the target SP with a constant to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the prolog,
+; which isn't actually needed because the only aligned allocations are dynamic;
+; this is done even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_align_64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -64
+; RV64I-NEXT:    .cfi_def_cfa_offset 64
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    addi s0, sp, 64
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    andi sp, sp, -64
+; RV64I-NEXT:    mv s1, sp
+; RV64I-NEXT:    addi a0, a0, 15
+; RV64I-NEXT:    andi a0, a0, -16
+; RV64I-NEXT:    sub a0, sp, a0
+; RV64I-NEXT:    andi a0, a0, -64
+; RV64I-NEXT:    lui a2, 1
+; RV64I-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    sub sp, sp, a2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    blt a0, sp, .LBB2_1
+; RV64I-NEXT:  # %bb.2:
+; RV64I-NEXT:    mv sp, a0
+; RV64I-NEXT:    sd a0, 0(a1)
+; RV64I-NEXT:    addi sp, s0, -64
+; RV64I-NEXT:    .cfi_def_cfa sp, 64
+; RV64I-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 64
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: dynamic_align_64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -64
+; RV32I-NEXT:    .cfi_def_cfa_offset 64
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw ra, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    addi s0, sp, 64
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    andi sp, sp, -64
+; RV32I-NEXT:    mv s1, sp
+; RV32I-NEXT:    addi a0, a0, 15
+; RV32I-NEXT:    andi a0, a0, -16
+; RV32I-NEXT:    sub a0, sp, a0
+; RV32I-NEXT:    andi a0, a0, -64
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    blt a0, sp, .LBB2_1
+; RV32I-NEXT:  # %bb.2:
+; RV32I-NEXT:    mv sp, a0
+; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    addi sp, s0, -64
+; RV32I-NEXT:    .cfi_def_cfa sp, 64
+; RV32I-NEXT:    lw ra, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 64
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+  %v = alloca i8, i64 %size, align 64
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference to the dynamic allocation is the constant used for aligning
+; the target SP; the loop will probe the whole allocation without needing to
+; know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_align_8192:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -2032
+; RV64I-NEXT:    .cfi_def_cfa_offset 2032
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 2008(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    addi s0, sp, 2032
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    lui a2, 1
+; RV64I-NEXT:    sub sp, sp, a2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sub sp, sp, a2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sub sp, sp, a2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    addi sp, sp, -2048
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    srli a2, sp, 13
+; RV64I-NEXT:    slli sp, a2, 13
+; RV64I-NEXT:    mv s1, sp
+; RV64I-NEXT:    addi a0, a0, 15
+; RV64I-NEXT:    lui a2, 1048574
+; RV64I-NEXT:    andi a0, a0, -16
+; RV64I-NEXT:    sub a0, sp, a0
+; RV64I-NEXT:    and a0, a0, a2
+; RV64I-NEXT:    lui a2, 1
+; RV64I-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    sub sp, sp, a2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    blt a0, sp, .LBB3_1
+; RV64I-NEXT:  # %bb.2:
+; RV64I-NEXT:    mv sp, a0
+; RV64I-NEXT:    sd a0, 0(a1)
+; RV64I-NEXT:    addi sp, s0, -2032
+; RV64I-NEXT:    .cfi_def_cfa sp, 2032
+; RV64I-NEXT:    ld ra, 2024(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 2016(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 2008(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 2032
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: dynamic_align_8192:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -2032
+; RV32I-NEXT:    .cfi_def_cfa_offset 2032
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw ra, 2028(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 2024(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 2020(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    addi s0, sp, 2032
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    addi sp, sp, -2048
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    srli a1, sp, 13
+; RV32I-NEXT:    slli sp, a1, 13
+; RV32I-NEXT:    mv s1, sp
+; RV32I-NEXT:    addi a0, a0, 15
+; RV32I-NEXT:    lui a1, 1048574
+; RV32I-NEXT:    andi a0, a0, -16
+; RV32I-NEXT:    sub a0, sp, a0
+; RV32I-NEXT:    and a0, a0, a1
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:  .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    blt a0, sp, .LBB3_1
+; RV32I-NEXT:  # %bb.2:
+; RV32I-NEXT:    mv sp, a0
+; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    addi sp, s0, -2032
+; RV32I-NEXT:    .cfi_def_cfa sp, 2032
+; RV32I-NEXT:    lw ra, 2028(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 2024(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 2020(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 2032
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+  %v = alloca i8, i64 %size, align 8192
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame. We need to probe
+; that space when it is larger than the unprobed space allowed by the ABI, so
+; this needs a very large number of arguments.
+define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 {
+; RV64I-LABEL: no_reserved_call_frame:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    .cfi_def_cfa_offset 16
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    addi s0, sp, 16
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    addi a0, a0, 15
+; RV64I-NEXT:    andi a0, a0, -16
+; RV64I-NEXT:    sub a0, sp, a0
+; RV64I-NEXT:    lui a2, 1
+; RV64I-NEXT:  .LBB4_1: # %entry
+; RV64I-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    sub sp, sp, a2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    blt a0, sp, .LBB4_1
+; RV64I-NEXT:  # %bb.2: # %entry
+; RV64I-NEXT:    mv sp, a0
+; RV64I-NEXT:    call callee_stack_args
+; RV64I-NEXT:    addi sp, s0, -16
+; RV64I-NEXT:    .cfi_def_cfa sp, 16
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: no_reserved_call_frame:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    addi s0, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    addi a0, a0, 15
+; RV32I-NEXT:    andi a0, a0, -16
+; RV32I-NEXT:    sub a0, sp, a0
+; RV32I-NEXT:    lui a2, 1
+; RV32I-NEXT:  .LBB4_1: # %entry
+; RV32I-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    sub sp, sp, a2
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    blt a0, sp, .LBB4_1
+; RV32I-NEXT:  # %bb.2: # %entry
+; RV32I-NEXT:    mv sp, a0
+; RV32I-NEXT:    call callee_stack_args
+; RV32I-NEXT:    addi sp, s0, -16
+; RV32I-NEXT:    .cfi_def_cfa sp, 16
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+entry:
+  %v = alloca i32, i64 %n
+  call void @callee_stack_args(ptr %v, i32 %dummy)
+  ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n, i32 %dummy) #0 {
+; RV64I-LABEL: reserved_call_frame:
+; RV64I:       # %bb.0: # %entry
+; RV64I-NEXT:    addi sp, sp, -416
+; RV64I-NEXT:    .cfi_def_cfa_offset 416
+; RV64I-NEXT:    sd ra, 408(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    addi a0, sp, 8
+; RV64I-NEXT:    call callee_stack_args
+; RV64I-NEXT:    ld ra, 408(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    addi sp, sp, 416
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: reserved_call_frame:
+; RV32I:       # %bb.0: # %entry
+; RV32I-NEXT:    addi sp, sp, -416
+; RV32I-NEXT:    .cfi_def_cfa_offset 416
+; RV32I-NEXT:    sw ra, 412(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    mv a1, a2
+; RV32I-NEXT:    addi a0, sp, 12
+; RV32I-NEXT:    call callee_stack_args
+; RV32I-NEXT:    lw ra, 412(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    addi sp, sp, 416
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+entry:
+  %v = alloca i32, i64 100
+  call void @callee_stack_args(ptr %v, i32 %dummy)
+  ret void
+}
+
+declare void @callee_stack_args(ptr, i32)
+
+; Dynamic allocation of vectors
+define void @dynamic_vector(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_vector:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    .cfi_def_cfa_offset 16
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    addi s0, sp, 16
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    csrr a2, vlenb
+; RV64I-NEXT:    mul a0, a2, a0
+; RV64I-NEXT:    slli a0, a0, 1
+; RV64I-NEXT:    sub a0, sp, a0
+; RV64I-NEXT:    lui a2, 1
+; RV64I-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    sub sp, sp, a2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    blt a0, sp, .LBB6_1
+; RV64I-NEXT:  # %bb.2:
+; RV64I-NEXT:    mv sp, a0
+; RV64I-NEXT:    sd a0, 0(a1)
+; RV64I-NEXT:    addi sp, s0, -16
+; RV64I-NEXT:    .cfi_def_cfa sp, 16
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: dynamic_vector:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    .cfi_def_cfa_offset 16
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    addi s0, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    csrr a1, vlenb
+; RV32I-NEXT:    mul a0, a1, a0
+; RV32I-NEXT:    slli a0, a0, 1
+; RV32I-NEXT:    sub a0, sp, a0
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:  .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    blt a0, sp, .LBB6_1
+; RV32I-NEXT:  # %bb.2:
+; RV32I-NEXT:    mv sp, a0
+; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    addi sp, s0, -16
+; RV32I-NEXT:    .cfi_def_cfa sp, 16
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+  %v = alloca <vscale x 4 x float>, i64 %size, align 16
+  store ptr %v, ptr %out, align 8
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
diff --git a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
index 843e57a42d926d..b1c0755c36ec1f 100644
--- a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
+++ b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
@@ -606,4 +606,129 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
   ret i32 %c
 }
 
+define void @f11(i32 %vla_size, i64 %i) #0 {
+; RV64I-LABEL: f11:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -2032
+; RV64I-NEXT:    .cfi_def_cfa_offset 2032
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 2008(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    .cfi_offset s1, -24
+; RV64I-NEXT:    addi s0, sp, 2032
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    lui a2, 15
+; RV64I-NEXT:    sub t1, sp, a2
+; RV64I-NEXT:    lui t2, 1
+; RV64I-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    sub sp, sp, t2
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    bne sp, t1, .LBB11_1
+; RV64I-NEXT:  # %bb.2:
+; RV64I-NEXT:    addi sp, sp, -2048
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    srli a2, sp, 15
+; RV64I-NEXT:    slli sp, a2, 15
+; RV64I-NEXT:    mv s1, sp
+; RV64I-NEXT:    slli a1, a1, 2
+; RV64I-NEXT:    lui a2, 8
+; RV64I-NEXT:    add a2, s1, a2
+; RV64I-NEXT:    add a1, a2, a1
+; RV64I-NEXT:    li a2, 1
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    srli a0, a0, 32
+; RV64I-NEXT:    sw a2, 0(a1)
+; RV64I-NEXT:    addi a0, a0, 15
+; RV64I-NEXT:    andi a0, a0, -16
+; RV64I-NEXT:    sub a0, sp, a0
+; RV64I-NEXT:    andi a0, a0, -2048
+; RV64I-NEXT:    lui a1, 1
+; RV64I-NEXT:  .LBB11_3: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT:    sub sp, sp, a1
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    blt a0, sp, .LBB11_3
+; RV64I-NEXT:  # %bb.4:
+; RV64I-NEXT:    mv sp, a0
+; RV64I-NEXT:    lbu zero, 0(a0)
+; RV64I-NEXT:    addi sp, s0, -2032
+; RV64I-NEXT:    .cfi_def_cfa sp, 2032
+; RV64I-NEXT:    ld ra, 2024(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 2016(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 2008(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    .cfi_restore s1
+; RV64I-NEXT:    addi sp, sp, 2032
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: f11:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -2032
+; RV32I-NEXT:    .cfi_def_cfa_offset 2032
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    sw ra, 2028(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 2024(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s1, 2020(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    .cfi_offset s1, -12
+; RV32I-NEXT:    addi s0, sp, 2032
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    lui a2, 15
+; RV32I-NEXT:    sub t1, sp, a2
+; RV32I-NEXT:    lui t2, 1
+; RV32I-NEXT:  .LBB11_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    sub sp, sp, t2
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    bne sp, t1, .LBB11_1
+; RV32I-NEXT:  # %bb.2:
+; RV32I-NEXT:    addi sp, sp, -2048
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    srli a2, sp, 15
+; RV32I-NEXT:    slli sp, a2, 15
+; RV32I-NEXT:    mv s1, sp
+; RV32I-NEXT:    slli a1, a1, 2
+; RV32I-NEXT:    lui a2, 8
+; RV32I-NEXT:    add a2, s1, a2
+; RV32I-NEXT:    add a1, a2, a1
+; RV32I-NEXT:    li a2, 1
+; RV32I-NEXT:    addi a0, a0, 15
+; RV32I-NEXT:    andi a0, a0, -16
+; RV32I-NEXT:    sw a2, 0(a1)
+; RV32I-NEXT:    sub a0, sp, a0
+; RV32I-NEXT:    andi a0, a0, -2048
+; RV32I-NEXT:    lui a1, 1
+; RV32I-NEXT:  .LBB11_3: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT:    sub sp, sp, a1
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    blt a0, sp, .LBB11_3
+; RV32I-NEXT:  # %bb.4:
+; RV32I-NEXT:    mv sp, a0
+; RV32I-NEXT:    lbu zero, 0(a0)
+; RV32I-NEXT:    addi sp, s0, -2032
+; RV32I-NEXT:    .cfi_def_cfa sp, 2032
+; RV32I-NEXT:    lw ra, 2028(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 2024(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s1, 2020(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    .cfi_restore s1
+; RV32I-NEXT:    addi sp, sp, 2032
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+  %a = alloca i32, i32 4096, align 32768
+  %b = getelementptr inbounds i32, ptr %a, i64 %i
+  store volatile i32 1, ptr %b
+  %1 = zext i32 %vla_size to i64
+  %vla = alloca i8, i64 %1, align 2048
+  %2 = load volatile i8, ptr %vla, align 2048
+  ret void
+}
+
 attributes #0 = { "probe-stack"="inline-asm" }

From 4cf7ad94ef1b1cc52dd14e57387cd832c3b22d9b Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly <rzinsly at ventanamicro.com>
Date: Mon, 13 Jan 2025 10:13:47 -0300
Subject: [PATCH 2/3] Follow coding standards

---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 14 +++++++-------
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp  | 12 +++++-------
 llvm/lib/Target/RISCV/RISCVISelLowering.h    |  4 ++--
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 8cef3f845065f0..9a7ee5f8a477b9 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -548,7 +548,7 @@ void RISCVFrameLowering::allocateAndProbeStackForRVV(
 
   // If we have a dynamic allocation later we need to probe any residuals.
   MachineBasicBlock *NextMBB = MBBI->getParent()->getSingleSuccessor();
-  if (NextMBB != NULL && NextMBB->begin()->getFlag(MachineInstr::FrameSetup)) {
+  if (NextMBB && NextMBB->begin()->getFlag(MachineInstr::FrameSetup)) {
     BuildMI(MBB, MBBI, DL, TII->get(STI.is64Bit() ? RISCV::SD : RISCV::SW))
         .addReg(RISCV::X0)
         .addReg(SPReg)
@@ -650,13 +650,13 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
   const RISCVRegisterInfo *RI = STI.getRegisterInfo();
   const RISCVInstrInfo *TII = STI.getInstrInfo();
   bool IsRV64 = STI.is64Bit();
-  bool dyn_alloca = false;
+  bool DynAllocation = false;
 
   // If we have a dynamic allocation later we need to probe any residuals.
   if (NeedProbe) {
     MachineBasicBlock *NextMBB = MBBI->getParent()->getSingleSuccessor();
-    dyn_alloca = (NextMBB != NULL &&
-                  NextMBB->begin()->getFlag(MachineInstr::FrameSetup));
+    DynAllocation =
+        (NextMBB && NextMBB->begin()->getFlag(MachineInstr::FrameSetup));
   }
 
   // Simply allocate the stack if it's not big enough to require a probe.
@@ -673,7 +673,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
           .setMIFlag(MachineInstr::FrameSetup);
     }
 
-    if (dyn_alloca) {
+    if (DynAllocation) {
       // s[d|w] zero, 0(sp)
       BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
           .addReg(RISCV::X0)
@@ -724,7 +724,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
             .setMIFlag(MachineInstr::FrameSetup);
       }
 
-      if (dyn_alloca) {
+      if (DynAllocation) {
         // s[d|w] zero, 0(sp)
         BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
             .addReg(RISCV::X0)
@@ -775,7 +775,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
   if (Residual) {
     RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual),
                   MachineInstr::FrameSetup, getStackAlign());
-    if (dyn_alloca) {
+    if (DynAllocation) {
       // s[d|w] zero, 0(sp)
       BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
           .addReg(RISCV::X0)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 512e0e8f098889..0166e9b4f45d82 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -7685,7 +7685,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
                            Op.getOperand(2), Flags, DL);
   }
   case ISD::DYNAMIC_STACKALLOC:
-    return LowerDYNAMIC_STACKALLOC(Op, DAG);
+    return lowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::INIT_TRAMPOLINE:
     return lowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::ADJUST_TRAMPOLINE:
@@ -19601,7 +19601,7 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case RISCV::PseudoFROUND_D_IN32X:
     return emitFROUND(MI, BB, Subtarget);
   case RISCV::PROBED_STACKALLOC_DYN:
-    return EmitDynamicProbedAlloc(MI, BB);
+    return emitDynamicProbedAlloc(MI, BB);
   case TargetOpcode::STATEPOINT:
     // STATEPOINT is a pseudo instruction which has no implicit defs/uses
     // while jal call instruction (where statepoint will be lowered at the end)
@@ -22565,7 +22565,7 @@ unsigned RISCVTargetLowering::getStackProbeSize(const MachineFunction &MF,
   return StackProbeSize ? StackProbeSize : StackAlign.value();
 }
 
-SDValue RISCVTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+SDValue RISCVTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
                                                      SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   if (!hasInlineStackProbe(MF))
@@ -22592,19 +22592,17 @@ SDValue RISCVTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
 
   // Set the real SP to the new value with a probing loop.
   Chain = DAG.getNode(RISCVISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
-  SDValue Ops[2] = {SP, Chain};
-  return DAG.getMergeValues(Ops, dl);
+  return DAG.getMergeValues({SP, Chain}, dl);
 }
 
 MachineBasicBlock *
-RISCVTargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
+RISCVTargetLowering::emitDynamicProbedAlloc(MachineInstr &MI,
                                             MachineBasicBlock *MBB) const {
   MachineFunction &MF = *MBB->getParent();
   MachineBasicBlock::iterator MBBI = MI.getIterator();
   DebugLoc DL = MBB->findDebugLoc(MBBI);
   Register TargetReg = MI.getOperand(1).getReg();
 
-  auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
   const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
   bool IsRV64 = Subtarget.is64Bit();
   Align StackAlign = Subtarget.getFrameLowering()->getStackAlign();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 752faa5778984e..892c1cd96ca615 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -926,7 +926,7 @@ class RISCVTargetLowering : public TargetLowering {
 
   unsigned getStackProbeSize(const MachineFunction &MF, Align StackAlign) const;
 
-  MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
+  MachineBasicBlock *emitDynamicProbedAlloc(MachineInstr &MI,
                                             MachineBasicBlock *MBB) const;
 
 private:
@@ -1022,7 +1022,7 @@ class RISCVTargetLowering : public TargetLowering {
 
   SDValue lowerVectorStrictFSetcc(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
   SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;

From 8d825fa0ce4b08f820769848dfbf9917320b0172 Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly <rzinsly at ventanamicro.com>
Date: Tue, 14 Jan 2025 10:55:03 -0300
Subject: [PATCH 3/3] Remove flags and move the dynalloca control to
 RISCVMachineFunctionInfo

---
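Design note on this revision: the custom inserter that builds the probe loop
runs during instruction selection, well before frame lowering, so a bit on
RISCVMachineFunctionInfo is already available when emitPrologue needs it.
That replaces the earlier heuristic of looking for a single successor block
whose first instruction carries the FrameSetup flag, which only works while
block layout and instruction flags keep a very specific shape.
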
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  | 49 +++++++------------
 llvm/lib/Target/RISCV/RISCVFrameLowering.h    | 12 ++---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 17 +++----
 .../Target/RISCV/RISCVMachineFunctionInfo.h   |  6 +++
 .../RISCV/rvv/stack-probing-dynamic.ll        |  4 +-
 5 files changed, 34 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 9a7ee5f8a477b9..4335da93179b0e 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -502,7 +502,7 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF,
 void RISCVFrameLowering::allocateAndProbeStackForRVV(
     MachineFunction &MF, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount,
-    MachineInstr::MIFlag Flag, bool EmitCFI) const {
+    MachineInstr::MIFlag Flag, bool EmitCFI, bool DynAllocation) const {
   assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
 
   // Emit a variable-length allocation probing loop.
@@ -547,8 +547,7 @@ void RISCVFrameLowering::allocateAndProbeStackForRVV(
       .setMIFlag(Flag);
 
   // If we have a dynamic allocation later we need to probe any residuals.
-  MachineBasicBlock *NextMBB = MBBI->getParent()->getSingleSuccessor();
-  if (NextMBB && NextMBB->begin()->getFlag(MachineInstr::FrameSetup)) {
+  if (DynAllocation) {
     BuildMI(MBB, MBBI, DL, TII->get(STI.is64Bit() ? RISCV::SD : RISCV::SW))
         .addReg(RISCV::X0)
         .addReg(SPReg)
@@ -644,20 +643,12 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MBBI,
                                        MachineFunction &MF, uint64_t Offset,
                                        uint64_t RealStackSize, bool EmitCFI,
-                                       bool NeedProbe,
-                                       uint64_t ProbeSize) const {
+                                       bool NeedProbe, uint64_t ProbeSize,
+                                       bool DynAllocation) const {
   DebugLoc DL;
   const RISCVRegisterInfo *RI = STI.getRegisterInfo();
   const RISCVInstrInfo *TII = STI.getInstrInfo();
   bool IsRV64 = STI.is64Bit();
-  bool DynAllocation = false;
-
-  // If we have a dynamic allocation later we need to probe any residuals.
-  if (NeedProbe) {
-    MachineBasicBlock *NextMBB = MBBI->getParent()->getSingleSuccessor();
-    DynAllocation =
-        (NextMBB && NextMBB->begin()->getFlag(MachineInstr::FrameSetup));
-  }
 
   // Simply allocate the stack if it's not big enough to require a probe.
   if (!NeedProbe || Offset <= ProbeSize) {
@@ -673,7 +664,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
           .setMIFlag(MachineInstr::FrameSetup);
     }
 
-    if (DynAllocation) {
+    if (NeedProbe && DynAllocation) {
       // s[d|w] zero, 0(sp)
       BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
           .addReg(RISCV::X0)
@@ -914,9 +905,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   const RISCVTargetLowering *TLI = Subtarget.getTargetLowering();
   bool NeedProbe = TLI->hasInlineStackProbe(MF);
   uint64_t ProbeSize = TLI->getStackProbeSize(MF, getStackAlign());
+  bool DynAllocation =
+      MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
   if (StackSize != 0)
     allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true,
-                  NeedProbe, ProbeSize);
+                  NeedProbe, ProbeSize, DynAllocation);
 
   // The frame pointer is callee-saved, and code has been generated for us to
   // save it to the stack. We need to skip over the storing of callee-saved
@@ -959,13 +952,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
 
     allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount,
                   getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe,
-                  ProbeSize);
+                  ProbeSize, DynAllocation);
   }
 
   if (RVVStackSize) {
     if (NeedProbe) {
       allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
-                                  MachineInstr::FrameSetup, !hasFP(MF));
+                                  MachineInstr::FrameSetup, !hasFP(MF),
+                                  DynAllocation);
     } else {
       // We must keep the stack pointer aligned through any intermediate
       // updates.
@@ -2129,10 +2123,9 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
 }
 
 // Synthesize the probe loop.
-MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
-    MachineFunction &MF, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
-    bool IsRVV) const {
+static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                                 Register TargetReg, bool IsRVV) {
   assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
 
   auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
@@ -2194,14 +2187,13 @@ MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
   }
 
   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
 
   LoopTestMBB->addSuccessor(ExitMBB);
   LoopTestMBB->addSuccessor(LoopTestMBB);
   MBB.addSuccessor(LoopTestMBB);
   // Update liveins.
   fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
-
-  return ExitMBB;
 }
 
 void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
@@ -2224,15 +2216,8 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
       MachineBasicBlock::iterator MBBI = MI->getIterator();
       DebugLoc DL = MBB.findDebugLoc(MBBI);
       Register TargetReg = MI->getOperand(1).getReg();
-      MachineBasicBlock *Succ = MBBI->getParent()->getSingleSuccessor();
-      MachineBasicBlock *Next = emitStackProbeInline(
-          MF, MBB, MBBI, DL, TargetReg,
-          (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
-      // Update the successor info if we have a BB from a dynamic allocation.
-      if (Succ != NULL && Succ->begin()->getFlag(MachineInstr::FrameSetup)) {
-        MBBI->getParent()->removeSuccessor(Succ);
-        Next->addSuccessor(Succ);
-      }
+      emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg,
+                           (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
       MBBI->eraseFromParent();
     }
   }
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 1a2c6e0302623d..d013755ce58a0e 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -81,13 +81,7 @@ class RISCVFrameLowering : public TargetFrameLowering {
   void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      MachineFunction &MF, uint64_t Offset,
                      uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
-                     uint64_t ProbeSize) const;
-
-  MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
-                                          MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator MBBI,
-                                          DebugLoc DL, Register TargetReg,
-                                          bool IsRVV) const;
+                     uint64_t ProbeSize, bool DynAllocation) const;
 
 protected:
   const RISCVSubtarget &STI;
@@ -116,8 +110,8 @@ class RISCVFrameLowering : public TargetFrameLowering {
   void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    const DebugLoc &DL, int64_t Amount,
-                                   MachineInstr::MIFlag Flag,
-                                   bool EmitCFI) const;
+                                   MachineInstr::MIFlag Flag, bool EmitCFI,
+                                   bool DynAllocation) const;
 };
 } // namespace llvm
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0166e9b4f45d82..0f672fef8c80c7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22615,48 +22615,45 @@ RISCVTargetLowering::emitDynamicProbedAlloc(MachineInstr &MI,
   MF.insert(MBBInsertPoint, LoopTestMBB);
   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
   MF.insert(MBBInsertPoint, ExitMBB);
-  MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
   Register SPReg = RISCV::X2;
   Register ScratchReg =
       MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
 
   // ScratchReg = ProbeSize
-  TII->movImm(*MBB, MBBI, DL, ScratchReg, ProbeSize, Flags);
+  TII->movImm(*MBB, MBBI, DL, ScratchReg, ProbeSize, MachineInstr::NoFlags);
 
   // LoopTest:
   //   SUB SP, SP, ProbeSize
   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB), SPReg)
       .addReg(SPReg)
-      .addReg(ScratchReg)
-      .setMIFlags(Flags);
+      .addReg(ScratchReg);
 
   //   s[d|w] zero, 0(sp)
   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL,
           TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
       .addReg(RISCV::X0)
       .addReg(SPReg)
-      .addImm(0)
-      .setMIFlags(Flags);
+      .addImm(0);
 
   //  BLT TargetReg, SP, LoopTest
   BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BLT))
       .addReg(TargetReg)
       .addReg(SPReg)
-      .addMBB(LoopTestMBB)
-      .setMIFlags(Flags);
+      .addMBB(LoopTestMBB);
 
   // Adjust with: MV SP, TargetReg.
   BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(RISCV::ADDI), SPReg)
       .addReg(TargetReg)
-      .addImm(0)
-      .setMIFlags(Flags);
+      .addImm(0);
 
   ExitMBB->splice(ExitMBB->end(), MBB, std::next(MBBI), MBB->end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(MBB);
 
   LoopTestMBB->addSuccessor(ExitMBB);
   LoopTestMBB->addSuccessor(LoopTestMBB);
   MBB->addSuccessor(LoopTestMBB);
 
   MI.eraseFromParent();
+  MF.getInfo<RISCVMachineFunctionInfo>()->setDynamicAllocation();
   return ExitMBB->begin()->getParent();
 }
diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
index 8909f2f3bd3170..9e03db675b6ed6 100644
--- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -78,6 +78,9 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo {
 
   int64_t StackProbeSize = 0;
 
+  /// Does it probe the stack for a dynamic allocation?
+  bool HasDynProbe = false;
+
 public:
   RISCVMachineFunctionInfo(const Function &F, const RISCVSubtarget *STI);
 
@@ -159,6 +162,9 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo {
 
   bool isVectorCall() const { return IsVectorCall; }
   void setIsVectorCall() { IsVectorCall = true; }
+
+  bool hasDynamicAllocation() const { return HasDynProbe; }
+  void setDynamicAllocation() { HasDynProbe = true; }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
index 6dda0c38a070ac..c3c1643e6de011 100644
--- a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -360,9 +360,7 @@ define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
 
 ; If a function has variable-sized stack objects, then any function calls which
 ; need to pass arguments on the stack must allocate the stack space for them
-; dynamically, to ensure they are at the bottom of the frame. We need to probe
-; that space when it is larger than the unprobed space allowed by the ABI, so
-; this needs a very large number of arguments.
+; dynamically, to ensure they are at the bottom of the frame.
 define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 {
 ; RV64I-LABEL: no_reserved_call_frame:
 ; RV64I:       # %bb.0: # %entry


