[llvm] 01d7f43 - [RISCV] Stack clash protection for dynamic alloca (#122508)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 16 11:58:45 PST 2025
Author: Raphael Moreira Zinsly
Date: 2025-01-16T11:58:42-08:00
New Revision: 01d7f434d21a70158094a9c7da971ce9e0d0915c
URL: https://github.com/llvm/llvm-project/commit/01d7f434d21a70158094a9c7da971ce9e0d0915c
DIFF: https://github.com/llvm/llvm-project/commit/01d7f434d21a70158094a9c7da971ce9e0d0915c.diff
LOG: [RISCV] Stack clash protection for dynamic alloca (#122508)
Create a probe loop for dynamic allocation and add the corresponding
SelectionDAG support in order to use it.
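For context, the new lowering is opt-in: it only fires for functions that carry the inline stack-probe attribute (hasInlineStackProbe). A minimal IR sketch, condensed from the stack-probing-dynamic.ll test added below — the function name and the exact registers are illustrative, taken from the riscv64 check lines:

define void @dyn(i64 %size, ptr %out) "probe-stack"="inline-asm" {
  ; The variable-sized alloca is custom-lowered to RISCVISD::PROBED_ALLOCA and
  ; then expanded by the custom inserter into a probing loop. On riscv64 the
  ; loop looks roughly like:
  ;   lui  a2, 1              # probe size (4096 bytes)
  ; .LBB0_1:
  ;   sub  sp, sp, a2         # move SP down by one guard-sized block
  ;   sd   zero, 0(sp)        # touch the newly exposed page
  ;   blt  a0, sp, .LBB0_1    # loop until SP reaches the target in a0
  ;   mv   sp, a0             # snap SP to the requested address
  %v = alloca i8, i64 %size, align 1
  store ptr %v, ptr %out, align 8
  ret void
}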
Added:
llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
Modified:
llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
llvm/lib/Target/RISCV/RISCVFrameLowering.h
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
llvm/lib/Target/RISCV/RISCVInstrInfo.td
llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 911cea27a48ac2..333c8060f37f4f 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -502,7 +502,7 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF,
void RISCVFrameLowering::allocateAndProbeStackForRVV(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount,
- MachineInstr::MIFlag Flag, bool EmitCFI) const {
+ MachineInstr::MIFlag Flag, bool EmitCFI, bool DynAllocation) const {
assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
// Emit a variable-length allocation probing loop.
@@ -545,6 +545,15 @@ void RISCVFrameLowering::allocateAndProbeStackForRVV(
.addReg(SPReg)
.addReg(TargetReg)
.setMIFlag(Flag);
+
+ // If we have a dynamic allocation later we need to probe any residuals.
+ if (DynAllocation) {
+ BuildMI(MBB, MBBI, DL, TII->get(STI.is64Bit() ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
}
static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
@@ -634,11 +643,12 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineFunction &MF, uint64_t Offset,
uint64_t RealStackSize, bool EmitCFI,
- bool NeedProbe,
- uint64_t ProbeSize) const {
+ bool NeedProbe, uint64_t ProbeSize,
+ bool DynAllocation) const {
DebugLoc DL;
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();
+ bool IsRV64 = STI.is64Bit();
// Simply allocate the stack if it's not big enough to require a probe.
if (!NeedProbe || Offset <= ProbeSize) {
@@ -654,13 +664,21 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
.setMIFlag(MachineInstr::FrameSetup);
}
+ if (NeedProbe && DynAllocation) {
+ // s[d|w] zero, 0(sp)
+ BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
return;
}
// Unroll the probe loop depending on the number of iterations.
if (Offset < ProbeSize * 5) {
uint64_t CurrentOffset = 0;
- bool IsRV64 = STI.is64Bit();
while (CurrentOffset + ProbeSize <= Offset) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup,
@@ -696,6 +714,15 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}
+
+ if (DynAllocation) {
+ // s[d|w] zero, 0(sp)
+ BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
}
return;
@@ -736,9 +763,18 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
.setMIFlags(MachineInstr::FrameSetup);
}
- if (Residual)
+ if (Residual) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual),
MachineInstr::FrameSetup, getStackAlign());
+ if (DynAllocation) {
+ // s[d|w] zero, 0(sp)
+ BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ }
if (EmitCFI) {
// Emit ".cfi_def_cfa_offset Offset"
@@ -869,9 +905,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
const RISCVTargetLowering *TLI = Subtarget.getTargetLowering();
bool NeedProbe = TLI->hasInlineStackProbe(MF);
uint64_t ProbeSize = TLI->getStackProbeSize(MF, getStackAlign());
+ bool DynAllocation =
+ MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
if (StackSize != 0)
allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true,
- NeedProbe, ProbeSize);
+ NeedProbe, ProbeSize, DynAllocation);
// The frame pointer is callee-saved, and code has been generated for us to
// save it to the stack. We need to skip over the storing of callee-saved
@@ -914,13 +952,14 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount,
getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe,
- ProbeSize);
+ ProbeSize, DynAllocation);
}
if (RVVStackSize) {
if (NeedProbe) {
allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
- MachineInstr::FrameSetup, !hasFP(MF));
+ MachineInstr::FrameSetup, !hasFP(MF),
+ DynAllocation);
} else {
// We must keep the stack pointer aligned through any intermediate
// updates.
@@ -2148,6 +2187,7 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
}
ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
LoopTestMBB->addSuccessor(ExitMBB);
LoopTestMBB->addSuccessor(LoopTestMBB);
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 26d2a26d681c35..d013755ce58a0e 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -81,7 +81,7 @@ class RISCVFrameLowering : public TargetFrameLowering {
void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineFunction &MF, uint64_t Offset,
uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
- uint64_t ProbeSize) const;
+ uint64_t ProbeSize, bool DynAllocation) const;
protected:
const RISCVSubtarget &STI;
@@ -110,8 +110,8 @@ class RISCVFrameLowering : public TargetFrameLowering {
void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, int64_t Amount,
- MachineInstr::MIFlag Flag,
- bool EmitCFI) const;
+ MachineInstr::MIFlag Flag, bool EmitCFI,
+ bool DynAllocation) const;
};
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f8a5ccc3023a4d..35934ec8bdea56 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -280,7 +280,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
MVT::i1, Promote);
// TODO: add all necessary setOperationAction calls.
- setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, XLenVT, Expand);
@@ -7727,6 +7727,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return emitFlushICache(DAG, Op.getOperand(0), Op.getOperand(1),
Op.getOperand(2), Flags, DL);
}
+ case ISD::DYNAMIC_STACKALLOC:
+ return lowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::INIT_TRAMPOLINE:
return lowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE:
@@ -19705,6 +19707,8 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case RISCV::PseudoFROUND_D_INX:
case RISCV::PseudoFROUND_D_IN32X:
return emitFROUND(MI, BB, Subtarget);
+ case RISCV::PROBED_STACKALLOC_DYN:
+ return emitDynamicProbedAlloc(MI, BB);
case TargetOpcode::STATEPOINT:
// STATEPOINT is a pseudo instruction which has no implicit defs/uses
// while jal call instruction (where statepoint will be lowered at the end)
@@ -20937,6 +20941,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SF_VC_V_IVW_SE)
NODE_NAME_CASE(SF_VC_V_VVW_SE)
NODE_NAME_CASE(SF_VC_V_FVW_SE)
+ NODE_NAME_CASE(PROBED_ALLOCA)
}
// clang-format on
return nullptr;
@@ -22666,3 +22671,95 @@ unsigned RISCVTargetLowering::getStackProbeSize(const MachineFunction &MF,
StackProbeSize = alignDown(StackProbeSize, StackAlign.value());
return StackProbeSize ? StackProbeSize : StackAlign.value();
}
+
+SDValue RISCVTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (!hasInlineStackProbe(MF))
+ return SDValue();
+
+ MVT XLenVT = Subtarget.getXLenVT();
+ // Get the inputs.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+
+ // Construct the new SP value in a GPR.
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, RISCV::X2, XLenVT);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, XLenVT, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT));
+
+ // Set the real SP to the new value with a probing loop.
+ Chain = DAG.getNode(RISCVISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+ return DAG.getMergeValues({SP, Chain}, dl);
+}
+
+MachineBasicBlock *
+RISCVTargetLowering::emitDynamicProbedAlloc(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction &MF = *MBB->getParent();
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ DebugLoc DL = MBB->findDebugLoc(MBBI);
+ Register TargetReg = MI.getOperand(1).getReg();
+
+ const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
+ bool IsRV64 = Subtarget.is64Bit();
+ Align StackAlign = Subtarget.getFrameLowering()->getStackAlign();
+ const RISCVTargetLowering *TLI = Subtarget.getTargetLowering();
+ uint64_t ProbeSize = TLI->getStackProbeSize(MF, StackAlign);
+
+ MachineFunction::iterator MBBInsertPoint = std::next(MBB->getIterator());
+ MachineBasicBlock *LoopTestMBB =
+ MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF.insert(MBBInsertPoint, LoopTestMBB);
+ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF.insert(MBBInsertPoint, ExitMBB);
+ Register SPReg = RISCV::X2;
+ Register ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
+
+ // ScratchReg = ProbeSize
+ TII->movImm(*MBB, MBBI, DL, ScratchReg, ProbeSize, MachineInstr::NoFlags);
+
+ // LoopTest:
+ // SUB SP, SP, ProbeSize
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB), SPReg)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+
+ // s[d|w] zero, 0(sp)
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL,
+ TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0);
+
+ // BLT TargetReg, SP, LoopTest
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BLT))
+ .addReg(TargetReg)
+ .addReg(SPReg)
+ .addMBB(LoopTestMBB);
+
+ // Adjust with: MV SP, TargetReg.
+ BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(RISCV::ADDI), SPReg)
+ .addReg(TargetReg)
+ .addImm(0);
+
+ ExitMBB->splice(ExitMBB->end(), MBB, std::next(MBBI), MBB->end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ LoopTestMBB->addSuccessor(ExitMBB);
+ LoopTestMBB->addSuccessor(LoopTestMBB);
+ MBB->addSuccessor(LoopTestMBB);
+
+ MI.eraseFromParent();
+ MF.getInfo<RISCVMachineFunctionInfo>()->setDynamicAllocation();
+ return ExitMBB->begin()->getParent();
+}
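In block terms, emitDynamicProbedAlloc splits the original block and stitches in a self-looping probe block. A rough sketch of the resulting shape (block names follow the variables in the code above; the scratch register is whichever virtual register movImm materializes the probe size into):

  ; MBB (original block, now ends after materializing ProbeSize and falls through)
  ;   li     <scratch>, <ProbeSize>       ; TII->movImm
  ; LoopTestMBB (successors: LoopTestMBB, ExitMBB)
  ;   sub    sp, sp, <scratch>
  ;   s[d|w] zero, 0(sp)
  ;   blt    <target>, sp, LoopTestMBB
  ; ExitMBB (rest of the original block is spliced in after the move)
  ;   mv     sp, <target>                 ; ADDI <target>, 0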
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ea077c7d2d23a5..892c1cd96ca615 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -461,6 +461,10 @@ enum NodeType : unsigned {
SF_VC_V_VVW_SE,
SF_VC_V_FVW_SE,
+ // To avoid stack clash, allocation is performed by block and each block is
+ // probed.
+ PROBED_ALLOCA,
+
// RISC-V vector tuple type version of INSERT_SUBVECTOR/EXTRACT_SUBVECTOR.
TUPLE_INSERT,
TUPLE_EXTRACT,
@@ -922,6 +926,9 @@ class RISCVTargetLowering : public TargetLowering {
unsigned getStackProbeSize(const MachineFunction &MF, Align StackAlign) const;
+ MachineBasicBlock *emitDynamicProbedAlloc(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
@@ -1015,6 +1022,8 @@ class RISCVTargetLowering : public TargetLowering {
SDValue lowerVectorStrictFSetcc(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index ee86f53a5c8a8d..bb5bb6352c32a5 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -100,6 +100,11 @@ def riscv_add_tprel : SDNode<"RISCVISD::ADD_TPREL",
SDTCisSameAs<0, 3>,
SDTCisInt<0>]>>;
+def riscv_probed_alloca : SDNode<"RISCVISD::PROBED_ALLOCA",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPMayStore]>;
+
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
@@ -1428,6 +1433,11 @@ def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
(ins GPR:$scratch),
[]>,
Sched<[]>;
+let usesCustomInserter = 1 in
+def PROBED_STACKALLOC_DYN : Pseudo<(outs GPR:$rd),
+ (ins GPR:$scratch),
+ [(set GPR:$rd, (riscv_probed_alloca GPR:$scratch))]>,
+ Sched<[]>;
}
/// HI and ADD_LO address nodes.
diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
index 8909f2f3bd3170..27a13bb7cace14 100644
--- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -78,6 +78,9 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo {
int64_t StackProbeSize = 0;
+ /// Does it probe the stack for a dynamic allocation?
+ bool HasDynamicAllocation = false;
+
public:
RISCVMachineFunctionInfo(const Function &F, const RISCVSubtarget *STI);
@@ -159,6 +162,9 @@ class RISCVMachineFunctionInfo : public MachineFunctionInfo {
bool isVectorCall() const { return IsVectorCall; }
void setIsVectorCall() { IsVectorCall = true; }
+
+ bool hasDynamicAllocation() const { return HasDynamicAllocation; }
+ void setDynamicAllocation() { HasDynamicAllocation = true; }
};
} // end namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
new file mode 100644
index 00000000000000..c3c1643e6de011
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -0,0 +1,550 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
+; RUN: | FileCheck %s -check-prefix=RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+
+; Tests copied from AArch64.
+
+; Dynamically-sized allocation, needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack, and also doesn't probe below the target SP value.
+define void @dynamic(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: addi s0, sp, 16
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB0_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a1)
+; RV64I-NEXT: addi sp, s0, -16
+; RV64I-NEXT: .cfi_def_cfa sp, 16
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: addi s0, sp, 16
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB0_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi sp, s0, -16
+; RV32I-NEXT: .cfi_def_cfa sp, 16
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v = alloca i8, i64 %size, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed size
+; slot isn't large enough that we would normally probe it, but we need to do so
+; here otherwise the gap between the CSR save and the first probe of the
+; dynamic allocation could be too far apart when the size of the dynamic
+; allocation is close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+; RV64I-LABEL: dynamic_fixed:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -80
+; RV64I-NEXT: .cfi_def_cfa_offset 80
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: addi s0, sp, 80
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: addi a3, s0, -80
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: sd a3, 0(a1)
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: lui a1, 1
+; RV64I-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a1
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB1_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a2)
+; RV64I-NEXT: addi sp, s0, -80
+; RV64I-NEXT: .cfi_def_cfa sp, 80
+; RV64I-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: addi sp, sp, 80
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic_fixed:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: .cfi_def_cfa_offset 80
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: addi s0, sp, 80
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: addi a1, s0, -72
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: sw a1, 0(a2)
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB1_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a3)
+; RV32I-NEXT: addi sp, s0, -80
+; RV32I-NEXT: .cfi_def_cfa sp, 80
+; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: addi sp, sp, 80
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v1 = alloca i8, i64 64, align 1
+ store ptr %v1, ptr %out1, align 8
+ %v2 = alloca i8, i64 %size, align 1
+ store ptr %v2, ptr %out2, align 8
+ ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by ANDing the target SP with a constant to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the prolog,
+; which isn't actually needed because the only aligned allocations are dynamic,
+; this is done even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_align_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: .cfi_def_cfa_offset 64
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: .cfi_offset s1, -24
+; RV64I-NEXT: addi s0, sp, 64
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: andi sp, sp, -64
+; RV64I-NEXT: mv s1, sp
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: andi a0, a0, -64
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB2_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a1)
+; RV64I-NEXT: addi sp, s0, -64
+; RV64I-NEXT: .cfi_def_cfa sp, 64
+; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: .cfi_restore s1
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic_align_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: .cfi_def_cfa_offset 64
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: .cfi_offset s1, -12
+; RV32I-NEXT: addi s0, sp, 64
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: andi sp, sp, -64
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: andi a0, a0, -64
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB2_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi sp, s0, -64
+; RV32I-NEXT: .cfi_def_cfa sp, 64
+; RV32I-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: .cfi_restore s1
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v = alloca i8, i64 %size, align 64
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference to the dynamic allocation is the constant used for aligning
+; the target SP, the loop will probe the whole allocation without needing to
+; know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_align_8192:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -2032
+; RV64I-NEXT: .cfi_def_cfa_offset 2032
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 2008(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: .cfi_offset s1, -24
+; RV64I-NEXT: addi s0, sp, 2032
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: addi sp, sp, -2048
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: srli a2, sp, 13
+; RV64I-NEXT: slli sp, a2, 13
+; RV64I-NEXT: mv s1, sp
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: lui a2, 1048574
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB3_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a1)
+; RV64I-NEXT: addi sp, s0, -2032
+; RV64I-NEXT: .cfi_def_cfa sp, 2032
+; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 2008(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: .cfi_restore s1
+; RV64I-NEXT: addi sp, sp, 2032
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic_align_8192:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -2032
+; RV32I-NEXT: .cfi_def_cfa_offset 2032
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 2024(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 2020(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: .cfi_offset s1, -12
+; RV32I-NEXT: addi s0, sp, 2032
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: addi sp, sp, -2048
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: srli a1, sp, 13
+; RV32I-NEXT: slli sp, a1, 13
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: lui a1, 1048574
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB3_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi sp, s0, -2032
+; RV32I-NEXT: .cfi_def_cfa sp, 2032
+; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 2024(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 2020(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: .cfi_restore s1
+; RV32I-NEXT: addi sp, sp, 2032
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v = alloca i8, i64 %size, align 8192
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame.
+define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 {
+; RV64I-LABEL: no_reserved_call_frame:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: addi s0, sp, 16
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB4_1: # %entry
+; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB4_1
+; RV64I-NEXT: # %bb.2: # %entry
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: call callee_stack_args
+; RV64I-NEXT: addi sp, s0, -16
+; RV64I-NEXT: .cfi_def_cfa sp, 16
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: no_reserved_call_frame:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: addi s0, sp, 16
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: mv a1, a2
+; RV32I-NEXT: slli a0, a0, 2
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: lui a2, 1
+; RV32I-NEXT: .LBB4_1: # %entry
+; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a2
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB4_1
+; RV32I-NEXT: # %bb.2: # %entry
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: call callee_stack_args
+; RV32I-NEXT: addi sp, s0, -16
+; RV32I-NEXT: .cfi_def_cfa sp, 16
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+entry:
+ %v = alloca i32, i64 %n
+ call void @callee_stack_args(ptr %v, i32 %dummy)
+ ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n, i32 %dummy) #0 {
+; RV64I-LABEL: reserved_call_frame:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addi sp, sp, -416
+; RV64I-NEXT: .cfi_def_cfa_offset 416
+; RV64I-NEXT: sd ra, 408(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: addi a0, sp, 8
+; RV64I-NEXT: call callee_stack_args
+; RV64I-NEXT: ld ra, 408(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: addi sp, sp, 416
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: reserved_call_frame:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -416
+; RV32I-NEXT: .cfi_def_cfa_offset 416
+; RV32I-NEXT: sw ra, 412(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: mv a1, a2
+; RV32I-NEXT: addi a0, sp, 12
+; RV32I-NEXT: call callee_stack_args
+; RV32I-NEXT: lw ra, 412(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: addi sp, sp, 416
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+entry:
+ %v = alloca i32, i64 100
+ call void @callee_stack_args(ptr %v, i32 %dummy)
+ ret void
+}
+
+declare void @callee_stack_args(ptr, i32)
+
+; Dynamic allocation of vectors
+define void @dynamic_vector(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_vector:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: addi s0, sp, 16
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: csrr a2, vlenb
+; RV64I-NEXT: mul a0, a2, a0
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB6_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a1)
+; RV64I-NEXT: addi sp, s0, -16
+; RV64I-NEXT: .cfi_def_cfa sp, 16
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic_vector:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: addi s0, sp, 16
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: csrr a1, vlenb
+; RV32I-NEXT: mul a0, a1, a0
+; RV32I-NEXT: slli a0, a0, 1
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB6_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi sp, s0, -16
+; RV32I-NEXT: .cfi_def_cfa sp, 16
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v = alloca <vscale x 4 x float>, i64 %size, align 16
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
diff --git a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
index 843e57a42d926d..b1c0755c36ec1f 100644
--- a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
+++ b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
@@ -606,4 +606,129 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
ret i32 %c
}
+define void @f11(i32 %vla_size, i64 %i) #0 {
+; RV64I-LABEL: f11:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -2032
+; RV64I-NEXT: .cfi_def_cfa_offset 2032
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 2008(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: .cfi_offset s1, -24
+; RV64I-NEXT: addi s0, sp, 2032
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: lui a2, 15
+; RV64I-NEXT: sub t1, sp, a2
+; RV64I-NEXT: lui t2, 1
+; RV64I-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, t2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: bne sp, t1, .LBB11_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: addi sp, sp, -2048
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: srli a2, sp, 15
+; RV64I-NEXT: slli sp, a2, 15
+; RV64I-NEXT: mv s1, sp
+; RV64I-NEXT: slli a1, a1, 2
+; RV64I-NEXT: lui a2, 8
+; RV64I-NEXT: add a2, s1, a2
+; RV64I-NEXT: add a1, a2, a1
+; RV64I-NEXT: li a2, 1
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
+; RV64I-NEXT: sw a2, 0(a1)
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: andi a0, a0, -2048
+; RV64I-NEXT: lui a1, 1
+; RV64I-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a1
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB11_3
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: lbu zero, 0(a0)
+; RV64I-NEXT: addi sp, s0, -2032
+; RV64I-NEXT: .cfi_def_cfa sp, 2032
+; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 2008(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: .cfi_restore s1
+; RV64I-NEXT: addi sp, sp, 2032
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: f11:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -2032
+; RV32I-NEXT: .cfi_def_cfa_offset 2032
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 2024(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 2020(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: .cfi_offset s1, -12
+; RV32I-NEXT: addi s0, sp, 2032
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: lui a2, 15
+; RV32I-NEXT: sub t1, sp, a2
+; RV32I-NEXT: lui t2, 1
+; RV32I-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, t2
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: bne sp, t1, .LBB11_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: addi sp, sp, -2048
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: srli a2, sp, 15
+; RV32I-NEXT: slli sp, a2, 15
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: lui a2, 8
+; RV32I-NEXT: add a2, s1, a2
+; RV32I-NEXT: add a1, a2, a1
+; RV32I-NEXT: li a2, 1
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sw a2, 0(a1)
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: andi a0, a0, -2048
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB11_3
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: lbu zero, 0(a0)
+; RV32I-NEXT: addi sp, s0, -2032
+; RV32I-NEXT: .cfi_def_cfa sp, 2032
+; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 2024(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 2020(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: .cfi_restore s1
+; RV32I-NEXT: addi sp, sp, 2032
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %a = alloca i32, i32 4096, align 32768
+ %b = getelementptr inbounds i32, ptr %a, i64 %i
+ store volatile i32 1, ptr %b
+ %1 = zext i32 %vla_size to i64
+ %vla = alloca i8, i64 %1, align 2048
+ %2 = load volatile i8, ptr %vla, align 2048
+ ret void
+}
+
attributes #0 = { "probe-stack"="inline-asm" }