[llvm] [RISCV] Stack clash protection for dynamic alloca (PR #122508)
Raphael Moreira Zinsly via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 10 11:16:10 PST 2025
https://github.com/rzinsly updated https://github.com/llvm/llvm-project/pull/122508
>From 600dbfbcf28ba8c7f174503b96f5728ba110eb9d Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly <rzinsly at ventanamicro.com>
Date: Wed, 13 Nov 2024 15:55:52 -0300
Subject: [PATCH] [RISCV] Stack clash protection for dynamic alloca
Create a probe loop for dynamic allocation and add the corresponding
SelectionDAG support in order to use it.
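As a rough source-level illustration (not part of this patch): a variably-sized alloca is what produces the ISD::DYNAMIC_STACKALLOC node that is now custom-lowered when stack probing is requested. The sketch below is plain C++ under the assumption that it is built with Clang's -fstack-clash-protection (which attaches the "probe-stack"="inline-asm" attribute used by the tests); the function name and the use of the __builtin_alloca builtin are illustrative only.

  // Illustrative only -- not part of the patch. The allocation size is
  // unknown at compile time, so the stack adjustment must be probed page by
  // page instead of being folded into the prologue.
  #include <cstddef>
  #include <cstring>

  void use_dynamic_buffer(std::size_t n) {
    // __builtin_alloca performs a dynamic stack allocation; in LLVM IR this
    // becomes an alloca with a runtime element count and then lowers to
    // ISD::DYNAMIC_STACKALLOC.
    char *buf = static_cast<char *>(__builtin_alloca(n));
    std::memset(buf, 0, n); // use the buffer
  }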
---
llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 69 ++-
llvm/lib/Target/RISCV/RISCVFrameLowering.h | 6 +
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 105 +++-
llvm/lib/Target/RISCV/RISCVISelLowering.h | 9 +
llvm/lib/Target/RISCV/RISCVInstrInfo.td | 10 +
.../RISCV/rvv/stack-probing-dynamic.ll | 552 ++++++++++++++++++
.../CodeGen/RISCV/stack-clash-prologue.ll | 125 ++++
7 files changed, 868 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
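Before the diff itself, a standalone sketch (illustrative, not part of the patch) of what the expanded probe loop does: SP is dropped by the probe size and a zero is stored at the new SP on every iteration, so no two consecutive stack touches are more than one guard-page-sized step apart, and the final "mv sp, target" then commits the requested, aligned SP. The helper name, the 4 KiB probe size, and the sample values are assumptions for illustration; the real loop uses a signed BLT on XLEN-bit values.

  // Standalone simulation of the probe loop's address arithmetic, mirroring
  // the expanded PROBED_STACKALLOC_DYN pseudo:
  //   loop: sub sp, sp, ProbeSize ; s[d|w] zero, 0(sp) ; blt target, sp, loop
  //         mv sp, target
  #include <cassert>
  #include <cstdint>
  #include <vector>

  static std::vector<uint64_t> simulateProbeLoop(uint64_t SP, uint64_t Size,
                                                 uint64_t Align,
                                                 uint64_t ProbeSize) {
    uint64_t Target = (SP - Size) & ~(Align - 1); // aligned-down new SP
    std::vector<uint64_t> Probes;
    do {
      SP -= ProbeSize;      // sub sp, sp, ProbeSize
      Probes.push_back(SP); // s[d|w] zero, 0(sp)
    } while (Target < SP);  // blt target, sp, loop
    return Probes;          // afterwards: mv sp, target
  }

  int main() {
    const uint64_t OldSP = 0x40000000, ProbeSize = 4096;
    std::vector<uint64_t> Probes =
        simulateProbeLoop(OldSP, /*Size=*/10000, /*Align=*/16, ProbeSize);
    uint64_t Prev = OldSP;
    for (uint64_t P : Probes) {
      // Consecutive stack touches are never more than one guard page apart.
      assert(Prev - P <= ProbeSize && "probe gap exceeds the guard size");
      Prev = P;
    }
    return 0;
  }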
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index ed3ec310280670..8cef3f845065f0 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -545,6 +545,16 @@ void RISCVFrameLowering::allocateAndProbeStackForRVV(
.addReg(SPReg)
.addReg(TargetReg)
.setMIFlag(Flag);
+
+  // If we have a dynamic allocation later, we need to probe any residuals.
+ MachineBasicBlock *NextMBB = MBBI->getParent()->getSingleSuccessor();
+  if (NextMBB && NextMBB->begin()->getFlag(MachineInstr::FrameSetup)) {
+ BuildMI(MBB, MBBI, DL, TII->get(STI.is64Bit() ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
}
static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
@@ -639,6 +649,15 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
DebugLoc DL;
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
const RISCVInstrInfo *TII = STI.getInstrInfo();
+ bool IsRV64 = STI.is64Bit();
+  bool DynAlloca = false;
+
+  // If we have a dynamic allocation later, we need to probe any residuals.
+ if (NeedProbe) {
+ MachineBasicBlock *NextMBB = MBBI->getParent()->getSingleSuccessor();
+    DynAlloca =
+        NextMBB && NextMBB->begin()->getFlag(MachineInstr::FrameSetup);
+ }
// Simply allocate the stack if it's not big enough to require a probe.
if (!NeedProbe || Offset <= ProbeSize) {
@@ -654,13 +673,21 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
.setMIFlag(MachineInstr::FrameSetup);
}
+  if (DynAlloca) {
+ // s[d|w] zero, 0(sp)
+ BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
return;
}
// Unroll the probe loop depending on the number of iterations.
if (Offset < ProbeSize * 5) {
uint64_t CurrentOffset = 0;
- bool IsRV64 = STI.is64Bit();
while (CurrentOffset + ProbeSize <= Offset) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup,
@@ -696,6 +723,15 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
.addCFIIndex(CFIIndex)
.setMIFlag(MachineInstr::FrameSetup);
}
+
+  if (DynAlloca) {
+ // s[d|w] zero, 0(sp)
+ BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
}
return;
@@ -736,9 +772,18 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
.setMIFlags(MachineInstr::FrameSetup);
}
- if (Residual)
+ if (Residual) {
RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual),
MachineInstr::FrameSetup, getStackAlign());
+    if (DynAlloca) {
+ // s[d|w] zero, 0(sp)
+ BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ }
if (EmitCFI) {
// Emit ".cfi_def_cfa_offset Offset"
@@ -2084,9 +2129,10 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
}
// Synthesize the probe loop.
-static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc DL,
- Register TargetReg, bool IsRVV) {
+MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
+ bool IsRVV) const {
assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
@@ -2154,6 +2200,8 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.addSuccessor(LoopTestMBB);
// Update liveins.
fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
+
+ return ExitMBB;
}
void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
@@ -2176,8 +2224,15 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
MachineBasicBlock::iterator MBBI = MI->getIterator();
DebugLoc DL = MBB.findDebugLoc(MBBI);
Register TargetReg = MI->getOperand(1).getReg();
- emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg,
- (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+ MachineBasicBlock *Succ = MBBI->getParent()->getSingleSuccessor();
+ MachineBasicBlock *Next = emitStackProbeInline(
+ MF, MBB, MBBI, DL, TargetReg,
+ (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+  // Reattach the successor if the next block comes from a dynamic allocation.
+  if (Succ && Succ->begin()->getFlag(MachineInstr::FrameSetup)) {
+ MBBI->getParent()->removeSuccessor(Succ);
+ Next->addSuccessor(Succ);
+ }
MBBI->eraseFromParent();
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 26d2a26d681c35..1a2c6e0302623d 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -83,6 +83,12 @@ class RISCVFrameLowering : public TargetFrameLowering {
uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
uint64_t ProbeSize) const;
+ MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, Register TargetReg,
+ bool IsRVV) const;
+
protected:
const RISCVSubtarget &STI;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 95f1deed8b6c02..512e0e8f098889 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -280,7 +280,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
MVT::i1, Promote);
// TODO: add all necessary setOperationAction calls.
- setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, XLenVT, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, XLenVT, Expand);
@@ -7684,6 +7684,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return emitFlushICache(DAG, Op.getOperand(0), Op.getOperand(1),
Op.getOperand(2), Flags, DL);
}
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::INIT_TRAMPOLINE:
return lowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE:
@@ -19598,6 +19600,8 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case RISCV::PseudoFROUND_D_INX:
case RISCV::PseudoFROUND_D_IN32X:
return emitFROUND(MI, BB, Subtarget);
+ case RISCV::PROBED_STACKALLOC_DYN:
+ return EmitDynamicProbedAlloc(MI, BB);
case TargetOpcode::STATEPOINT:
// STATEPOINT is a pseudo instruction which has no implicit defs/uses
// while jal call instruction (where statepoint will be lowered at the end)
@@ -20830,6 +20834,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SF_VC_V_IVW_SE)
NODE_NAME_CASE(SF_VC_V_VVW_SE)
NODE_NAME_CASE(SF_VC_V_FVW_SE)
+ NODE_NAME_CASE(PROBED_ALLOCA)
}
// clang-format on
return nullptr;
@@ -22559,3 +22564,101 @@ unsigned RISCVTargetLowering::getStackProbeSize(const MachineFunction &MF,
StackProbeSize = alignDown(StackProbeSize, StackAlign.value());
return StackProbeSize ? StackProbeSize : StackAlign.value();
}
+
+SDValue RISCVTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (!hasInlineStackProbe(MF))
+ return SDValue();
+
+ MVT XLenVT = Subtarget.getXLenVT();
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+ SDLoc dl(Op);
+ EVT VT = Node->getValueType(0);
+
+ // Construct the new SP value in a GPR.
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, RISCV::X2, XLenVT);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, XLenVT, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getSignedConstant(-(uint64_t)Align->value(), dl, VT));
+
+ // Set the real SP to the new value with a probing loop.
+ Chain = DAG.getNode(RISCVISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+ SDValue Ops[2] = {SP, Chain};
+ return DAG.getMergeValues(Ops, dl);
+}
+
+MachineBasicBlock *
+RISCVTargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction &MF = *MBB->getParent();
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ DebugLoc DL = MBB->findDebugLoc(MBBI);
+ Register TargetReg = MI.getOperand(1).getReg();
+
+ auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
+ const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
+ bool IsRV64 = Subtarget.is64Bit();
+ Align StackAlign = Subtarget.getFrameLowering()->getStackAlign();
+ const RISCVTargetLowering *TLI = Subtarget.getTargetLowering();
+ uint64_t ProbeSize = TLI->getStackProbeSize(MF, StackAlign);
+
+ MachineFunction::iterator MBBInsertPoint = std::next(MBB->getIterator());
+ MachineBasicBlock *LoopTestMBB =
+ MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF.insert(MBBInsertPoint, LoopTestMBB);
+ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF.insert(MBBInsertPoint, ExitMBB);
+ MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
+ Register SPReg = RISCV::X2;
+ Register ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
+
+ // ScratchReg = ProbeSize
+ TII->movImm(*MBB, MBBI, DL, ScratchReg, ProbeSize, Flags);
+
+ // LoopTest:
+ // SUB SP, SP, ProbeSize
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB), SPReg)
+ .addReg(SPReg)
+ .addReg(ScratchReg)
+ .setMIFlags(Flags);
+
+ // s[d|w] zero, 0(sp)
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL,
+ TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(Flags);
+
+ // BLT TargetReg, SP, LoopTest
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BLT))
+ .addReg(TargetReg)
+ .addReg(SPReg)
+ .addMBB(LoopTestMBB)
+ .setMIFlags(Flags);
+
+ // Adjust with: MV SP, TargetReg.
+ BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(RISCV::ADDI), SPReg)
+ .addReg(TargetReg)
+ .addImm(0)
+ .setMIFlags(Flags);
+
+ ExitMBB->splice(ExitMBB->end(), MBB, std::next(MBBI), MBB->end());
+
+ LoopTestMBB->addSuccessor(ExitMBB);
+ LoopTestMBB->addSuccessor(LoopTestMBB);
+ MBB->addSuccessor(LoopTestMBB);
+
+ MI.eraseFromParent();
+  return ExitMBB;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index ea077c7d2d23a5..752faa5778984e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -461,6 +461,10 @@ enum NodeType : unsigned {
SF_VC_V_VVW_SE,
SF_VC_V_FVW_SE,
+ // To avoid stack clash, allocation is performed by block and each block is
+ // probed.
+ PROBED_ALLOCA,
+
// RISC-V vector tuple type version of INSERT_SUBVECTOR/EXTRACT_SUBVECTOR.
TUPLE_INSERT,
TUPLE_EXTRACT,
@@ -922,6 +926,9 @@ class RISCVTargetLowering : public TargetLowering {
unsigned getStackProbeSize(const MachineFunction &MF, Align StackAlign) const;
+ MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
private:
void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
@@ -1015,6 +1022,8 @@ class RISCVTargetLowering : public TargetLowering {
SDValue lowerVectorStrictFSetcc(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index ee86f53a5c8a8d..05ff9c70cacd43 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -100,6 +100,11 @@ def riscv_add_tprel : SDNode<"RISCVISD::ADD_TPREL",
SDTCisSameAs<0, 3>,
SDTCisInt<0>]>>;
+def riscv_probed_alloca : SDNode<"RISCVISD::PROBED_ALLOCA",
+ SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
+ SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPMayStore]>;
+
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
@@ -1428,6 +1433,11 @@ def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
(ins GPR:$scratch),
[]>,
Sched<[]>;
+let usesCustomInserter = 1 in
+def PROBED_STACKALLOC_DYN : Pseudo<(outs GPR:$rd),
+ (ins GPR:$scratch),
+ [(set GPR:$rd, (riscv_probed_alloca GPR:$scratch))]>,
+ Sched<[]>;
}
/// HI and ADD_LO address nodes.
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
new file mode 100644
index 00000000000000..6dda0c38a070ac
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-dynamic.ll
@@ -0,0 +1,552 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
+; RUN: | FileCheck %s -check-prefix=RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
+; RUN: | FileCheck %s -check-prefix=RV32I
+
+; Tests copied from AArch64.
+
+; Dynamically-sized allocation, needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack, and also doesn't probe below the target SP value.
+define void @dynamic(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: addi s0, sp, 16
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB0_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a1)
+; RV64I-NEXT: addi sp, s0, -16
+; RV64I-NEXT: .cfi_def_cfa sp, 16
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: addi s0, sp, 16
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB0_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi sp, s0, -16
+; RV32I-NEXT: .cfi_def_cfa sp, 16
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v = alloca i8, i64 %size, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed size
+; slot isn't large enough that we would normally probe it, but we need to do so
+; here, otherwise the gap between the CSR save and the first probe of the
+; dynamic allocation could be too large when the size of the dynamic
+; allocation is close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+; RV64I-LABEL: dynamic_fixed:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -80
+; RV64I-NEXT: .cfi_def_cfa_offset 80
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: addi s0, sp, 80
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: addi a3, s0, -80
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: sd a3, 0(a1)
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: lui a1, 1
+; RV64I-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a1
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB1_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a2)
+; RV64I-NEXT: addi sp, s0, -80
+; RV64I-NEXT: .cfi_def_cfa sp, 80
+; RV64I-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: addi sp, sp, 80
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic_fixed:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -80
+; RV32I-NEXT: .cfi_def_cfa_offset 80
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: addi s0, sp, 80
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: addi a1, s0, -72
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: sw a1, 0(a2)
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB1_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a3)
+; RV32I-NEXT: addi sp, s0, -80
+; RV32I-NEXT: .cfi_def_cfa sp, 80
+; RV32I-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: addi sp, sp, 80
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v1 = alloca i8, i64 64, align 1
+ store ptr %v1, ptr %out1, align 8
+ %v2 = alloca i8, i64 %size, align 1
+ store ptr %v2, ptr %out2, align 8
+ ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by ANDing the target SP with a constant to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the prolog,
+; which isn't actually needed because the only aligned allocations are dynamic;
+; this is done even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_align_64:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -64
+; RV64I-NEXT: .cfi_def_cfa_offset 64
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: .cfi_offset s1, -24
+; RV64I-NEXT: addi s0, sp, 64
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: andi sp, sp, -64
+; RV64I-NEXT: mv s1, sp
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: andi a0, a0, -64
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB2_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a1)
+; RV64I-NEXT: addi sp, s0, -64
+; RV64I-NEXT: .cfi_def_cfa sp, 64
+; RV64I-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: .cfi_restore s1
+; RV64I-NEXT: addi sp, sp, 64
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic_align_64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -64
+; RV32I-NEXT: .cfi_def_cfa_offset 64
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 60(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 56(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 52(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: .cfi_offset s1, -12
+; RV32I-NEXT: addi s0, sp, 64
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: andi sp, sp, -64
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: andi a0, a0, -64
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB2_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi sp, s0, -64
+; RV32I-NEXT: .cfi_def_cfa sp, 64
+; RV32I-NEXT: lw ra, 60(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 56(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 52(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: .cfi_restore s1
+; RV32I-NEXT: addi sp, sp, 64
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v = alloca i8, i64 %size, align 64
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference to the dynamic allocation is the constant used for aligning
+; the target SP, the loop will probe the whole allocation without needing to
+; know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_align_8192:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -2032
+; RV64I-NEXT: .cfi_def_cfa_offset 2032
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 2008(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: .cfi_offset s1, -24
+; RV64I-NEXT: addi s0, sp, 2032
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: addi sp, sp, -2048
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: srli a2, sp, 13
+; RV64I-NEXT: slli sp, a2, 13
+; RV64I-NEXT: mv s1, sp
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: lui a2, 1048574
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: and a0, a0, a2
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB3_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a1)
+; RV64I-NEXT: addi sp, s0, -2032
+; RV64I-NEXT: .cfi_def_cfa sp, 2032
+; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 2008(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: .cfi_restore s1
+; RV64I-NEXT: addi sp, sp, 2032
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic_align_8192:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -2032
+; RV32I-NEXT: .cfi_def_cfa_offset 2032
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 2024(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 2020(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: .cfi_offset s1, -12
+; RV32I-NEXT: addi s0, sp, 2032
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: addi sp, sp, -2048
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: srli a1, sp, 13
+; RV32I-NEXT: slli sp, a1, 13
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: lui a1, 1048574
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: and a0, a0, a1
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB3_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi sp, s0, -2032
+; RV32I-NEXT: .cfi_def_cfa sp, 2032
+; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 2024(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 2020(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: .cfi_restore s1
+; RV32I-NEXT: addi sp, sp, 2032
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v = alloca i8, i64 %size, align 8192
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame. We need to probe
+; that space when it is larger than the unprobed space allowed by the ABI, so
+; this needs a very large number of arguments.
+define void @no_reserved_call_frame(i64 %n, i32 %dummy) #0 {
+; RV64I-LABEL: no_reserved_call_frame:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: addi s0, sp, 16
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: slli a0, a0, 2
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB4_1: # %entry
+; RV64I-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB4_1
+; RV64I-NEXT: # %bb.2: # %entry
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: call callee_stack_args
+; RV64I-NEXT: addi sp, s0, -16
+; RV64I-NEXT: .cfi_def_cfa sp, 16
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: no_reserved_call_frame:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: addi s0, sp, 16
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: mv a1, a2
+; RV32I-NEXT: slli a0, a0, 2
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: lui a2, 1
+; RV32I-NEXT: .LBB4_1: # %entry
+; RV32I-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a2
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB4_1
+; RV32I-NEXT: # %bb.2: # %entry
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: call callee_stack_args
+; RV32I-NEXT: addi sp, s0, -16
+; RV32I-NEXT: .cfi_def_cfa sp, 16
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+entry:
+ %v = alloca i32, i64 %n
+ call void @callee_stack_args(ptr %v, i32 %dummy)
+ ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n, i32 %dummy) #0 {
+; RV64I-LABEL: reserved_call_frame:
+; RV64I: # %bb.0: # %entry
+; RV64I-NEXT: addi sp, sp, -416
+; RV64I-NEXT: .cfi_def_cfa_offset 416
+; RV64I-NEXT: sd ra, 408(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: addi a0, sp, 8
+; RV64I-NEXT: call callee_stack_args
+; RV64I-NEXT: ld ra, 408(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: addi sp, sp, 416
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: reserved_call_frame:
+; RV32I: # %bb.0: # %entry
+; RV32I-NEXT: addi sp, sp, -416
+; RV32I-NEXT: .cfi_def_cfa_offset 416
+; RV32I-NEXT: sw ra, 412(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: mv a1, a2
+; RV32I-NEXT: addi a0, sp, 12
+; RV32I-NEXT: call callee_stack_args
+; RV32I-NEXT: lw ra, 412(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: addi sp, sp, 416
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+entry:
+ %v = alloca i32, i64 100
+ call void @callee_stack_args(ptr %v, i32 %dummy)
+ ret void
+}
+
+declare void @callee_stack_args(ptr, i32)
+
+; Dynamic allocation of vectors
+define void @dynamic_vector(i64 %size, ptr %out) #0 {
+; RV64I-LABEL: dynamic_vector:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: .cfi_def_cfa_offset 16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: addi s0, sp, 16
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: csrr a2, vlenb
+; RV64I-NEXT: mul a0, a2, a0
+; RV64I-NEXT: slli a0, a0, 1
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: lui a2, 1
+; RV64I-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB6_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: sd a0, 0(a1)
+; RV64I-NEXT: addi sp, s0, -16
+; RV64I-NEXT: .cfi_def_cfa sp, 16
+; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: addi sp, sp, 16
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: dynamic_vector:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: .cfi_def_cfa_offset 16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: addi s0, sp, 16
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: csrr a1, vlenb
+; RV32I-NEXT: mul a0, a1, a0
+; RV32I-NEXT: slli a0, a0, 1
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB6_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: sw a0, 0(a2)
+; RV32I-NEXT: addi sp, s0, -16
+; RV32I-NEXT: .cfi_def_cfa sp, 16
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %v = alloca <vscale x 4 x float>, i64 %size, align 16
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
diff --git a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
index 843e57a42d926d..b1c0755c36ec1f 100644
--- a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
+++ b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
@@ -606,4 +606,129 @@ define i32 @f10(i64 %i) local_unnamed_addr #0 {
ret i32 %c
}
+define void @f11(i32 %vla_size, i64 %i) #0 {
+; RV64I-LABEL: f11:
+; RV64I: # %bb.0:
+; RV64I-NEXT: addi sp, sp, -2032
+; RV64I-NEXT: .cfi_def_cfa_offset 2032
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64I-NEXT: sd s1, 2008(sp) # 8-byte Folded Spill
+; RV64I-NEXT: .cfi_offset ra, -8
+; RV64I-NEXT: .cfi_offset s0, -16
+; RV64I-NEXT: .cfi_offset s1, -24
+; RV64I-NEXT: addi s0, sp, 2032
+; RV64I-NEXT: .cfi_def_cfa s0, 0
+; RV64I-NEXT: lui a2, 15
+; RV64I-NEXT: sub t1, sp, a2
+; RV64I-NEXT: lui t2, 1
+; RV64I-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, t2
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: bne sp, t1, .LBB11_1
+; RV64I-NEXT: # %bb.2:
+; RV64I-NEXT: addi sp, sp, -2048
+; RV64I-NEXT: addi sp, sp, -16
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: srli a2, sp, 15
+; RV64I-NEXT: slli sp, a2, 15
+; RV64I-NEXT: mv s1, sp
+; RV64I-NEXT: slli a1, a1, 2
+; RV64I-NEXT: lui a2, 8
+; RV64I-NEXT: add a2, s1, a2
+; RV64I-NEXT: add a1, a2, a1
+; RV64I-NEXT: li a2, 1
+; RV64I-NEXT: slli a0, a0, 32
+; RV64I-NEXT: srli a0, a0, 32
+; RV64I-NEXT: sw a2, 0(a1)
+; RV64I-NEXT: addi a0, a0, 15
+; RV64I-NEXT: andi a0, a0, -16
+; RV64I-NEXT: sub a0, sp, a0
+; RV64I-NEXT: andi a0, a0, -2048
+; RV64I-NEXT: lui a1, 1
+; RV64I-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1
+; RV64I-NEXT: sub sp, sp, a1
+; RV64I-NEXT: sd zero, 0(sp)
+; RV64I-NEXT: blt a0, sp, .LBB11_3
+; RV64I-NEXT: # %bb.4:
+; RV64I-NEXT: mv sp, a0
+; RV64I-NEXT: lbu zero, 0(a0)
+; RV64I-NEXT: addi sp, s0, -2032
+; RV64I-NEXT: .cfi_def_cfa sp, 2032
+; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload
+; RV64I-NEXT: ld s1, 2008(sp) # 8-byte Folded Reload
+; RV64I-NEXT: .cfi_restore ra
+; RV64I-NEXT: .cfi_restore s0
+; RV64I-NEXT: .cfi_restore s1
+; RV64I-NEXT: addi sp, sp, 2032
+; RV64I-NEXT: .cfi_def_cfa_offset 0
+; RV64I-NEXT: ret
+;
+; RV32I-LABEL: f11:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -2032
+; RV32I-NEXT: .cfi_def_cfa_offset 2032
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: sw ra, 2028(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 2024(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s1, 2020(sp) # 4-byte Folded Spill
+; RV32I-NEXT: .cfi_offset ra, -4
+; RV32I-NEXT: .cfi_offset s0, -8
+; RV32I-NEXT: .cfi_offset s1, -12
+; RV32I-NEXT: addi s0, sp, 2032
+; RV32I-NEXT: .cfi_def_cfa s0, 0
+; RV32I-NEXT: lui a2, 15
+; RV32I-NEXT: sub t1, sp, a2
+; RV32I-NEXT: lui t2, 1
+; RV32I-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, t2
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: bne sp, t1, .LBB11_1
+; RV32I-NEXT: # %bb.2:
+; RV32I-NEXT: addi sp, sp, -2048
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: srli a2, sp, 15
+; RV32I-NEXT: slli sp, a2, 15
+; RV32I-NEXT: mv s1, sp
+; RV32I-NEXT: slli a1, a1, 2
+; RV32I-NEXT: lui a2, 8
+; RV32I-NEXT: add a2, s1, a2
+; RV32I-NEXT: add a1, a2, a1
+; RV32I-NEXT: li a2, 1
+; RV32I-NEXT: addi a0, a0, 15
+; RV32I-NEXT: andi a0, a0, -16
+; RV32I-NEXT: sw a2, 0(a1)
+; RV32I-NEXT: sub a0, sp, a0
+; RV32I-NEXT: andi a0, a0, -2048
+; RV32I-NEXT: lui a1, 1
+; RV32I-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1
+; RV32I-NEXT: sub sp, sp, a1
+; RV32I-NEXT: sw zero, 0(sp)
+; RV32I-NEXT: blt a0, sp, .LBB11_3
+; RV32I-NEXT: # %bb.4:
+; RV32I-NEXT: mv sp, a0
+; RV32I-NEXT: lbu zero, 0(a0)
+; RV32I-NEXT: addi sp, s0, -2032
+; RV32I-NEXT: .cfi_def_cfa sp, 2032
+; RV32I-NEXT: lw ra, 2028(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 2024(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s1, 2020(sp) # 4-byte Folded Reload
+; RV32I-NEXT: .cfi_restore ra
+; RV32I-NEXT: .cfi_restore s0
+; RV32I-NEXT: .cfi_restore s1
+; RV32I-NEXT: addi sp, sp, 2032
+; RV32I-NEXT: .cfi_def_cfa_offset 0
+; RV32I-NEXT: ret
+ %a = alloca i32, i32 4096, align 32768
+ %b = getelementptr inbounds i32, ptr %a, i64 %i
+ store volatile i32 1, ptr %b
+ %1 = zext i32 %vla_size to i64
+ %vla = alloca i8, i64 %1, align 2048
+ %2 = load volatile i8, ptr %vla, align 2048
+ ret void
+}
+
attributes #0 = { "probe-stack"="inline-asm" }