[clang] [llvm] [LoongArch] Add `-fstack-clash-protection` support (PR #195595)
Rong Mantle Bao via cfe-commits
cfe-commits at lists.llvm.org
Sat May 9 02:22:23 PDT 2026
https://github.com/CSharperMantle updated https://github.com/llvm/llvm-project/pull/195595
>From 3f866847db345d1b3fb1a0e86bfe09ad89c6e946 Mon Sep 17 00:00:00 2001
From: Rong Bao <rong.bao at csmantle.top>
Date: Sat, 2 May 2026 15:19:10 +0800
Subject: [PATCH 1/3] [LoongArch] Implement stack allocation probing
This implementation largely follows the pattern used in the RISC-V backend
and supports both constant-size and dynamic stack allocations.
---
.../LoongArch/LoongArchFrameLowering.cpp | 247 ++++++++++++++++--
.../Target/LoongArch/LoongArchFrameLowering.h | 8 +
.../LoongArch/LoongArchISelLowering.cpp | 124 ++++++++-
.../Target/LoongArch/LoongArchISelLowering.h | 9 +
.../Target/LoongArch/LoongArchInstrInfo.td | 22 ++
.../LoongArch/LoongArchMachineFunctionInfo.h | 5 +
6 files changed, 393 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index 690b0639484d0..0aba94383f546 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -15,6 +15,8 @@
#include "LoongArchSubtarget.h"
#include "MCTargetDesc/LoongArchBaseInfo.h"
#include "MCTargetDesc/LoongArchMCTargetDesc.h"
+#include "llvm/CodeGen/CFIInstBuilder.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -182,6 +184,118 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized(
}
}
+// Lower SP by Offset bytes before MBBI; when probing is needed, SP moves in ProbeSize steps with a zero store at each new SP.
+void LoongArchFrameLowering::allocateStack(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineFunction &MF, uint64_t Offset, // Offset: bytes allocated by this call
+ uint64_t RealStackSize, bool EmitCFI, // RealStackSize: CFA offset reported once this allocation is done
+ bool NeedProbe, uint64_t ProbeSize, // ProbeSize: largest SP step made without a probe store
+ bool DynAllocation, // DynAllocation: also probe after a final step smaller than ProbeSize
+ MachineInstr::MIFlag Flag) const { // Flag: applied to the SP adjustments and probe stores
+ DebugLoc DL;
+ const LoongArchInstrInfo *TII = STI.getInstrInfo();
+ const bool IsLA64 = STI.is64Bit();
+ const Register SPReg = LoongArch::R3;
+ CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); // CFI is tagged FrameSetup regardless of Flag.
+
+ // Simply allocate the stack if it's not big enough to require a probe.
+ if (!NeedProbe || Offset <= ProbeSize) {
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, -Offset, Flag);
+ if (EmitCFI)
+ CFIBuilder.buildDefCFAOffset(RealStackSize);
+
+ if (NeedProbe && DynAllocation) { // Probing is on and the function allocates dynamically: touch the new SP.
+ // st.{w/d} $zero, $sp, 0
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W))
+ .addReg(LoongArch::R0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlag(Flag);
+ }
+
+ return;
+ }
+
+ // Fewer than five ProbeSize chunks: emit the adjust/probe pairs unrolled.
+ if (Offset < ProbeSize * 5) {
+ const uint64_t CFAAdjust = RealStackSize - Offset; // Part of RealStackSize not covered by this call's Offset.
+
+ uint64_t CurrentOffset = 0;
+ while (CurrentOffset + ProbeSize <= Offset) { // One adjust + probe store per whole ProbeSize chunk.
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, -ProbeSize, Flag);
+ // st.{w/d} $zero, $sp, 0
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W))
+ .addReg(LoongArch::R0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlag(Flag);
+
+ CurrentOffset += ProbeSize;
+ if (EmitCFI)
+ CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust);
+ }
+
+ const uint64_t Residual = Offset - CurrentOffset;
+ if (Residual) { // Remainder smaller than ProbeSize; probed only under DynAllocation.
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, -Residual, Flag);
+ if (EmitCFI)
+ CFIBuilder.buildDefCFAOffset(RealStackSize);
+
+ if (DynAllocation) {
+ // st.{w/d} $zero, $sp, 0
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W))
+ .addReg(LoongArch::R0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlag(Flag);
+ }
+ }
+ return;
+ }
+
+ // Emit a variable-length allocation probing loop.
+ const uint64_t RoundedSize = alignDown(Offset, ProbeSize); // Whole ProbeSize chunks go through the loop.
+ const uint64_t Residual = Offset - RoundedSize;
+ const uint64_t CFAAdjust = RealStackSize - Offset;
+
+ const Register TargetReg = LoongArch::R13; // Holds the SP value at which the probe loop stops.
+ // SUB TargetReg, $sp, RoundedSize
+ adjustReg(MBB, MBBI, DL, TargetReg, SPReg, -RoundedSize, Flag);
+
+ if (EmitCFI) {
+ // Set the CFA register to TargetReg, which stays fixed while SP moves in the loop.
+ CFIBuilder.buildDefCFA(TargetReg, RoundedSize + CFAAdjust);
+ }
+
+ // It will be expanded to a probe loop in inlineStackProbe().
+ BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PROBED_STACKALLOC))
+ .addReg(TargetReg);
+
+ if (EmitCFI) {
+ // Set the CFA register back to SP.
+ CFIBuilder.buildDefCFARegister(SPReg);
+ }
+
+ if (Residual) {
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, -Residual, Flag);
+ if (DynAllocation) {
+ // st.{w/d} $zero, $sp, 0
+ BuildMI(MBB, MBBI, DL,
+ TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W))
+ .addReg(LoongArch::R0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlag(Flag);
+ }
+ }
+
+ if (EmitCFI) // Final CFA offset, after any residual adjustment.
+ CFIBuilder.buildDefCFAOffset(RealStackSize);
+}
+
void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -218,13 +332,15 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
StackSize = FirstSPAdjustAmount;
// Adjust stack.
- adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
- // Emit ".cfi_def_cfa_offset StackSize".
- unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlag(MachineInstr::FrameSetup);
+ const LoongArchTargetLowering *TLI = STI.getTargetLowering();
+ const bool NeedProbe = TLI->hasInlineStackProbe(MF);
+ const uint64_t ProbeSize = TLI->getStackProbeSize(MF, getStackAlign());
+ const bool DynAllocation =
+ MF.getInfo<LoongArchMachineFunctionInfo>()->hasDynamicAllocation();
+ if (StackSize != 0)
+ allocateStack(MBB, MBBI, MF, StackSize, StackSize,
+ /*EmitCFI=*/true, NeedProbe, ProbeSize, DynAllocation,
+ MachineInstr::FrameSetup);
const auto &CSI = MFI.getCalleeSavedInfo();
@@ -265,19 +381,9 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF,
uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount;
assert(SecondSPAdjustAmount > 0 &&
"SecondSPAdjustAmount should be greater than zero");
- adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount,
- MachineInstr::FrameSetup);
-
- if (!hasFP(MF)) {
- // If we are using a frame-pointer, and thus emitted ".cfi_def_cfa fp, 0",
- // don't emit an sp-based .cfi_def_cfa_offset
- // Emit ".cfi_def_cfa_offset RealStackSize"
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlag(MachineInstr::FrameSetup);
- }
+ allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount, RealStackSize,
+ !hasFP(MF), NeedProbe, ProbeSize, DynAllocation,
+ MachineInstr::FrameSetup);
}
if (hasFP(MF)) {
@@ -353,6 +459,89 @@ void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF,
adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy);
}
+// Synthesize the probe loop for the pseudo at MBBI: lower SP by ProbeSize, store zero at SP, repeat until SP == TargetReg.
+static void emitStackProbeInline(MachineBasicBlock::iterator MBBI, DebugLoc DL,
+ Register TargetReg) {
+ assert(TargetReg != LoongArch::R3 &&
+ "New top of stack cannot already be in $sp");
+
+ MachineBasicBlock &MBB = *MBBI->getParent();
+ MachineFunction &MF = *MBB.getParent();
+
+ const LoongArchSubtarget &STI = MF.getSubtarget<LoongArchSubtarget>();
+ const LoongArchInstrInfo *TII = STI.getInstrInfo();
+ const bool IsLA64 = STI.is64Bit();
+ const Align StackAlign = STI.getFrameLowering()->getStackAlign();
+ const LoongArchTargetLowering *TLI = STI.getTargetLowering();
+ const uint64_t ProbeSize = TLI->getStackProbeSize(MF, StackAlign);
+
+ MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); // New blocks are placed right after MBB.
+ MachineBasicBlock *LoopTestMBB =
+ MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+ MF.insert(MBBInsertPoint, LoopTestMBB);
+ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+ MF.insert(MBBInsertPoint, ExitMBB);
+ const Register SPReg = LoongArch::R3;
+ const Register ScratchReg = LoongArch::R14; // Holds ProbeSize for the loop's subtract.
+ const MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
+
+ // ScratchReg = ProbeSize (emitted in MBB at MBBI, outside the loop)
+ TII->movImm(MBB, MBBI, DL, ScratchReg, ProbeSize, Flags);
+
+ // LoopTest:
+ // sub.{w/d} $sp, $sp, ScratchReg
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL,
+ TII->get(IsLA64 ? LoongArch::SUB_D : LoongArch::SUB_W), SPReg)
+ .addReg(SPReg)
+ .addReg(ScratchReg)
+ .setMIFlag(Flags);
+
+ // st.{w/d} $zero, $sp, 0
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL,
+ TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W))
+ .addReg(LoongArch::R0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlag(Flags);
+
+ // bne $sp, TargetReg, LoopTest
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(LoongArch::BNE))
+ .addReg(SPReg)
+ .addReg(TargetReg)
+ .addMBB(LoopTestMBB)
+ .setMIFlag(Flags);
+
+ ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); // Everything after MBBI moves to ExitMBB.
+ ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+ LoopTestMBB->addSuccessor(ExitMBB);
+ LoopTestMBB->addSuccessor(LoopTestMBB);
+ MBB.addSuccessor(LoopTestMBB);
+ // Update liveins of the two new blocks.
+ fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
+}
+
+void LoongArchFrameLowering::inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ // Expands every PROBED_STACKALLOC in MBB into an inline probe loop. We emit
+ // at most two of these. Collect them first so the block is not traversed
+ // while emitStackProbeInline is creating more blocks.
+ SmallVector<MachineInstr *, 2> ToReplace;
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() == LoongArch::PROBED_STACKALLOC) {
+ ToReplace.push_back(&MI);
+ }
+ }
+
+ for (MachineInstr *MI : ToReplace) {
+ MachineBasicBlock::iterator MBBI = MI->getIterator();
+ DebugLoc DL = MBB.findDebugLoc(MBBI);
+ Register TargetReg = MI->getOperand(0).getReg(); // Operand 0 is the SP value the loop stops at.
+ emitStackProbeInline(MBBI, DL, TargetReg);
+ MBBI->eraseFromParent(); // Drop the pseudo once its loop has been emitted.
+ }
+}
+
// We would like to split the SP adjustment to reduce prologue/epilogue
// as following instructions. In this way, the offset of the callee saved
// register could fit in a single store.
@@ -425,7 +614,23 @@ LoongArchFrameLowering::eliminateCallFramePseudoInstr(
if (MI->getOpcode() == LoongArch::ADJCALLSTACKDOWN)
Amount = -Amount;
- adjustReg(MBB, MI, DL, SPReg, SPReg, Amount, MachineInstr::NoFlags);
+ const LoongArchTargetLowering *TLI =
+ MF.getSubtarget<LoongArchSubtarget>().getTargetLowering();
+ const int64_t ProbeSize = TLI->getStackProbeSize(MF, getStackAlign());
+ if (TLI->hasInlineStackProbe(MF) && -Amount >= ProbeSize) {
+ // When stack probing is enabled, the decrement of SP may need to be
+ // probed. We can handle both the decrement and the probing in
+ // allocateStack.
+ const bool DynAllocation =
+ MF.getInfo<LoongArchMachineFunctionInfo>()->hasDynamicAllocation();
+ allocateStack(MBB, MI, MF, -Amount, -Amount,
+ MF.needsFrameMoves() && !hasFP(MF),
+ /*NeedProbe=*/true, ProbeSize, DynAllocation,
+ MachineInstr::NoFlags);
+ inlineStackProbe(MF, MBB);
+ } else {
+ adjustReg(MBB, MI, DL, SPReg, SPReg, Amount, MachineInstr::NoFlags);
+ }
}
}
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
index 6cbfcf665f6a9..8a540986e9d70 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h
@@ -55,11 +55,19 @@ class LoongArchFrameLowering : public TargetFrameLowering {
bool enableShrinkWrapping(const MachineFunction &MF) const override;
+ void inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologueMBB) const override;
+
protected:
bool hasFPImpl(const MachineFunction &MF) const override;
private:
void determineFrameLayout(MachineFunction &MF) const;
+ void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineFunction &MF, uint64_t Offset,
+ uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
+ uint64_t ProbeSize, bool DynAllocation,
+ MachineInstr::MIFlag Flag) const;
void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, Register DestReg, Register SrcReg,
int64_t Val, MachineInstr::MIFlag Flag) const;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index e3bdf2b993036..0d18c9c86a18a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -120,7 +120,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EH_DWARF_CFA, GRLenVT, Custom);
- setOperationAction(ISD::DYNAMIC_STACKALLOC, GRLenVT, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, GRLenVT, Custom);
setOperationAction({ISD::STACKSAVE, ISD::STACKRESTORE}, MVT::Other, Expand);
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction({ISD::VAARG, ISD::VACOPY, ISD::VAEND}, MVT::Other, Expand);
@@ -659,6 +659,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
return lowerFP_EXTEND(Op, DAG);
case ISD::SIGN_EXTEND_VECTOR_INREG:
return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return lowerDYNAMIC_STACKALLOC(Op, DAG);
}
return SDValue();
}
@@ -8795,6 +8797,8 @@ MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter(
if (!Subtarget.is64Bit())
report_fatal_error("STATEPOINT is only supported on 64-bit targets");
return emitPatchPoint(MI, BB);
+ case LoongArch::PROBED_STACKALLOC_DYN:
+ return emitDynamicProbedAlloc(MI, BB);
}
}
@@ -11014,3 +11018,121 @@ bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT,
// Extract a scalar FP value from index 0 of a vector is free.
return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
}
+
+bool LoongArchTargetLowering::hasInlineStackProbe(
+ const MachineFunction &MF) const {
+
+ // Inline probes are used only when "probe-stack" is present and its value is "inline-asm".
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+
+ return false; // No "probe-stack" attribute on the function.
+}
+
+unsigned LoongArchTargetLowering::getStackProbeSize(const MachineFunction &MF,
+ Align StackAlign) const {
+ // Returns the "stack-probe-size" attribute value (4096 when the attribute is
+ // absent), rounded down to StackAlign and never smaller than StackAlign.
+ const Function &Fn = MF.getFunction();
+ unsigned StackProbeSize =
+ Fn.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
+ // Round down to the stack alignment.
+ StackProbeSize = alignDown(StackProbeSize, StackAlign.value());
+ return StackProbeSize ? StackProbeSize : StackAlign.value(); // A size below the alignment rounds to 0; use the alignment instead.
+}
+
+SDValue
+LoongArchTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (!hasInlineStackProbe(MF))
+ return SDValue(); // Nothing custom to build unless inline probing is enabled.
+ // With inline probing: compute SP - Size (optionally aligned down) and pass it to a PROBED_ALLOCA node.
+ const MVT GRLenVT = Subtarget.getGRLenVT();
+ // Get the inputs: operand 0 is the chain, 1 the size, 2 the alignment constant.
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+
+ const MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+ const SDLoc dl(Op);
+ const EVT VT = Op.getValueType();
+
+ // Construct the new SP value in a GPR.
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, LoongArch::R3, GRLenVT);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, GRLenVT, SP, Size);
+ if (Align) // Align the new SP down by masking with -alignment.
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getSignedConstant(-Align->value(), dl, VT));
+
+ // Set the real SP to the new value with a probing loop.
+ Chain = DAG.getNode(LoongArchISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+ return DAG.getMergeValues({SP, Chain}, dl); // Results: the new SP value and the updated chain.
+}
+
+MachineBasicBlock *
+LoongArchTargetLowering::emitDynamicProbedAlloc(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction &MF = *MBB->getParent();
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ DebugLoc DL = MBB->findDebugLoc(MBBI);
+ const Register TargetReg = MI.getOperand(0).getReg(); // The SP value requested by the pseudo.
+ // Replaces MI with: MBB -> LoopTestMBB (probe loop) -> ExitMBB ($sp = TargetReg, then the rest of MBB).
+ const LoongArchInstrInfo *TII = Subtarget.getInstrInfo();
+ const bool IsLA64 = Subtarget.is64Bit();
+ const Align StackAlign = Subtarget.getFrameLowering()->getStackAlign();
+ const LoongArchTargetLowering *TLI = Subtarget.getTargetLowering();
+ const uint64_t ProbeSize = TLI->getStackProbeSize(MF, StackAlign);
+
+ MachineFunction::iterator MBBInsertPoint = std::next(MBB->getIterator());
+ MachineBasicBlock *const LoopTestMBB =
+ MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF.insert(MBBInsertPoint, LoopTestMBB);
+ MachineBasicBlock *const ExitMBB =
+ MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF.insert(MBBInsertPoint, ExitMBB);
+ const Register SPReg = LoongArch::R3;
+ const Register ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+
+ // ScratchReg = ProbeSize (emitted in MBB at MBBI, outside the loop)
+ TII->movImm(*MBB, MBBI, DL, ScratchReg, ProbeSize, MachineInstr::NoFlags);
+
+ // LoopTest:
+ // sub.{w/d} $sp, $sp, ScratchReg
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL,
+ TII->get(IsLA64 ? LoongArch::SUB_D : LoongArch::SUB_W), SPReg)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+
+ // st.{w/d} $zero, $sp, 0
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL,
+ TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W))
+ .addReg(LoongArch::R0)
+ .addReg(SPReg)
+ .addImm(0);
+
+ // bltu TargetReg, $sp, LoopTest
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(LoongArch::BLTU))
+ .addReg(TargetReg)
+ .addReg(SPReg)
+ .addMBB(LoopTestMBB);
+
+ // move $sp, TargetReg (the loop steps by ProbeSize, so SP is set to the exact target here)
+ BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(LoongArch::OR), SPReg)
+ .addReg(TargetReg)
+ .addReg(LoongArch::R0);
+
+ ExitMBB->splice(ExitMBB->end(), MBB, std::next(MBBI), MBB->end()); // Everything after MI moves to ExitMBB.
+ ExitMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ LoopTestMBB->addSuccessor(ExitMBB);
+ LoopTestMBB->addSuccessor(LoopTestMBB);
+ MBB->addSuccessor(LoopTestMBB);
+
+ MI.eraseFromParent();
+ MF.getInfo<LoongArchMachineFunctionInfo>()->setDynamicAllocation(); // Read back in frame lowering as DynAllocation.
+ return ExitMBB->begin()->getParent(); // ExitMBB holds at least the move above, so this is ExitMBB.
+}
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 4cc1aa2261ecc..189ecbe4820d2 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -179,6 +179,14 @@ class LoongArchTargetLowering : public TargetLowering {
isImmVLDILegalForMode1(const APInt &SplatValue,
const unsigned SplatBitSize) const;
+ /// True if stack clash protection is enabled for this function.
+ bool hasInlineStackProbe(const MachineFunction &MF) const override;
+
+ unsigned getStackProbeSize(const MachineFunction &MF, Align StackAlign) const;
+
+ MachineBasicBlock *emitDynamicProbedAlloc(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
private:
/// Target-specific function used to lower LoongArch calling conventions.
typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI,
@@ -250,6 +258,7 @@ class LoongArchTargetLowering : public TargetLowering {
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 54d6e51d4d6db..6f78a6d8be41b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -76,6 +76,12 @@ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+// To avoid stack clash, allocation is performed by block and each block is
+// probed.
+def loongarch_probed_alloca : SDNode<"LoongArchISD::PROBED_ALLOCA",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPMayStore]>;
+
// Target-dependent nodes.
def loongarch_call : SDNode<"LoongArchISD::CALL", SDT_LoongArchCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
@@ -2456,6 +2462,22 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(callseq_end timm:$amt1, timm:$amt2)]>;
} // Defs = [R3], Uses = [R3]
+// Stack probing pseudos: each takes the target SP value in a GPR and both defines and uses R3.
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
+ Defs = [R3], Uses = [R3] in {
+// Probed stack allocation of a constant size, used in function prologues when
+// stack-clash protection is enabled.
+def PROBED_STACKALLOC : Pseudo<(outs),
+ (ins GPR:$target),
+ []>,
+ Sched<[]>;
+let usesCustomInserter = 1 in // Dynamic-size variant, selected from loongarch_probed_alloca and expanded by emitDynamicProbedAlloc.
+def PROBED_STACKALLOC_DYN : Pseudo<(outs),
+ (ins GPR:$target),
+ [(loongarch_probed_alloca GPR:$target)]>,
+ Sched<[]>;
+}
+
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
index 904985c189dba..8e0a7f052961e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
@@ -43,6 +43,8 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
/// `annotate-tablejump` option.
SmallVector<std::pair<MachineInstr *, int>, 4> JumpInfos;
+ bool HasDynamicAllocation = false;
+
public:
LoongArchMachineFunctionInfo(const Function &F,
const TargetSubtargetInfo *STI) {}
@@ -82,6 +84,9 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo {
unsigned getJumpInfoSize() { return JumpInfos.size(); }
MachineInstr *getJumpInfoJrMI(unsigned Idx) { return JumpInfos[Idx].first; }
int getJumpInfoJTIIndex(unsigned Idx) { return JumpInfos[Idx].second; }
+
+ bool hasDynamicAllocation() const { return HasDynamicAllocation; }
+ void setDynamicAllocation() { HasDynamicAllocation = true; }
};
} // end namespace llvm
>From 7588e6b93fe90fad4b949b7a9646a60b9e1b2e56 Mon Sep 17 00:00:00 2001
From: Rong Bao <rong.bao at csmantle.top>
Date: Sat, 2 May 2026 15:15:52 +0800
Subject: [PATCH 2/3] [clang][LoongArch] Render stack-clash-protection flag on
LoongArch
---
clang/lib/Driver/ToolChains/Clang.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 92b3045dceff2..1777228d91c94 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -3646,7 +3646,7 @@ static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args,
if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() &&
!EffectiveTriple.isPPC64() && !EffectiveTriple.isAArch64() &&
- !EffectiveTriple.isRISCV())
+ !EffectiveTriple.isRISCV() && !EffectiveTriple.isLoongArch())
return;
Args.addOptInFlag(CmdArgs, options::OPT_fstack_clash_protection,
>From 2d3873f40e6a00960f50a21e911b54b469500f64 Mon Sep 17 00:00:00 2001
From: Rong Bao <rong.bao at csmantle.top>
Date: Sat, 2 May 2026 20:29:15 +0800
Subject: [PATCH 3/3] [test][LoongArch] Add and update tests for stack clash
protection
---
clang/test/CodeGen/stack-clash-protection.c | 1 +
.../LoongArch/inline-asm-constraint-f.ll | 2 -
.../stack-clash-prologue-nounwind.ll | 351 +++++++++
.../CodeGen/LoongArch/stack-clash-prologue.ll | 714 ++++++++++++++++++
.../stack-probing-dynamic-nonentry.ll | 109 +++
.../LoongArch/stack-probing-dynamic.ll | 479 ++++++++++++
.../LoongArch/stack-probing-frame-setup.mir | 185 +++++
7 files changed, 1839 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/LoongArch/stack-clash-prologue-nounwind.ll
create mode 100644 llvm/test/CodeGen/LoongArch/stack-clash-prologue.ll
create mode 100644 llvm/test/CodeGen/LoongArch/stack-probing-dynamic-nonentry.ll
create mode 100644 llvm/test/CodeGen/LoongArch/stack-probing-dynamic.ll
create mode 100644 llvm/test/CodeGen/LoongArch/stack-probing-frame-setup.mir
diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c
index b07e4c4ce9084..b00cd46f8d24b 100644
--- a/clang/test/CodeGen/stack-clash-protection.c
+++ b/clang/test/CodeGen/stack-clash-protection.c
@@ -4,6 +4,7 @@
// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
// RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
+// RUN: %clang_cc1 -triple loongarch64-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s
// CHECK: define{{.*}} void @large_stack() #[[A:.*]] {
void large_stack(void) {
diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll
index b5f1c23a95207..9d66957037938 100644
--- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll
+++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll
@@ -33,7 +33,6 @@ define double @constraint_f_double(double %a) nounwind {
define double @constraint_gpr(double %a) {
; LA32-LABEL: constraint_gpr:
; LA32: # %bb.0:
-; LA32-NEXT: .cfi_def_cfa_offset 0
; LA32-NEXT: movfr2gr.s $a7, $fa0
; LA32-NEXT: movfrh2gr.s $t0, $fa0
; LA32-NEXT: #APP
@@ -45,7 +44,6 @@ define double @constraint_gpr(double %a) {
;
; LA64-LABEL: constraint_gpr:
; LA64: # %bb.0:
-; LA64-NEXT: .cfi_def_cfa_offset 0
; LA64-NEXT: movfr2gr.d $a7, $fa0
; LA64-NEXT: #APP
; LA64-NEXT: move $a6, $a7
diff --git a/llvm/test/CodeGen/LoongArch/stack-clash-prologue-nounwind.ll b/llvm/test/CodeGen/LoongArch/stack-clash-prologue-nounwind.ll
new file mode 100644
index 0000000000000..4e1b745a2e041
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/stack-clash-prologue-nounwind.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=loongarch64-unknown-linux-gnu -O2 < %s -verify-machineinstrs | FileCheck %s -check-prefix=LA64
+; RUN: llc -mtriple=loongarch32-unknown-linux-gnu -O2 < %s -verify-machineinstrs | FileCheck %s -check-prefix=LA32
+
+; Tests copied from PowerPC.
+
+; Free probe
+define i8 @f0() #0 nounwind {
+;
+; LA64-LABEL: f0:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f0:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -64
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 0
+; LA32-NEXT: ld.b $a0, $sp, 0
+; LA32-NEXT: addi.w $sp, $sp, 64
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 64
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f1() #0 nounwind {
+;
+; LA64-LABEL: f1:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 1
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f1:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 1
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 4096
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f2() #0 nounwind {
+;
+; LA64-LABEL: f2:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 16
+; LA64-NEXT: sub.d $t1, $sp, $a0
+; LA64-NEXT: lu12i.w $t2, 1
+; LA64-NEXT: .LBB2_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB2_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 16
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f2:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 16
+; LA32-NEXT: sub.w $t1, $sp, $a0
+; LA32-NEXT: lu12i.w $t2, 1
+; LA32-NEXT: .LBB2_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB2_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 16
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 65536
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f3() #0 "stack-probe-size"="32768" nounwind {
+;
+; LA64-LABEL: f3:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 8
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: lu12i.w $a0, 8
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 16
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f3:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 8
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: lu12i.w $a0, 8
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 16
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 65536
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+; Same as f2, but without protection.
+define i8 @f4() nounwind {
+;
+; LA64-LABEL: f4:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 16
+; LA64-NEXT: ori $a0, $a0, 16
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 16
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f4:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 16
+; LA32-NEXT: ori $a0, $a0, 16
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 16
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 65536
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f5() #0 "stack-probe-size"="65536" nounwind {
+;
+; LA64-LABEL: f5:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 256
+; LA64-NEXT: sub.d $t1, $sp, $a0
+; LA64-NEXT: lu12i.w $t2, 16
+; LA64-NEXT: .LBB5_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB5_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 256
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f5:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 256
+; LA32-NEXT: sub.w $t1, $sp, $a0
+; LA32-NEXT: lu12i.w $t2, 16
+; LA32-NEXT: .LBB5_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB5_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 256
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 1048576
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f6() #0 nounwind {
+;
+; LA64-LABEL: f6:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 262144
+; LA64-NEXT: sub.d $t1, $sp, $a0
+; LA64-NEXT: lu12i.w $t2, 1
+; LA64-NEXT: .LBB6_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB6_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 262144
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f6:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 262144
+; LA32-NEXT: sub.w $t1, $sp, $a0
+; LA32-NEXT: lu12i.w $t2, 1
+; LA32-NEXT: .LBB6_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB6_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 262144
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 1073741824
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f7() #0 "stack-probe-size"="65536" nounwind {
+;
+; LA64-LABEL: f7:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 244128
+; LA64-NEXT: sub.d $t1, $sp, $a0
+; LA64-NEXT: lu12i.w $t2, 16
+; LA64-NEXT: .LBB7_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB7_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: lu12i.w $a0, 12
+; LA64-NEXT: ori $a0, $a0, 2576
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 9
+; LA64-NEXT: ld.b $a0, $sp, 9
+; LA64-NEXT: lu12i.w $a1, 244140
+; LA64-NEXT: ori $a1, $a1, 2576
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f7:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 244128
+; LA32-NEXT: sub.w $t1, $sp, $a0
+; LA32-NEXT: lu12i.w $t2, 16
+; LA32-NEXT: .LBB7_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB7_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: lu12i.w $a0, 12
+; LA32-NEXT: ori $a0, $a0, 2576
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 9
+; LA32-NEXT: ld.b $a0, $sp, 9
+; LA32-NEXT: lu12i.w $a1, 244140
+; LA32-NEXT: ori $a1, $a1, 2576
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 1000000007
+ %b = getelementptr inbounds i8, ptr %a, i64 101
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+attributes #0 = { "probe-stack"="inline-asm" }
diff --git a/llvm/test/CodeGen/LoongArch/stack-clash-prologue.ll b/llvm/test/CodeGen/LoongArch/stack-clash-prologue.ll
new file mode 100644
index 0000000000000..d42f3c861ec7b
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/stack-clash-prologue.ll
@@ -0,0 +1,714 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=loongarch64 -O2 < %s | FileCheck %s -check-prefix=LA64
+; RUN: llc -mtriple=loongarch32 -O2 < %s | FileCheck %s -check-prefix=LA32
+
+; Tests copied from PowerPC.
+
+; Free probe
+define i8 @f0() #0 {
+;
+; LA64-LABEL: f0:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: .cfi_def_cfa_offset 80
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f0:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -64
+; LA32-NEXT: .cfi_def_cfa_offset 64
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 0
+; LA32-NEXT: ld.b $a0, $sp, 0
+; LA32-NEXT: addi.w $sp, $sp, 64
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 64
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f1() #0 {
+;
+; LA64-LABEL: f1:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 1
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: .cfi_def_cfa_offset 4096
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: .cfi_def_cfa_offset 4112
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f1:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 1
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: .cfi_def_cfa_offset 4096
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: .cfi_def_cfa_offset 4112
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 4096
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f2() #0 {
+;
+; LA64-LABEL: f2:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 16
+; LA64-NEXT: sub.d $t1, $sp, $a0
+; LA64-NEXT: .cfi_def_cfa 13, 65536
+; LA64-NEXT: lu12i.w $t2, 1
+; LA64-NEXT: .LBB2_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB2_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: .cfi_def_cfa_register 3
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: .cfi_def_cfa_offset 65552
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 16
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f2:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 16
+; LA32-NEXT: sub.w $t1, $sp, $a0
+; LA32-NEXT: .cfi_def_cfa 13, 65536
+; LA32-NEXT: lu12i.w $t2, 1
+; LA32-NEXT: .LBB2_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB2_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: .cfi_def_cfa_register 3
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: .cfi_def_cfa_offset 65552
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 16
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 65536
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f3() #0 "stack-probe-size"="32768" {
+;
+; LA64-LABEL: f3:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 8
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: .cfi_def_cfa_offset 32768
+; LA64-NEXT: lu12i.w $a0, 8
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: .cfi_def_cfa_offset 65536
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: .cfi_def_cfa_offset 65552
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 16
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f3:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 8
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: .cfi_def_cfa_offset 32768
+; LA32-NEXT: lu12i.w $a0, 8
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: .cfi_def_cfa_offset 65536
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: .cfi_def_cfa_offset 65552
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 16
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 65536
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+; Same as f2, but without protection.
+define i8 @f4() {
+;
+; LA64-LABEL: f4:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 16
+; LA64-NEXT: ori $a0, $a0, 16
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: .cfi_def_cfa_offset 65552
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 16
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f4:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 16
+; LA32-NEXT: ori $a0, $a0, 16
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: .cfi_def_cfa_offset 65552
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 16
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 65536
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f5() #0 "stack-probe-size"="65536" {
+;
+; LA64-LABEL: f5:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 256
+; LA64-NEXT: sub.d $t1, $sp, $a0
+; LA64-NEXT: .cfi_def_cfa 13, 1048576
+; LA64-NEXT: lu12i.w $t2, 16
+; LA64-NEXT: .LBB5_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB5_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: .cfi_def_cfa_register 3
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: .cfi_def_cfa_offset 1048592
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 256
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f5:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 256
+; LA32-NEXT: sub.w $t1, $sp, $a0
+; LA32-NEXT: .cfi_def_cfa 13, 1048576
+; LA32-NEXT: lu12i.w $t2, 16
+; LA32-NEXT: .LBB5_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB5_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: .cfi_def_cfa_register 3
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: .cfi_def_cfa_offset 1048592
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 256
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 1048576
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f6() #0 {
+;
+; LA64-LABEL: f6:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 262144
+; LA64-NEXT: sub.d $t1, $sp, $a0
+; LA64-NEXT: .cfi_def_cfa 13, 1073741824
+; LA64-NEXT: lu12i.w $t2, 1
+; LA64-NEXT: .LBB6_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB6_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: .cfi_def_cfa_register 3
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: .cfi_def_cfa_offset 1073741840
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 16
+; LA64-NEXT: ld.b $a0, $sp, 16
+; LA64-NEXT: lu12i.w $a1, 262144
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f6:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 262144
+; LA32-NEXT: sub.w $t1, $sp, $a0
+; LA32-NEXT: .cfi_def_cfa 13, 1073741824
+; LA32-NEXT: lu12i.w $t2, 1
+; LA32-NEXT: .LBB6_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB6_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: .cfi_def_cfa_register 3
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: .cfi_def_cfa_offset 1073741840
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 16
+; LA32-NEXT: ld.b $a0, $sp, 16
+; LA32-NEXT: lu12i.w $a1, 262144
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 1073741824
+ %b = getelementptr inbounds i8, ptr %a, i64 63
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+define i8 @f7() #0 "stack-probe-size"="65536" {
+;
+; LA64-LABEL: f7:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: lu12i.w $a0, 244128
+; LA64-NEXT: sub.d $t1, $sp, $a0
+; LA64-NEXT: .cfi_def_cfa 13, 999948288
+; LA64-NEXT: lu12i.w $t2, 16
+; LA64-NEXT: .LBB7_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB7_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: .cfi_def_cfa_register 3
+; LA64-NEXT: lu12i.w $a0, 12
+; LA64-NEXT: ori $a0, $a0, 2576
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: .cfi_def_cfa_offset 1000000016
+; LA64-NEXT: ori $a0, $zero, 3
+; LA64-NEXT: st.b $a0, $sp, 9
+; LA64-NEXT: ld.b $a0, $sp, 9
+; LA64-NEXT: lu12i.w $a1, 244140
+; LA64-NEXT: ori $a1, $a1, 2576
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f7:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: lu12i.w $a0, 244128
+; LA32-NEXT: sub.w $t1, $sp, $a0
+; LA32-NEXT: .cfi_def_cfa 13, 999948288
+; LA32-NEXT: lu12i.w $t2, 16
+; LA32-NEXT: .LBB7_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB7_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: .cfi_def_cfa_register 3
+; LA32-NEXT: lu12i.w $a0, 12
+; LA32-NEXT: ori $a0, $a0, 2576
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: .cfi_def_cfa_offset 1000000016
+; LA32-NEXT: ori $a0, $zero, 3
+; LA32-NEXT: st.b $a0, $sp, 9
+; LA32-NEXT: ld.b $a0, $sp, 9
+; LA32-NEXT: lu12i.w $a1, 244140
+; LA32-NEXT: ori $a1, $a1, 2576
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ret
+entry:
+ %a = alloca i8, i64 1000000007
+ %b = getelementptr inbounds i8, ptr %a, i64 101
+ store volatile i8 3, ptr %a
+ %c = load volatile i8, ptr %a
+ ret i8 %c
+}
+
+; alloca + align < probe_size
+define i32 @f8(i64 %i) local_unnamed_addr #0 {
+;
+; LA64-LABEL: f8:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -896
+; LA64-NEXT: .cfi_def_cfa_offset 896
+; LA64-NEXT: st.d $ra, $sp, 888 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 880 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: addi.d $fp, $sp, 896
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: bstrins.d $sp, $zero, 5, 0
+; LA64-NEXT: slli.d $a0, $a0, 2
+; LA64-NEXT: addi.d $a1, $sp, 64
+; LA64-NEXT: ori $a2, $zero, 1
+; LA64-NEXT: stx.w $a2, $a0, $a1
+; LA64-NEXT: ld.w $a0, $sp, 64
+; LA64-NEXT: addi.d $sp, $fp, -896
+; LA64-NEXT: ld.d $fp, $sp, 880 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 888 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 896
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f8:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -832
+; LA32-NEXT: .cfi_def_cfa_offset 832
+; LA32-NEXT: st.w $ra, $sp, 828 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 824 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: addi.w $fp, $sp, 832
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: bstrins.w $sp, $zero, 5, 0
+; LA32-NEXT: slli.w $a0, $a0, 2
+; LA32-NEXT: addi.w $a1, $sp, 0
+; LA32-NEXT: add.w $a0, $a1, $a0
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ld.w $a0, $sp, 0
+; LA32-NEXT: addi.w $sp, $fp, -832
+; LA32-NEXT: ld.w $fp, $sp, 824 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 828 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 832
+; LA32-NEXT: ret
+ %a = alloca i32, i32 200, align 64
+ %b = getelementptr inbounds i32, ptr %a, i64 %i
+ store volatile i32 1, ptr %b
+ %c = load volatile i32, ptr %a
+ ret i32 %c
+}
+
+; alloca > probe_size, align < probe_size
+define i32 @f9(i64 %i) local_unnamed_addr #0 {
+;
+; LA64-LABEL: f9:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -2032
+; LA64-NEXT: .cfi_def_cfa_offset 2032
+; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: addi.d $fp, $sp, 2032
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: sub.d $sp, $sp, $a1
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: sub.d $sp, $sp, $a1
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: bstrins.d $sp, $zero, 10, 0
+; LA64-NEXT: slli.d $a0, $a0, 2
+; LA64-NEXT: ori $a1, $zero, 2048
+; LA64-NEXT: add.d $a1, $sp, $a1
+; LA64-NEXT: ori $a2, $zero, 1
+; LA64-NEXT: stx.w $a2, $a0, $a1
+; LA64-NEXT: ori $a0, $zero, 2048
+; LA64-NEXT: add.d $a0, $sp, $a0
+; LA64-NEXT: ld.w $a0, $a0, 0
+; LA64-NEXT: lu12i.w $a1, 2
+; LA64-NEXT: ori $a1, $a1, 2048
+; LA64-NEXT: sub.d $sp, $fp, $a1
+; LA64-NEXT: lu12i.w $a1, 2
+; LA64-NEXT: ori $a1, $a1, 16
+; LA64-NEXT: add.d $sp, $sp, $a1
+; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 2032
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f9:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -2032
+; LA32-NEXT: .cfi_def_cfa_offset 2032
+; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: addi.w $fp, $sp, 2032
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: bstrins.w $sp, $zero, 10, 0
+; LA32-NEXT: slli.w $a0, $a0, 2
+; LA32-NEXT: ori $a1, $zero, 2048
+; LA32-NEXT: add.w $a1, $sp, $a1
+; LA32-NEXT: add.w $a0, $a1, $a0
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ori $a0, $zero, 2048
+; LA32-NEXT: add.w $a0, $sp, $a0
+; LA32-NEXT: ld.w $a0, $a0, 0
+; LA32-NEXT: lu12i.w $a1, 2
+; LA32-NEXT: ori $a1, $a1, 2048
+; LA32-NEXT: sub.w $sp, $fp, $a1
+; LA32-NEXT: lu12i.w $a1, 2
+; LA32-NEXT: ori $a1, $a1, 16
+; LA32-NEXT: add.w $sp, $sp, $a1
+; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 2032
+; LA32-NEXT: ret
+ %a = alloca i32, i32 2000, align 2048
+ %b = getelementptr inbounds i32, ptr %a, i64 %i
+ store volatile i32 1, ptr %b
+ %c = load volatile i32, ptr %a
+ ret i32 %c
+}
+
+; alloca < probe_size, align < probe_size, alloca + align > probe_size
+define i32 @f10(i64 %i) local_unnamed_addr #0 {
+;
+; LA64-LABEL: f10:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -2032
+; LA64-NEXT: .cfi_def_cfa_offset 2032
+; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: addi.d $fp, $sp, 2032
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: addi.d $sp, $sp, -2048
+; LA64-NEXT: addi.d $sp, $sp, -1040
+; LA64-NEXT: bstrins.d $sp, $zero, 9, 0
+; LA64-NEXT: slli.d $a0, $a0, 2
+; LA64-NEXT: addi.d $a1, $sp, 1024
+; LA64-NEXT: ori $a2, $zero, 1
+; LA64-NEXT: stx.w $a2, $a0, $a1
+; LA64-NEXT: ld.w $a0, $sp, 1024
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: ori $a1, $a1, 1024
+; LA64-NEXT: sub.d $sp, $fp, $a1
+; LA64-NEXT: addi.d $sp, $sp, 2032
+; LA64-NEXT: addi.d $sp, $sp, 1056
+; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 2032
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f10:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -2032
+; LA32-NEXT: .cfi_def_cfa_offset 2032
+; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: addi.w $fp, $sp, 2032
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: addi.w $sp, $sp, -2048
+; LA32-NEXT: addi.w $sp, $sp, -1040
+; LA32-NEXT: bstrins.w $sp, $zero, 9, 0
+; LA32-NEXT: slli.w $a0, $a0, 2
+; LA32-NEXT: addi.w $a1, $sp, 1024
+; LA32-NEXT: add.w $a0, $a1, $a0
+; LA32-NEXT: ori $a1, $zero, 1
+; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: ld.w $a0, $sp, 1024
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: ori $a1, $a1, 1024
+; LA32-NEXT: sub.w $sp, $fp, $a1
+; LA32-NEXT: addi.w $sp, $sp, 2032
+; LA32-NEXT: addi.w $sp, $sp, 1056
+; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 2032
+; LA32-NEXT: ret
+ %a = alloca i32, i32 1000, align 1024
+ %b = getelementptr inbounds i32, ptr %a, i64 %i
+ store volatile i32 1, ptr %b
+ %c = load volatile i32, ptr %a
+ ret i32 %c
+}
+
+define void @f11(i32 %vla_size, i64 %i) #0 {
+;
+; LA64-LABEL: f11:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -2032
+; LA64-NEXT: .cfi_def_cfa_offset 2032
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s8, $sp, 2008 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: .cfi_offset 31, -24
+; LA64-NEXT: addi.d $fp, $sp, 2032
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: lu12i.w $a2, 15
+; LA64-NEXT: sub.d $t1, $sp, $a2
+; LA64-NEXT: lu12i.w $t2, 1
+; LA64-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB11_1
+; LA64-NEXT: # %bb.2:
+; LA64-NEXT: addi.d $sp, $sp, -2048
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bstrins.d $sp, $zero, 14, 0
+; LA64-NEXT: move $s8, $sp
+; LA64-NEXT: slli.d $a1, $a1, 2
+; LA64-NEXT: lu12i.w $a2, 8
+; LA64-NEXT: add.d $a2, $s8, $a2
+; LA64-NEXT: ori $a3, $zero, 1
+; LA64-NEXT: stx.w $a3, $a1, $a2
+; LA64-NEXT: bstrpick.d $a0, $a0, 31, 0
+; LA64-NEXT: addi.d $a0, $a0, 15
+; LA64-NEXT: bstrpick.d $a0, $a0, 32, 4
+; LA64-NEXT: slli.d $a0, $a0, 4
+; LA64-NEXT: sub.d $a0, $sp, $a0
+; LA64-NEXT: bstrins.d $a0, $zero, 10, 0
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $a1
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bltu $a0, $sp, .LBB11_3
+; LA64-NEXT: # %bb.4:
+; LA64-NEXT: move $sp, $a0
+; LA64-NEXT: ld.b $zero, $a0, 0
+; LA64-NEXT: lu12i.w $a0, 16
+; LA64-NEXT: sub.d $sp, $fp, $a0
+; LA64-NEXT: lu12i.w $a0, 15
+; LA64-NEXT: ori $a0, $a0, 2064
+; LA64-NEXT: add.d $sp, $sp, $a0
+; LA64-NEXT: ld.d $s8, $sp, 2008 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 2032
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f11:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -2032
+; LA32-NEXT: .cfi_def_cfa_offset 2032
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s8, $sp, 2020 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: .cfi_offset 31, -12
+; LA32-NEXT: addi.w $fp, $sp, 2032
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: lu12i.w $a2, 15
+; LA32-NEXT: sub.w $t1, $sp, $a2
+; LA32-NEXT: lu12i.w $t2, 1
+; LA32-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB11_1
+; LA32-NEXT: # %bb.2:
+; LA32-NEXT: addi.w $sp, $sp, -2048
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bstrins.w $sp, $zero, 14, 0
+; LA32-NEXT: move $s8, $sp
+; LA32-NEXT: slli.w $a1, $a1, 2
+; LA32-NEXT: lu12i.w $a2, 8
+; LA32-NEXT: add.w $a2, $s8, $a2
+; LA32-NEXT: add.w $a1, $a2, $a1
+; LA32-NEXT: ori $a2, $zero, 1
+; LA32-NEXT: st.w $a2, $a1, 0
+; LA32-NEXT: addi.w $a0, $a0, 15
+; LA32-NEXT: addi.w $a1, $zero, -16
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: sub.w $a0, $sp, $a0
+; LA32-NEXT: addi.w $a1, $zero, -2048
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bltu $a0, $sp, .LBB11_3
+; LA32-NEXT: # %bb.4:
+; LA32-NEXT: move $sp, $a0
+; LA32-NEXT: ld.b $zero, $a0, 0
+; LA32-NEXT: lu12i.w $a0, 16
+; LA32-NEXT: sub.w $sp, $fp, $a0
+; LA32-NEXT: lu12i.w $a0, 15
+; LA32-NEXT: ori $a0, $a0, 2064
+; LA32-NEXT: add.w $sp, $sp, $a0
+; LA32-NEXT: ld.w $s8, $sp, 2020 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 2032
+; LA32-NEXT: ret
+ %a = alloca i32, i32 4096, align 32768
+ %b = getelementptr inbounds i32, ptr %a, i64 %i
+ store volatile i32 1, ptr %b
+ %1 = zext i32 %vla_size to i64
+ %vla = alloca i8, i64 %1, align 2048
+ %2 = load volatile i8, ptr %vla, align 2048
+ ret void
+}
+
+attributes #0 = { "probe-stack"="inline-asm" }
diff --git a/llvm/test/CodeGen/LoongArch/stack-probing-dynamic-nonentry.ll b/llvm/test/CodeGen/LoongArch/stack-probing-dynamic-nonentry.ll
new file mode 100644
index 0000000000000..9f6f0d5a53295
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/stack-probing-dynamic-nonentry.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=loongarch64 -O2 < %s -verify-machineinstrs | FileCheck %s -check-prefix=LA64
+; RUN: llc -mtriple=loongarch32 -O2 < %s -verify-machineinstrs | FileCheck %s -check-prefix=LA32
+
+; Test that very large outgoing call frames in functions with variable-sized
+; objects get proper stack probing. The outgoing args are large enough to force
+; the PROBED_STACKALLOC path, which must be expanded in a non-entry block.
+
+define void @f(i64 %n) #0 {
+; LA64-LABEL: f:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: .cfi_def_cfa_offset 16
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: addi.d $fp, $sp, 16
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: slli.d $a0, $a0, 2
+; LA64-NEXT: addi.d $a0, $a0, 15
+; LA64-NEXT: bstrins.d $a0, $zero, 3, 0
+; LA64-NEXT: sub.d $a0, $sp, $a0
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: .LBB0_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $a1
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bltu $a0, $sp, .LBB0_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: move $sp, $a0
+; LA64-NEXT: lu12i.w $a1, 5
+; LA64-NEXT: sub.d $t1, $sp, $a1
+; LA64-NEXT: lu12i.w $t2, 1
+; LA64-NEXT: .LBB0_3: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $t2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bne $sp, $t1, .LBB0_3
+; LA64-NEXT: # %bb.4: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -2048
+; LA64-NEXT: addi.d $sp, $sp, -1424
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: pcaddu18i $ra, %call36(g)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: lu12i.w $a0, 5
+; LA64-NEXT: ori $a0, $a0, 3472
+; LA64-NEXT: add.d $sp, $sp, $a0
+; LA64-NEXT: addi.d $sp, $fp, -16
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32-LABEL: f:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: .cfi_def_cfa_offset 16
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: addi.w $fp, $sp, 16
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: slli.w $a0, $a0, 2
+; LA32-NEXT: addi.w $a0, $a0, 15
+; LA32-NEXT: addi.w $a1, $zero, -16
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: sub.w $a0, $sp, $a0
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: .LBB0_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bltu $a0, $sp, .LBB0_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: move $sp, $a0
+; LA32-NEXT: lu12i.w $a1, 5
+; LA32-NEXT: sub.w $t1, $sp, $a1
+; LA32-NEXT: lu12i.w $t2, 1
+; LA32-NEXT: .LBB0_3: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $t2
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bne $sp, $t1, .LBB0_3
+; LA32-NEXT: # %bb.4: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -2048
+; LA32-NEXT: addi.w $sp, $sp, -1456
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bl g
+; LA32-NEXT: lu12i.w $a0, 5
+; LA32-NEXT: ori $a0, $a0, 3504
+; LA32-NEXT: add.w $sp, $sp, $a0
+; LA32-NEXT: addi.w $sp, $fp, -16
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+entry:
+ %v = alloca i32, i64 %n
+ call void @g(ptr %v, [3000 x i64] poison)
+ ret void
+}
+
+declare void @g(ptr, [3000 x i64])
+
+attributes #0 = { "probe-stack"="inline-asm" }
diff --git a/llvm/test/CodeGen/LoongArch/stack-probing-dynamic.ll b/llvm/test/CodeGen/LoongArch/stack-probing-dynamic.ll
new file mode 100644
index 0000000000000..fd928fe826340
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/stack-probing-dynamic.ll
@@ -0,0 +1,479 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=loongarch64 -O2 < %s | FileCheck %s -check-prefix=LA64
+; RUN: llc -mtriple=loongarch32 -O2 < %s | FileCheck %s -check-prefix=LA32
+
+; From llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
+
+; Dynamically-sized allocation, needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack. Note that this final iteration's probe may land below the target SP.
+define void @dynamic(i64 %size, ptr %out) #0 {
+;
+; LA64-LABEL: dynamic:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: .cfi_def_cfa_offset 16
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: addi.d $fp, $sp, 16
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: addi.d $a0, $a0, 15
+; LA64-NEXT: bstrins.d $a0, $zero, 3, 0
+; LA64-NEXT: sub.d $a0, $sp, $a0
+; LA64-NEXT: lu12i.w $a2, 1
+; LA64-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $a2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bltu $a0, $sp, .LBB0_1
+; LA64-NEXT: # %bb.2:
+; LA64-NEXT: move $sp, $a0
+; LA64-NEXT: st.d $a0, $a1, 0
+; LA64-NEXT: addi.d $sp, $fp, -16
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32-LABEL: dynamic:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: .cfi_def_cfa_offset 16
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: addi.w $fp, $sp, 16
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: addi.w $a0, $a0, 15
+; LA32-NEXT: addi.w $a1, $zero, -16
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: sub.w $a0, $sp, $a0
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bltu $a0, $sp, .LBB0_1
+; LA32-NEXT: # %bb.2:
+; LA32-NEXT: move $sp, $a0
+; LA32-NEXT: st.w $a0, $a2, 0
+; LA32-NEXT: addi.w $sp, $fp, -16
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+ %v = alloca i8, i64 %size, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed-size
+; slot isn't large enough that we would normally probe it, but we need to do so
+; here; otherwise the gap between the CSR save and the first probe of the
+; dynamic allocation could be too large when the size of the dynamic
+; allocation is close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+;
+; LA64-LABEL: dynamic_fixed:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -96
+; LA64-NEXT: .cfi_def_cfa_offset 96
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: addi.d $fp, $sp, 96
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: addi.d $a3, $fp, -88
+; LA64-NEXT: st.d $a3, $a1, 0
+; LA64-NEXT: addi.d $a0, $a0, 15
+; LA64-NEXT: bstrins.d $a0, $zero, 3, 0
+; LA64-NEXT: sub.d $a0, $sp, $a0
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $a1
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bltu $a0, $sp, .LBB1_1
+; LA64-NEXT: # %bb.2:
+; LA64-NEXT: move $sp, $a0
+; LA64-NEXT: st.d $a0, $a2, 0
+; LA64-NEXT: addi.d $sp, $fp, -96
+; LA64-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 96
+; LA64-NEXT: ret
+;
+; LA32-LABEL: dynamic_fixed:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: .cfi_def_cfa_offset 80
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: addi.w $fp, $sp, 80
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: addi.w $a1, $fp, -72
+; LA32-NEXT: st.w $a1, $a2, 0
+; LA32-NEXT: addi.w $a0, $a0, 15
+; LA32-NEXT: addi.w $a1, $zero, -16
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: sub.w $a0, $sp, $a0
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bltu $a0, $sp, .LBB1_1
+; LA32-NEXT: # %bb.2:
+; LA32-NEXT: move $sp, $a0
+; LA32-NEXT: st.w $a0, $a3, 0
+; LA32-NEXT: addi.w $sp, $fp, -80
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+ %v1 = alloca i8, i64 64, align 1
+ store ptr %v1, ptr %out1, align 8
+ %v2 = alloca i8, i64 %size, align 1
+ store ptr %v2, ptr %out2, align 8
+ ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by clearing the low bits of the target SP to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the prolog,
+; which isn't actually needed because the only aligned allocations are dynamic;
+; this is done even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+;
+; LA64-LABEL: dynamic_align_64:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -128
+; LA64-NEXT: .cfi_def_cfa_offset 128
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s8, $sp, 104 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: .cfi_offset 31, -24
+; LA64-NEXT: addi.d $fp, $sp, 128
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: bstrins.d $sp, $zero, 5, 0
+; LA64-NEXT: move $s8, $sp
+; LA64-NEXT: addi.d $a0, $a0, 15
+; LA64-NEXT: bstrins.d $a0, $zero, 3, 0
+; LA64-NEXT: sub.d $a0, $sp, $a0
+; LA64-NEXT: bstrins.d $a0, $zero, 5, 0
+; LA64-NEXT: lu12i.w $a2, 1
+; LA64-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $a2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bltu $a0, $sp, .LBB2_1
+; LA64-NEXT: # %bb.2:
+; LA64-NEXT: move $sp, $a0
+; LA64-NEXT: st.d $a0, $a1, 0
+; LA64-NEXT: addi.d $sp, $fp, -128
+; LA64-NEXT: ld.d $s8, $sp, 104 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 128
+; LA64-NEXT: ret
+;
+; LA32-LABEL: dynamic_align_64:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -64
+; LA32-NEXT: .cfi_def_cfa_offset 64
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: st.w $ra, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s8, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: .cfi_offset 31, -12
+; LA32-NEXT: addi.w $fp, $sp, 64
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: bstrins.w $sp, $zero, 5, 0
+; LA32-NEXT: move $s8, $sp
+; LA32-NEXT: addi.w $a0, $a0, 15
+; LA32-NEXT: addi.w $a1, $zero, -16
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: sub.w $a0, $sp, $a0
+; LA32-NEXT: addi.w $a1, $zero, -64
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bltu $a0, $sp, .LBB2_1
+; LA32-NEXT: # %bb.2:
+; LA32-NEXT: move $sp, $a0
+; LA32-NEXT: st.w $a0, $a2, 0
+; LA32-NEXT: addi.w $sp, $fp, -64
+; LA32-NEXT: ld.w $s8, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 64
+; LA32-NEXT: ret
+ %v = alloca i8, i64 %size, align 64
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference to the dynamic allocation is the constant used for aligning
+; the target SP; the loop will probe the whole allocation without needing to
+; know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+;
+; LA64-LABEL: dynamic_align_8192:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -2032
+; LA64-NEXT: .cfi_def_cfa_offset 2032
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s8, $sp, 2008 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: .cfi_offset 31, -24
+; LA64-NEXT: addi.d $fp, $sp, 2032
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: lu12i.w $a2, 1
+; LA64-NEXT: sub.d $sp, $sp, $a2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: sub.d $sp, $sp, $a2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: sub.d $sp, $sp, $a2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: addi.d $sp, $sp, -2048
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bstrins.d $sp, $zero, 12, 0
+; LA64-NEXT: move $s8, $sp
+; LA64-NEXT: addi.d $a0, $a0, 15
+; LA64-NEXT: bstrins.d $a0, $zero, 3, 0
+; LA64-NEXT: sub.d $a0, $sp, $a0
+; LA64-NEXT: bstrins.d $a0, $zero, 12, 0
+; LA64-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $a2
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bltu $a0, $sp, .LBB3_1
+; LA64-NEXT: # %bb.2:
+; LA64-NEXT: move $sp, $a0
+; LA64-NEXT: st.d $a0, $a1, 0
+; LA64-NEXT: lu12i.w $a0, 4
+; LA64-NEXT: sub.d $sp, $fp, $a0
+; LA64-NEXT: lu12i.w $a0, 3
+; LA64-NEXT: ori $a0, $a0, 2064
+; LA64-NEXT: add.d $sp, $sp, $a0
+; LA64-NEXT: ld.d $s8, $sp, 2008 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 2032
+; LA64-NEXT: ret
+;
+; LA32-LABEL: dynamic_align_8192:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -2032
+; LA32-NEXT: .cfi_def_cfa_offset 2032
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s8, $sp, 2020 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: .cfi_offset 31, -12
+; LA32-NEXT: addi.w $fp, $sp, 2032
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: addi.w $sp, $sp, -2048
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bstrins.w $sp, $zero, 12, 0
+; LA32-NEXT: move $s8, $sp
+; LA32-NEXT: addi.w $a0, $a0, 15
+; LA32-NEXT: addi.w $a1, $zero, -16
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: sub.w $a0, $sp, $a0
+; LA32-NEXT: lu12i.w $a1, -2
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bltu $a0, $sp, .LBB3_1
+; LA32-NEXT: # %bb.2:
+; LA32-NEXT: move $sp, $a0
+; LA32-NEXT: st.w $a0, $a2, 0
+; LA32-NEXT: lu12i.w $a0, 4
+; LA32-NEXT: sub.w $sp, $fp, $a0
+; LA32-NEXT: lu12i.w $a0, 3
+; LA32-NEXT: ori $a0, $a0, 2064
+; LA32-NEXT: add.w $sp, $sp, $a0
+; LA32-NEXT: ld.w $s8, $sp, 2020 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 2032
+; LA32-NEXT: ret
+ %v = alloca i8, i64 %size, align 8192
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame.
+define void @no_reserved_call_frame(i64 %n) #0 {
+;
+; LA64-LABEL: no_reserved_call_frame:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: .cfi_def_cfa_offset 16
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: .cfi_offset 22, -16
+; LA64-NEXT: addi.d $fp, $sp, 16
+; LA64-NEXT: .cfi_def_cfa 22, 0
+; LA64-NEXT: slli.d $a0, $a0, 2
+; LA64-NEXT: addi.d $a0, $a0, 15
+; LA64-NEXT: bstrins.d $a0, $zero, 3, 0
+; LA64-NEXT: sub.d $a0, $sp, $a0
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: .LBB4_1: # %entry
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: sub.d $sp, $sp, $a1
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: bltu $a0, $sp, .LBB4_1
+; LA64-NEXT: # %bb.2: # %entry
+; LA64-NEXT: move $sp, $a0
+; LA64-NEXT: lu12i.w $a1, 1
+; LA64-NEXT: sub.d $sp, $sp, $a1
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: pcaddu18i $ra, %call36(callee_stack_args)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: lu12i.w $a0, 1
+; LA64-NEXT: add.d $sp, $sp, $a0
+; LA64-NEXT: addi.d $sp, $fp, -16
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32-LABEL: no_reserved_call_frame:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: .cfi_def_cfa_offset 16
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: .cfi_offset 22, -8
+; LA32-NEXT: addi.w $fp, $sp, 16
+; LA32-NEXT: .cfi_def_cfa 22, 0
+; LA32-NEXT: slli.w $a0, $a0, 2
+; LA32-NEXT: addi.w $a0, $a0, 15
+; LA32-NEXT: addi.w $a1, $zero, -16
+; LA32-NEXT: and $a0, $a0, $a1
+; LA32-NEXT: sub.w $a0, $sp, $a0
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: .LBB4_1: # %entry
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bltu $a0, $sp, .LBB4_1
+; LA32-NEXT: # %bb.2: # %entry
+; LA32-NEXT: move $sp, $a0
+; LA32-NEXT: lu12i.w $a1, 1
+; LA32-NEXT: sub.w $sp, $sp, $a1
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: addi.w $sp, $sp, -32
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: bl callee_stack_args
+; LA32-NEXT: lu12i.w $a0, 1
+; LA32-NEXT: ori $a0, $a0, 32
+; LA32-NEXT: add.w $sp, $sp, $a0
+; LA32-NEXT: addi.w $sp, $fp, -16
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+entry:
+ %v = alloca i32, i64 %n
+ call void @callee_stack_args(ptr %v, [518 x i64] poison)
+ ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n) #0 {
+;
+; LA64-LABEL: reserved_call_frame:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -2032
+; LA64-NEXT: .cfi_def_cfa_offset 2032
+; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill
+; LA64-NEXT: .cfi_offset 1, -8
+; LA64-NEXT: lu12i.w $a0, 1
+; LA64-NEXT: sub.d $sp, $sp, $a0
+; LA64-NEXT: st.d $zero, $sp, 0
+; LA64-NEXT: .cfi_def_cfa_offset 6128
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: .cfi_def_cfa_offset 6176
+; LA64-NEXT: lu12i.w $a0, 1
+; LA64-NEXT: add.d $a0, $sp, $a0
+; LA64-NEXT: pcaddu18i $ra, %call36(callee_stack_args)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: lu12i.w $a0, 1
+; LA64-NEXT: ori $a0, $a0, 48
+; LA64-NEXT: add.d $sp, $sp, $a0
+; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 2032
+; LA64-NEXT: ret
+;
+; LA32-LABEL: reserved_call_frame:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -2032
+; LA32-NEXT: .cfi_def_cfa_offset 2032
+; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill
+; LA32-NEXT: .cfi_offset 1, -4
+; LA32-NEXT: lu12i.w $a0, 1
+; LA32-NEXT: sub.w $sp, $sp, $a0
+; LA32-NEXT: st.w $zero, $sp, 0
+; LA32-NEXT: .cfi_def_cfa_offset 6128
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: .cfi_def_cfa_offset 6208
+; LA32-NEXT: lu12i.w $a0, 1
+; LA32-NEXT: ori $a0, $a0, 36
+; LA32-NEXT: add.w $a0, $sp, $a0
+; LA32-NEXT: bl callee_stack_args
+; LA32-NEXT: lu12i.w $a0, 1
+; LA32-NEXT: ori $a0, $a0, 80
+; LA32-NEXT: add.w $sp, $sp, $a0
+; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 2032
+; LA32-NEXT: ret
+entry:
+ %v = alloca i32, i64 518
+ call void @callee_stack_args(ptr %v, [518 x i64] poison)
+ ret void
+}
+
+declare void @callee_stack_args(ptr, [518 x i64])
+
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
diff --git a/llvm/test/CodeGen/LoongArch/stack-probing-frame-setup.mir b/llvm/test/CodeGen/LoongArch/stack-probing-frame-setup.mir
new file mode 100644
index 0000000000000..465ddd012b134
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/stack-probing-frame-setup.mir
@@ -0,0 +1,185 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=loongarch64 -x mir -run-pass=prologepilog -verify-machineinstrs < %s | FileCheck %s --check-prefix=LA64
+# RUN: llc -mtriple=loongarch32 -x mir -run-pass=prologepilog -verify-machineinstrs < %s | FileCheck %s --check-prefix=LA32
+
+--- |
+ ; Function Attrs: uwtable
+ define void @no_reserved_call_frame(i64 %n) #0 {
+ entry:
+ %v = alloca i32, i64 %n, align 4
+ call void @callee_stack_args(ptr %v, [518 x i64] poison)
+ ret void
+ }
+
+ declare void @callee_stack_args(ptr, [518 x i64])
+
+ attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" }
+...
+---
+name: no_reserved_call_frame
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+noPhis: true
+isSSA: false
+noVRegs: true
+hasFakeUses: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: true
+registers: []
+liveins:
+ - { reg: '$r4', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 8
+ adjustsStack: true
+ hasCalls: true
+ framePointerPolicy: none
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+fixedStack: []
+stack:
+ - { id: 0, name: v, type: variable-sized, offset: 0, alignment: 1, stack-id: default,
+ callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '',
+ debug-info-expression: '', debug-info-location: '' }
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; LA64-LABEL: name: no_reserved_call_frame
+ ; LA64: bb.0.entry:
+ ; LA64-NEXT: successors: %bb.1(0x80000000)
+ ; LA64-NEXT: liveins: $r4, $r1
+ ; LA64-NEXT: {{ $}}
+ ; LA64-NEXT: $r3 = frame-setup ADDI_D $r3, -16
+ ; LA64-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+ ; LA64-NEXT: ST_D killed $r1, $r3, 8 :: (store (s64) into %stack.1)
+ ; LA64-NEXT: ST_D killed $r22, $r3, 0 :: (store (s64) into %stack.2)
+ ; LA64-NEXT: frame-setup CFI_INSTRUCTION offset $r1, -8
+ ; LA64-NEXT: frame-setup CFI_INSTRUCTION offset $r22, -16
+ ; LA64-NEXT: $r22 = frame-setup ADDI_D $r3, 16
+ ; LA64-NEXT: frame-setup CFI_INSTRUCTION def_cfa $r22, 0
+ ; LA64-NEXT: renamable $r4 = SLLI_D killed renamable $r4, 2
+ ; LA64-NEXT: renamable $r4 = nuw ADDI_D killed renamable $r4, 15
+ ; LA64-NEXT: renamable $r4 = BSTRINS_D killed renamable $r4, $r0, 3, 0
+ ; LA64-NEXT: renamable $r4 = SUB_D $r3, killed renamable $r4
+ ; LA64-NEXT: renamable $r5 = LU12I_W 1
+ ; LA64-NEXT: {{ $}}
+ ; LA64-NEXT: bb.1.entry:
+ ; LA64-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; LA64-NEXT: liveins: $r4, $r5
+ ; LA64-NEXT: {{ $}}
+ ; LA64-NEXT: $r3 = SUB_D $r3, renamable $r5
+ ; LA64-NEXT: ST_D $r0, $r3, 0
+ ; LA64-NEXT: BLTU renamable $r4, $r3, %bb.1
+ ; LA64-NEXT: {{ $}}
+ ; LA64-NEXT: bb.2.entry:
+ ; LA64-NEXT: liveins: $r4
+ ; LA64-NEXT: {{ $}}
+ ; LA64-NEXT: $r3 = OR renamable $r4, $r0
+ ; LA64-NEXT: $r5 = LU12I_W 1
+ ; LA64-NEXT: $r3 = SUB_D $r3, killed $r5
+ ; LA64-NEXT: PseudoCALL_MEDIUM target-flags(loongarch-call-plt) @callee_stack_args, csr_ilp32d_lp64d, implicit-def dead $r1, implicit-def dead $r20, implicit $r4, implicit undef $r5, implicit undef $r6, implicit undef $r7, implicit undef $r8, implicit undef $r9, implicit undef $r10, implicit undef $r11, implicit-def $r3
+ ; LA64-NEXT: $r4 = LU12I_W 1
+ ; LA64-NEXT: $r3 = ADD_D $r3, killed $r4
+ ; LA64-NEXT: $r3 = frame-destroy ADDI_D $r22, -16
+ ; LA64-NEXT: $r22 = LD_D $r3, 0 :: (load (s64) from %stack.2)
+ ; LA64-NEXT: $r1 = LD_D $r3, 8 :: (load (s64) from %stack.1)
+ ; LA64-NEXT: $r3 = frame-destroy ADDI_D $r3, 16
+ ; LA64-NEXT: PseudoRET
+ ;
+ ; LA32-LABEL: name: no_reserved_call_frame
+ ; LA32: bb.0.entry:
+ ; LA32-NEXT: successors: %bb.1(0x80000000)
+ ; LA32-NEXT: liveins: $r4, $r1
+ ; LA32-NEXT: {{ $}}
+ ; LA32-NEXT: $r3 = frame-setup ADDI_W $r3, -16
+ ; LA32-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+ ; LA32-NEXT: ST_W killed $r1, $r3, 12 :: (store (s32) into %stack.1)
+ ; LA32-NEXT: ST_W killed $r22, $r3, 8 :: (store (s32) into %stack.2)
+ ; LA32-NEXT: frame-setup CFI_INSTRUCTION offset $r1, -4
+ ; LA32-NEXT: frame-setup CFI_INSTRUCTION offset $r22, -8
+ ; LA32-NEXT: $r22 = frame-setup ADDI_W $r3, 16
+ ; LA32-NEXT: frame-setup CFI_INSTRUCTION def_cfa $r22, 0
+ ; LA32-NEXT: renamable $r4 = SLLI_D killed renamable $r4, 2
+ ; LA32-NEXT: renamable $r4 = nuw ADDI_D killed renamable $r4, 15
+ ; LA32-NEXT: renamable $r4 = BSTRINS_D killed renamable $r4, $r0, 3, 0
+ ; LA32-NEXT: renamable $r4 = SUB_D $r3, killed renamable $r4
+ ; LA32-NEXT: renamable $r5 = LU12I_W 1
+ ; LA32-NEXT: {{ $}}
+ ; LA32-NEXT: bb.1.entry:
+ ; LA32-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; LA32-NEXT: liveins: $r4, $r5
+ ; LA32-NEXT: {{ $}}
+ ; LA32-NEXT: $r3 = SUB_D $r3, renamable $r5
+ ; LA32-NEXT: ST_D $r0, $r3, 0
+ ; LA32-NEXT: BLTU renamable $r4, $r3, %bb.1
+ ; LA32-NEXT: {{ $}}
+ ; LA32-NEXT: bb.2.entry:
+ ; LA32-NEXT: liveins: $r4
+ ; LA32-NEXT: {{ $}}
+ ; LA32-NEXT: $r3 = OR renamable $r4, $r0
+ ; LA32-NEXT: $r5 = LU12I_W 1
+ ; LA32-NEXT: $r3 = SUB_W $r3, killed $r5
+ ; LA32-NEXT: PseudoCALL_MEDIUM target-flags(loongarch-call-plt) @callee_stack_args, csr_ilp32d_lp64d, implicit-def dead $r1, implicit-def dead $r20, implicit $r4, implicit undef $r5, implicit undef $r6, implicit undef $r7, implicit undef $r8, implicit undef $r9, implicit undef $r10, implicit undef $r11, implicit-def $r3
+ ; LA32-NEXT: $r4 = LU12I_W 1
+ ; LA32-NEXT: $r3 = ADD_W $r3, killed $r4
+ ; LA32-NEXT: $r3 = frame-destroy ADDI_W $r22, -16
+ ; LA32-NEXT: $r22 = LD_W $r3, 8 :: (load (s32) from %stack.2)
+ ; LA32-NEXT: $r1 = LD_W $r3, 12 :: (load (s32) from %stack.1)
+ ; LA32-NEXT: $r3 = frame-destroy ADDI_W $r3, 16
+ ; LA32-NEXT: PseudoRET
+ bb.0.entry:
+ successors: %bb.1(0x80000000)
+ liveins: $r4
+
+ renamable $r4 = SLLI_D killed renamable $r4, 2
+ renamable $r4 = nuw ADDI_D killed renamable $r4, 15
+ renamable $r4 = BSTRINS_D killed renamable $r4, $r0, 3, 0
+ renamable $r4 = SUB_D $r3, killed renamable $r4
+ renamable $r5 = LU12I_W 1
+
+ bb.1.entry:
+ successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ liveins: $r4, $r5
+
+ $r3 = SUB_D $r3, renamable $r5
+ ST_D $r0, $r3, 0
+ BLTU renamable $r4, $r3, %bb.1
+
+ bb.2.entry:
+ liveins: $r4
+
+ $r3 = OR renamable $r4, $r0
+ ADJCALLSTACKDOWN 4088, 0, implicit-def dead $r3, implicit $r3
+ PseudoCALL_MEDIUM target-flags(loongarch-call-plt) @callee_stack_args, csr_ilp32d_lp64d, implicit-def dead $r1, implicit-def dead $r20, implicit $r4, implicit undef $r5, implicit undef $r6, implicit undef $r7, implicit undef $r8, implicit undef $r9, implicit undef $r10, implicit undef $r11, implicit-def $r3
+ ADJCALLSTACKUP 4088, 0, implicit-def dead $r3, implicit $r3
+ PseudoRET
+...
More information about the cfe-commits
mailing list