[llvm-branch-commits] [llvm] b1806e6 - [AArch64] Stack probing for dynamic allocas in SelectionDAG (#66525)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Dec 4 11:02:27 PST 2023
Author: Momchil Velikov
Date: 2023-12-02T10:09:41Z
New Revision: b1806e6a1f0589acc88499419531c4eb82488f1a
URL: https://github.com/llvm/llvm-project/commit/b1806e6a1f0589acc88499419531c4eb82488f1a
DIFF: https://github.com/llvm/llvm-project/commit/b1806e6a1f0589acc88499419531c4eb82488f1a.diff
LOG: [AArch64] Stack probing for dynamic allocas in SelectionDAG (#66525)
Add support for probing for dynamic allocas (variable-size objects and
outgoing stack arguments).
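A minimal sketch of the kind of input IR that exercises the new lowering (the function name is illustrative; the attribute usage mirrors the tests added below):

  define void @probed_vla(i64 %size, ptr %out) "probe-stack"="inline-asm" {
    %v = alloca i8, i64 %size, align 1   ; variable-size object, probed in a loop
    store ptr %v, ptr %out, align 8
    ret void
  }

Compiled with llc -mtriple aarch64-none-eabi, this should select the new PROBED_ALLOCA node and emit the sub/cmp/str-xzr probing loop checked in stack-probing-dynamic.ll, rather than a single unprobed SP decrement.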
Co-authored-by: Oliver Stannard <oliver.stannard at linaro.org>
Added:
llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
Modified:
llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/AArch64/AArch64InstrInfo.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 96702cb8b255a..e4451a9a85f50 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -479,6 +479,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
/// included as part of the stack frame.
bool
AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ // The stack probing code for the dynamically allocated outgoing arguments
+ // area assumes that the stack is probed at the top - either by the prologue
+ // code, which issues a probe if `hasVarSizedObjects` returns true, or by the
+ // most recent variable-sized object allocation. Changing the condition here
+ // may need to be followed up by changes to the probe issuing logic.
return !MF.getFrameInfo().hasVarSizedObjects();
}
@@ -487,6 +492,9 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const AArch64TargetLowering *TLI =
+ MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
DebugLoc DL = I->getDebugLoc();
unsigned Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
@@ -513,8 +521,24 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
// Most call frames will be allocated at the start of a function so
// this is OK, but it is a limitation that needs dealing with.
assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
- emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(Amount), TII);
+
+ if (TLI->hasInlineStackProbe(MF) &&
+ -Amount >= AArch64::StackProbeMaxUnprobedStack) {
+ // When stack probing is enabled, the decrement of SP may need to be
+ // probed. We only need to do this if the call site needs 1024 bytes of
+ // space or more, because a region smaller than that is allowed to be
+ // unprobed at an ABI boundary. We rely on the fact that SP has been
+ // probed exactly at this point, either by the prologue or most recent
+ // dynamic allocation.
+ assert(MFI.hasVarSizedObjects() &&
+ "non-reserved call frame without var sized objects?");
+ Register ScratchReg =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
+ } else {
+ emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(Amount), TII);
+ }
}
} else if (CalleePopAmount != 0) {
// If the calling convention demands that the callee pops arguments from the
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 011aedeba1eb7..b6a16217dfae3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -569,10 +569,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSHL, MVT::i32, Custom);
setOperationAction(ISD::FSHL, MVT::i64, Custom);
- if (Subtarget->isTargetWindows())
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
- else
- setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
@@ -2353,6 +2350,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::CSINC)
MAKE_CASE(AArch64ISD::THREAD_POINTER)
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+ MAKE_CASE(AArch64ISD::PROBED_ALLOCA)
MAKE_CASE(AArch64ISD::ABDS_PRED)
MAKE_CASE(AArch64ISD::ABDU_PRED)
MAKE_CASE(AArch64ISD::HADDS_PRED)
@@ -2719,6 +2717,22 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction &MF = *MBB->getParent();
+ MachineBasicBlock::iterator MBBI = MI.getIterator();
+ DebugLoc DL = MBB->findDebugLoc(MBBI);
+ const AArch64InstrInfo &TII =
+ *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+ Register TargetReg = MI.getOperand(0).getReg();
+ MachineBasicBlock::iterator NextInst =
+ TII.probedStackAlloc(MBBI, TargetReg, false);
+
+ MI.eraseFromParent();
+ return NextInst->getParent();
+}
+
MachineBasicBlock *
AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
@@ -2863,6 +2877,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
+
+ case AArch64::PROBED_STACKALLOC_DYN:
+ return EmitDynamicProbedAlloc(MI, BB);
+
case AArch64::LD1_MXIPXX_H_PSEUDO_B:
return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
case AArch64::LD1_MXIPXX_H_PSEUDO_H:
@@ -14052,9 +14070,34 @@ SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
AN->getMemOperand());
}
-SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
- SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
+SDValue
+AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+
SDLoc dl(Op);
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+ EVT VT = Node->getValueType(0);
+
+ if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+ "no-stack-arg-probe")) {
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+ SDValue Ops[2] = {SP, Chain};
+ return DAG.getMergeValues(Ops, dl);
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
PtrVT, 0);
@@ -14078,7 +14121,59 @@ SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
DAG.getConstant(4, dl, MVT::i64));
- return Chain;
+
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+ Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
+
+ Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
+
+ SDValue Ops[2] = {SP, Chain};
+ return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+ SDLoc dl(Op);
+ EVT VT = Node->getValueType(0);
+
+ // Construct the new SP value in a GPR.
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
+ Chain = SP.getValue(1);
+ SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
+ if (Align)
+ SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
+
+ // Set the real SP to the new value with a probing loop.
+ Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
+ SDValue Ops[2] = {SP, Chain};
+ return DAG.getMergeValues(Ops, dl);
+}
+
+SDValue
+AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (Subtarget->isTargetWindows())
+ return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
+ else if (hasInlineStackProbe(MF))
+ return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
+ else
+ return SDValue();
}
// When x and y are extended, lower:
@@ -14132,51 +14227,6 @@ SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::ADD, dl, VT, Add, tmp);
}
-SDValue
-AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
- SelectionDAG &DAG) const {
- assert(Subtarget->isTargetWindows() &&
- "Only Windows alloca probing supported");
- SDLoc dl(Op);
- // Get the inputs.
- SDNode *Node = Op.getNode();
- SDValue Chain = Op.getOperand(0);
- SDValue Size = Op.getOperand(1);
- MaybeAlign Align =
- cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
- EVT VT = Node->getValueType(0);
-
- if (DAG.getMachineFunction().getFunction().hasFnAttribute(
- "no-stack-arg-probe")) {
- SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
- Chain = SP.getValue(1);
- SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
- if (Align)
- SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
- Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
- SDValue Ops[2] = {SP, Chain};
- return DAG.getMergeValues(Ops, dl);
- }
-
- Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
-
- Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
-
- SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
- Chain = SP.getValue(1);
- SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
- if (Align)
- SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
- Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
-
- Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
-
- SDValue Ops[2] = {SP, Chain};
- return DAG.getMergeValues(Ops, dl);
-}
-
SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e6d62f1704726..3c8479e1f6e3c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -83,6 +83,10 @@ enum NodeType : unsigned {
ADC,
SBC, // adc, sbc instructions
+ // To avoid stack clash, allocation is performed by block and each block is
+ // probed.
+ PROBED_ALLOCA,
+
// Predicated instructions where inactive lanes produce undefined results.
ABDS_PRED,
ABDU_PRED,
@@ -616,6 +620,9 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitDynamicProbedAlloc(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
MachineInstr &MI,
MachineBasicBlock *BB) const;
@@ -1141,10 +1148,10 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerInlineDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
- SDValue &Size,
- SelectionDAG &DAG) const;
+
SDValue LowerAVG(SDValue Op, SelectionDAG &DAG, unsigned NewOp) const;
SDValue LowerFixedLengthVectorIntDivideToSVE(SDValue Op,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 935667488194a..44b0337fe7879 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -863,6 +863,12 @@ def AArch64stilp : SDNode<"AArch64ISD::STILP", SDT_AArch64stilp, [SDNPHasChain,
def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
+
+def AArch64probedalloca
+ : SDNode<"AArch64ISD::PROBED_ALLOCA",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPMayStore]>;
+
def AArch64mrs : SDNode<"AArch64ISD::MRS",
SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, i32>]>,
[SDNPHasChain, SDNPOutGlue]>;
@@ -963,6 +969,14 @@ def PROBED_STACKALLOC_VAR : Pseudo<(outs),
[]>,
Sched<[]>;
+// Probed stack allocations of a variable size, used for allocas of unknown size
+// when stack-clash protection is enabled.
+let usesCustomInserter = 1 in
+def PROBED_STACKALLOC_DYN : Pseudo<(outs),
+ (ins GPR64common:$target),
+ [(AArch64probedalloca GPR64common:$target)]>,
+ Sched<[]>;
+
} // Defs = [SP, NZCV], Uses = [SP] in
} // hasSideEffects = 1, isCodeGenOnly = 1
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
new file mode 100644
index 0000000000000..96f2f63d703c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic-no-frame-setup.ll
@@ -0,0 +1,14 @@
+; RUN: llc --stop-after=finalize-isel -o - < %s | FileCheck %s
+target triple = "aarch64-linux"
+
+; Check dynamic stack allocation and probing instructions do not have
+; the FrameSetup flag.
+
+; CHECK-NOT: frame-setup
+define void @no_frame_setup(i64 %size, ptr %out) #0 {
+ %v = alloca i8, i64 %size, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
new file mode 100644
index 0000000000000..78b83099c7ed9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll
@@ -0,0 +1,362 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple aarch64-none-eabi < %s -verify-machineinstrs | FileCheck %s
+
+; Dynamically-sized allocation, needs a loop which can handle any size at
+; runtime. The final iteration of the loop will temporarily put SP below the
+; target address, but this doesn't break any of the ABI constraints on the
+; stack, and also doesn't probe below the target SP value.
+define void @dynamic(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x8
+; CHECK-NEXT: b.le .LBB0_3
+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB0_1
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+ %v = alloca i8, i64 %size, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; This function has a fixed-size stack slot and a dynamic one. The fixed size
+; slot isn't large enough that we would normally probe it, but we need to do so
+; here, otherwise the gap between the CSR save and the first probe of the
+; dynamic allocation could be too large when the size of the dynamic
+; allocation is close to the guard size.
+define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
+; CHECK-LABEL: dynamic_fixed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: str xzr, [sp, #-64]!
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: sub x10, x29, #64
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: str x10, [x1]
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x8
+; CHECK-NEXT: b.le .LBB1_3
+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB1_1
+; CHECK-NEXT: .LBB1_3:
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str x8, [x2]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+ %v1 = alloca i8, i64 64, align 1
+ store ptr %v1, ptr %out1, align 8
+ %v2 = alloca i8, i64 %size, align 1
+ store ptr %v2, ptr %out2, align 8
+ ret void
+}
+
+; Dynamic allocation, with an alignment requirement greater than the alignment
+; of SP. Done by ANDing the target SP with a constant to align it down, then
+; doing the loop as normal. Note that we also re-align the stack in the prolog,
+; which isn't actually needed because the only aligned allocations are dynamic;
+; this is done even without stack probing.
+define void @dynamic_align_64(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: sub x9, sp, #32
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0
+; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x8
+; CHECK-NEXT: b.le .LBB2_3
+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB2_1
+; CHECK-NEXT: .LBB2_3:
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+ %v = alloca i8, i64 %size, align 64
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; Dynamic allocation, with an alignment greater than the stack guard size. The
+; only difference to the dynamic allocation is the constant used for aligning
+; the target SP, the loop will probe the whole allocation without needing to
+; know about the alignment padding.
+define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
+; CHECK-LABEL: dynamic_align_8192:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: sub x9, x9, #4064
+; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
+; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x9
+; CHECK-NEXT: b.le .LBB3_3
+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB3_1
+; CHECK-NEXT: .LBB3_3:
+; CHECK-NEXT: mov sp, x9
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: mov x19, sp
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: and x8, x8, #0xffffffffffffe000
+; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x8
+; CHECK-NEXT: b.le .LBB3_6
+; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB3_4
+; CHECK-NEXT: .LBB3_6:
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+ %v = alloca i8, i64 %size, align 8192
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; For 64k guard pages, the only difference is the constant subtracted from SP
+; in the loop.
+define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" {
+; CHECK-LABEL: dynamic_64k_guard:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x9, x0, #15
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
+; CHECK-NEXT: cmp sp, x8
+; CHECK-NEXT: b.le .LBB4_3
+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB4_1
+; CHECK-NEXT: .LBB4_3:
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+ %v = alloca i8, i64 %size, align 1
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+; If a function has variable-sized stack objects, then any function calls which
+; need to pass arguments on the stack must allocate the stack space for them
+; dynamically, to ensure they are at the bottom of the frame. We need to probe
+; that space when it is larger than the unprobed space allowed by the ABI (1024
+; bytes), so this needs a very large number of arguments.
+define void @no_reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: no_reserved_call_frame:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: lsl x9, x0, #2
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: add x9, x9, #15
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x0, x8, x9
+; CHECK-NEXT: .LBB5_1: // %entry
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x0
+; CHECK-NEXT: b.le .LBB5_3
+; CHECK-NEXT: // %bb.2: // %entry
+; CHECK-NEXT: // in Loop: Header=BB5_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB5_1
+; CHECK-NEXT: .LBB5_3: // %entry
+; CHECK-NEXT: mov sp, x0
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1104
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: bl callee_stack_args
+; CHECK-NEXT: add sp, sp, #1104
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 16
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i32, i64 %n
+ call void @callee_stack_args(ptr %v, [138 x i64] undef)
+ ret void
+}
+
+; Same as above but without a variable-sized allocation, so the reserved call
+; frame can be folded into the fixed-size allocation in the prologue.
+define void @reserved_call_frame(i64 %n) #0 {
+; CHECK-LABEL: reserved_call_frame:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w28, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: sub sp, sp, #1504
+; CHECK-NEXT: add x0, sp, #1104
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: bl callee_stack_args
+; CHECK-NEXT: add sp, sp, #1504
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ %v = alloca i32, i64 100
+ call void @callee_stack_args(ptr %v, [138 x i64] undef)
+ ret void
+}
+
+declare void @callee_stack_args(ptr, [138 x i64])
+
+; Dynamic allocation of SVE vectors
+define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
+; CHECK-LABEL: dynamic_sve:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w19, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: rdvl x9, #1
+; CHECK-NEXT: mov x10, #15 // =0xf
+; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: madd x9, x0, x9, x10
+; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK-NEXT: sub x8, x8, x9
+; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
+; CHECK-NEXT: cmp sp, x8
+; CHECK-NEXT: b.le .LBB7_3
+; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: b .LBB7_1
+; CHECK-NEXT: .LBB7_3:
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: str x8, [x1]
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: .cfi_def_cfa wsp, 32
+; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+ %v = alloca <vscale x 4 x float>, i64 %size, align 16
+ store ptr %v, ptr %out, align 8
+ ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }