[llvm] [RISCV] Add stack clash vector support (PR #119458)
Raphael Moreira Zinsly via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 07:40:38 PST 2024
https://github.com/rzinsly updated https://github.com/llvm/llvm-project/pull/119458
>From 68e37b28c712a3ecaf130131ea8e90b263b62d6f Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly <rzinsly at ventanamicro.com>
Date: Tue, 10 Dec 2024 17:50:50 -0300
Subject: [PATCH] [RISCV] Add stack clash vector support
Use the probe loop structure to allocate stack space for vector objects as well.
We add the pseudo instruction RISCV::PROBED_STACKALLOC_RVV to
differentiate from the normal loop.
---
llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 148 +++++--
llvm/lib/Target/RISCV/RISCVFrameLowering.h | 11 +
llvm/lib/Target/RISCV/RISCVInstrInfo.td | 4 +
.../RISCV/rvv/access-fixed-objects-by-rvv.ll | 46 ++
.../CodeGen/RISCV/rvv/stack-probing-rvv.ll | 400 ++++++++++++++++++
5 files changed, 585 insertions(+), 24 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 1028149bf513f4..04f841d589ce89 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -499,6 +499,54 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF,
return PushOrLibCallsCSI;
}
+void RISCVFrameLowering::allocateAndProbeStackForRVV(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount,
+ MachineInstr::MIFlag Flag, bool EmitCFI) const {
+ assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
+
+ // Emit a variable-length allocation probing loop.
+
+ // TargetReg = VLENB * NumOfVReg, i.e. the size in bytes to allocate.
+ const RISCVInstrInfo *TII = STI.getInstrInfo();
+ Register TargetReg = RISCV::X6;
+ uint32_t NumOfVReg = Amount / (RISCV::RVVBitsPerBlock / 8);
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoReadVLENB), TargetReg)
+ .setMIFlag(Flag);
+ TII->mulImm(MF, MBB, MBBI, DL, TargetReg, NumOfVReg, Flag);
+
+ if (EmitCFI) {
+ // Set the CFA register to TargetReg.
+ unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true);
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, -Amount));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
+ // Placeholder pseudo; `inlineStackProbe` expands it into the probe loop.
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::PROBED_STACKALLOC_RVV))
+ .addReg(SPReg)
+ .addReg(TargetReg);
+
+ if (EmitCFI) {
+ // Set the CFA register back to SP.
+ unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true);
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+
+ // SUB SP, SP, TargetReg: apply what the probe loop left in TargetReg.
+ BuildMI(MBB, MBBI, DL, TII->get(RISCV::SUB), SPReg)
+ .addReg(SPReg)
+ .addReg(TargetReg)
+ .setMIFlag(Flag);
+}
+
static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
SmallVectorImpl<char> &Expr,
int FixedOffset, int ScalableOffset,
@@ -857,10 +905,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ uint64_t SecondSPAdjustAmount = 0;
// Emit the second SP adjustment after saving callee saved registers.
if (FirstSPAdjustAmount) {
- uint64_t SecondSPAdjustAmount =
- getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
+ SecondSPAdjustAmount = getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
assert(SecondSPAdjustAmount > 0 &&
"SecondSPAdjustAmount should be greater than zero");
@@ -870,11 +918,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
}
if (RVVStackSize) {
- // We must keep the stack pointer aligned through any intermediate
- // updates.
- RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
- StackOffset::getScalable(-RVVStackSize),
- MachineInstr::FrameSetup, getStackAlign());
+ if (NeedProbe)
+ allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
+ MachineInstr::FrameSetup, !hasFP(MF));
+ else
+ // We must keep the stack pointer aligned through any intermediate
+ // updates.
+ RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
+ StackOffset::getScalable(-RVVStackSize),
+ MachineInstr::FrameSetup, getStackAlign());
if (!hasFP(MF)) {
// Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
@@ -914,6 +966,19 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
.addImm(ShiftAmount)
.setMIFlag(MachineInstr::FrameSetup);
}
+ if (NeedProbe && RVVStackSize == 0) {
+ // Do a probe if the align + size allocated just passed the probe size
+ // and was not yet probed.
+ if (SecondSPAdjustAmount < ProbeSize &&
+ SecondSPAdjustAmount + MaxAlignment.value() >= ProbeSize) {
+ bool IsRV64 = STI.is64Bit();
+ BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+ .addReg(RISCV::X0)
+ .addReg(SPReg)
+ .addImm(0)
+ .setMIFlags(MachineInstr::FrameSetup);
+ }
+ }
// FP will be used to restore the frame in the epilogue, so we need
// another base register BP to record SP after re-alignment. SP will
// track the current stack after allocating variable sized objects.
@@ -2016,9 +2081,11 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
}
// Synthesize the probe loop.
-static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- DebugLoc DL) {
+MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
+ bool IsRVV) const {
+ assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
@@ -2034,7 +2101,6 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
MF.insert(MBBInsertPoint, ExitMBB);
MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
- Register TargetReg = RISCV::X6;
Register ScratchReg = RISCV::X7;
// ScratchReg = ProbeSize
@@ -2055,12 +2121,29 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
.addImm(0)
.setMIFlags(Flags);
- // BNE SP, TargetReg, LoopTest
- BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
- .addReg(SPReg)
- .addReg(TargetReg)
- .addMBB(LoopTestMBB)
- .setMIFlags(Flags);
+ if (IsRVV) {
+ // SUB TargetReg, TargetReg, ProbeSize
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB),
+ TargetReg)
+ .addReg(TargetReg)
+ .addReg(ScratchReg)
+ .setMIFlags(Flags);
+
+ // BGE TargetReg, ProbeSize, LoopTest
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BGE))
+ .addReg(TargetReg)
+ .addReg(ScratchReg)
+ .addMBB(LoopTestMBB)
+ .setMIFlags(Flags);
+
+ } else {
+ // BNE SP, TargetReg, LoopTest
+ BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
+ .addReg(SPReg)
+ .addReg(TargetReg)
+ .addMBB(LoopTestMBB)
+ .setMIFlags(Flags);
+ }
ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
@@ -2069,16 +2152,33 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
MBB.addSuccessor(LoopTestMBB);
// Update liveins.
fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
+
+ return ExitMBB;
}
void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- auto Where = llvm::find_if(MBB, [](MachineInstr &MI) {
- return MI.getOpcode() == RISCV::PROBED_STACKALLOC;
- });
- if (Where != MBB.end()) {
- DebugLoc DL = MBB.findDebugLoc(Where);
- emitStackProbeInline(MF, MBB, Where, DL);
- Where->eraseFromParent();
+  // Collect the probe pseudos (at most two) before expanding any of them:
+  // expansion creates new blocks, which would disturb this traversal.
+  SmallVector<MachineInstr *, 4> ToReplace;
+  for (MachineInstr &MI : MBB) {
+    unsigned Opc = MI.getOpcode();
+    if (Opc == RISCV::PROBED_STACKALLOC ||
+        Opc == RISCV::PROBED_STACKALLOC_RVV) {
+      ToReplace.push_back(&MI);
+    }
+  }
+
+  for (MachineInstr *MI : ToReplace) {
+    // Expanding an earlier pseudo splices everything after it into a new exit
+    // block, so look up the block that holds this pseudo now instead of
+    // assuming it is still in MBB.
+    MachineBasicBlock &CurMBB = *MI->getParent();
+    MachineBasicBlock::iterator MBBI = MI->getIterator();
+    DebugLoc DL = CurMBB.findDebugLoc(MBBI);
+    Register TargetReg = MI->getOperand(1).getReg();
+    emitStackProbeInline(MF, CurMBB, MBBI, DL, TargetReg,
+                         (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+    MBBI->eraseFromParent();
+  }
}
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 190c063d9d3b5d..1a2c6e0302623d 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -83,6 +83,12 @@ class RISCVFrameLowering : public TargetFrameLowering {
uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
uint64_t ProbeSize) const;
+ MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL, Register TargetReg,
+ bool IsRVV) const;
+
protected:
const RISCVSubtarget &STI;
@@ -107,6 +113,11 @@ class RISCVFrameLowering : public TargetFrameLowering {
// Replace a StackProbe stub (if any) with the actual probe code inline
void inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologueMBB) const override;
+ void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int64_t Amount,
+ MachineInstr::MIFlag Flag,
+ bool EmitCFI) const;
};
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 14b571cebe1fec..d77e416a970b2f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1382,6 +1382,10 @@ def PROBED_STACKALLOC : Pseudo<(outs GPR:$sp),
(ins GPR:$scratch),
[]>,
Sched<[]>;
+def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
+ (ins GPR:$scratch),
+ []>,
+ Sched<[]>;
}
/// HI and ADD_LO address nodes.
diff --git a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
index c6a3649c9ba8fe..0052f4b9c041ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
@@ -64,3 +64,49 @@ define <vscale x 1 x i64> @access_fixed_and_vector_objects(ptr %val) {
ret <vscale x 1 x i64> %a
}
+
+define <vscale x 1 x i64> @probe_fixed_and_vector_objects(ptr %val, <vscale x 1 x i64> %dummy) "probe-stack"="inline-asm" {
+; RV64IV-LABEL: probe_fixed_and_vector_objects:
+; RV64IV: # %bb.0:
+; RV64IV-NEXT: addi sp, sp, -528
+; RV64IV-NEXT: .cfi_def_cfa_offset 528
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: .cfi_def_cfa t1, -8
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB2_1
+; RV64IV-NEXT: # %bb.2:
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 1 * vlenb
+; RV64IV-NEXT: addi a0, sp, 8
+; RV64IV-NEXT: vl1re64.v v9, (a0)
+; RV64IV-NEXT: addi a0, sp, 528
+; RV64IV-NEXT: vl1re64.v v10, (a0)
+; RV64IV-NEXT: ld a0, 520(sp)
+; RV64IV-NEXT: vsetvli zero, a0, e64, m1, tu, ma
+; RV64IV-NEXT: vadd.vv v8, v9, v10
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 528
+; RV64IV-NEXT: addi sp, sp, 528
+; RV64IV-NEXT: .cfi_def_cfa_offset 0
+; RV64IV-NEXT: ret
+ %local = alloca i64
+ %vector = alloca <vscale x 1 x i64>
+ %array = alloca [64 x i64]
+ %v1 = load <vscale x 1 x i64>, ptr %array
+ %v2 = load <vscale x 1 x i64>, ptr %vector
+ %len = load i64, ptr %local
+
+ %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(
+ <vscale x 1 x i64> %dummy,
+ <vscale x 1 x i64> %v1,
+ <vscale x 1 x i64> %v2,
+ i64 %len)
+
+ ret <vscale x 1 x i64> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
new file mode 100644
index 00000000000000..d7f9ae73eaea54
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
+; RUN: | FileCheck %s -check-prefix=RV64IV
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
+; RUN: | FileCheck %s -check-prefix=RV32IV
+
+; Tests adapted from AArch64.
+
+; Test prolog sequences for stack probing when vector is involved.
+
+; The space for vector objects needs probing in the general case, because
+; the stack adjustment may happen to be too big (i.e. greater than the
+; probe size).
+
+define void @f_vector(ptr %out) #0 {
+; RV64IV-LABEL: f_vector:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: slli t1, t1, 1
+; RV64IV-NEXT: .cfi_def_cfa t1, -16
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB0_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB0_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: slli a0, a0, 1
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f_vector:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: slli t1, t1, 1
+; RV32IV-NEXT: .cfi_def_cfa t1, -16
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB0_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB0_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: slli a0, a0, 1
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 0
+; RV32IV-NEXT: ret
+entry:
+ %vec = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 4 vectors of stack space.
+define void @f4_vector(ptr %out) #0 {
+; RV64IV-LABEL: f4_vector:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: slli t1, t1, 3
+; RV64IV-NEXT: .cfi_def_cfa t1, -64
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB1_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB1_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: slli a0, a0, 3
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f4_vector:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: slli t1, t1, 3
+; RV32IV-NEXT: .cfi_def_cfa t1, -64
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB1_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB1_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: slli a0, a0, 3
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 0
+; RV32IV-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 16 vectors of stack space.
+; The "16 x 256 = 4096" bound presumably comes from the AArch64 original; here
+; the adjustment is 32 * vlenb and still goes through the probe loop.
+define void @f16_vector(ptr %out) #0 {
+; RV64IV-LABEL: f16_vector:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: slli t1, t1, 5
+; RV64IV-NEXT: .cfi_def_cfa t1, -256
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB2_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB2_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: slli a0, a0, 5
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f16_vector:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: slli t1, t1, 5
+; RV32IV-NEXT: .cfi_def_cfa t1, -256
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB2_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB2_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: slli a0, a0, 5
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 0
+; RV32IV-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ %vec5 = alloca <vscale x 4 x float>, align 16
+ %vec6 = alloca <vscale x 4 x float>, align 16
+ %vec7 = alloca <vscale x 4 x float>, align 16
+ %vec8 = alloca <vscale x 4 x float>, align 16
+ %vec9 = alloca <vscale x 4 x float>, align 16
+ %vec10 = alloca <vscale x 4 x float>, align 16
+ %vec11 = alloca <vscale x 4 x float>, align 16
+ %vec12 = alloca <vscale x 4 x float>, align 16
+ %vec13 = alloca <vscale x 4 x float>, align 16
+ %vec14 = alloca <vscale x 4 x float>, align 16
+ %vec15 = alloca <vscale x 4 x float>, align 16
+ %vec16 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; As above, but with 17 vectors of stack space.
+define void @f17_vector(ptr %out) #0 {
+; RV64IV-LABEL: f17_vector:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: li a0, 34
+; RV64IV-NEXT: mul t1, t1, a0
+; RV64IV-NEXT: .cfi_def_cfa t1, -272
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB3_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB3_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: li a1, 34
+; RV64IV-NEXT: mul a0, a0, a1
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f17_vector:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: li a0, 34
+; RV32IV-NEXT: mul t1, t1, a0
+; RV32IV-NEXT: .cfi_def_cfa t1, -272
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB3_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB3_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: li a1, 34
+; RV32IV-NEXT: mul a0, a0, a1
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 0
+; RV32IV-NEXT: ret
+entry:
+ %vec1 = alloca <vscale x 4 x float>, align 16
+ %vec2 = alloca <vscale x 4 x float>, align 16
+ %vec3 = alloca <vscale x 4 x float>, align 16
+ %vec4 = alloca <vscale x 4 x float>, align 16
+ %vec5 = alloca <vscale x 4 x float>, align 16
+ %vec6 = alloca <vscale x 4 x float>, align 16
+ %vec7 = alloca <vscale x 4 x float>, align 16
+ %vec8 = alloca <vscale x 4 x float>, align 16
+ %vec9 = alloca <vscale x 4 x float>, align 16
+ %vec10 = alloca <vscale x 4 x float>, align 16
+ %vec11 = alloca <vscale x 4 x float>, align 16
+ %vec12 = alloca <vscale x 4 x float>, align 16
+ %vec13 = alloca <vscale x 4 x float>, align 16
+ %vec14 = alloca <vscale x 4 x float>, align 16
+ %vec15 = alloca <vscale x 4 x float>, align 16
+ %vec16 = alloca <vscale x 4 x float>, align 16
+ %vec17 = alloca <vscale x 4 x float>, align 16
+ ret void
+}
+
+; A vector and a 16-byte fixed size object.
+define void @f1_vector_16_arr(ptr %out) #0 {
+; RV64IV-LABEL: f1_vector_16_arr:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: addi sp, sp, -16
+; RV64IV-NEXT: .cfi_def_cfa_offset 16
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: slli t1, t1, 1
+; RV64IV-NEXT: .cfi_def_cfa t1, -16
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB4_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB4_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: slli a0, a0, 1
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 16
+; RV64IV-NEXT: addi sp, sp, 16
+; RV64IV-NEXT: .cfi_def_cfa_offset 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f1_vector_16_arr:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: addi sp, sp, -16
+; RV32IV-NEXT: .cfi_def_cfa_offset 16
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: slli t1, t1, 1
+; RV32IV-NEXT: .cfi_def_cfa t1, -16
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB4_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB4_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: slli a0, a0, 1
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 16
+; RV32IV-NEXT: addi sp, sp, 16
+; RV32IV-NEXT: .cfi_def_cfa_offset 0
+; RV32IV-NEXT: ret
+entry:
+ %vec = alloca <vscale x 4 x float>, align 16
+ %arr = alloca i8, i64 16, align 1
+ ret void
+}
+
+; A large vector object and a large slot, both of which need probing.
+define void @f1_vector_4096_arr(ptr %out) #0 {
+; RV64IV-LABEL: f1_vector_4096_arr:
+; RV64IV: # %bb.0: # %entry
+; RV64IV-NEXT: lui a0, 1
+; RV64IV-NEXT: sub sp, sp, a0
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: .cfi_def_cfa_offset 4096
+; RV64IV-NEXT: lui a0, 1
+; RV64IV-NEXT: sub sp, sp, a0
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: .cfi_def_cfa_offset 8192
+; RV64IV-NEXT: lui a0, 1
+; RV64IV-NEXT: sub sp, sp, a0
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: .cfi_def_cfa_offset 12288
+; RV64IV-NEXT: addi sp, sp, -16
+; RV64IV-NEXT: .cfi_def_cfa_offset 12304
+; RV64IV-NEXT: csrr t1, vlenb
+; RV64IV-NEXT: slli t1, t1, 7
+; RV64IV-NEXT: .cfi_def_cfa t1, -1024
+; RV64IV-NEXT: lui t2, 1
+; RV64IV-NEXT: .LBB5_1: # %entry
+; RV64IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT: sub sp, sp, t2
+; RV64IV-NEXT: sd zero, 0(sp)
+; RV64IV-NEXT: sub t1, t1, t2
+; RV64IV-NEXT: bge t1, t2, .LBB5_1
+; RV64IV-NEXT: # %bb.2: # %entry
+; RV64IV-NEXT: .cfi_def_cfa_register sp
+; RV64IV-NEXT: sub sp, sp, t1
+; RV64IV-NEXT: .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
+; RV64IV-NEXT: csrr a0, vlenb
+; RV64IV-NEXT: slli a0, a0, 7
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa sp, 12304
+; RV64IV-NEXT: lui a0, 3
+; RV64IV-NEXT: addiw a0, a0, 16
+; RV64IV-NEXT: add sp, sp, a0
+; RV64IV-NEXT: .cfi_def_cfa_offset 0
+; RV64IV-NEXT: ret
+;
+; RV32IV-LABEL: f1_vector_4096_arr:
+; RV32IV: # %bb.0: # %entry
+; RV32IV-NEXT: lui a0, 1
+; RV32IV-NEXT: sub sp, sp, a0
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: .cfi_def_cfa_offset 4096
+; RV32IV-NEXT: lui a0, 1
+; RV32IV-NEXT: sub sp, sp, a0
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: .cfi_def_cfa_offset 8192
+; RV32IV-NEXT: lui a0, 1
+; RV32IV-NEXT: sub sp, sp, a0
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: .cfi_def_cfa_offset 12288
+; RV32IV-NEXT: addi sp, sp, -16
+; RV32IV-NEXT: .cfi_def_cfa_offset 12304
+; RV32IV-NEXT: csrr t1, vlenb
+; RV32IV-NEXT: slli t1, t1, 7
+; RV32IV-NEXT: .cfi_def_cfa t1, -1024
+; RV32IV-NEXT: lui t2, 1
+; RV32IV-NEXT: .LBB5_1: # %entry
+; RV32IV-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT: sub sp, sp, t2
+; RV32IV-NEXT: sw zero, 0(sp)
+; RV32IV-NEXT: sub t1, t1, t2
+; RV32IV-NEXT: bge t1, t2, .LBB5_1
+; RV32IV-NEXT: # %bb.2: # %entry
+; RV32IV-NEXT: .cfi_def_cfa_register sp
+; RV32IV-NEXT: sub sp, sp, t1
+; RV32IV-NEXT: .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
+; RV32IV-NEXT: csrr a0, vlenb
+; RV32IV-NEXT: slli a0, a0, 7
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa sp, 12304
+; RV32IV-NEXT: lui a0, 3
+; RV32IV-NEXT: addi a0, a0, 16
+; RV32IV-NEXT: add sp, sp, a0
+; RV32IV-NEXT: .cfi_def_cfa_offset 0
+; RV32IV-NEXT: ret
+entry:
+ %vec = alloca <vscale x 256 x float>, align 16
+ %arr = alloca i8, i64 12288, align 1
+ ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
More information about the llvm-commits
mailing list