[llvm-branch-commits] [llvm] release/22.x v2: [CodeGen][RISCV] Inline stack probes immediately after allocateStack in eliminateCallFramePseudoInstr (#195456) (PR #202882)

Wed Jun 10 01:09:55 PDT 2026

https://github.com/CSharperMantle created https://github.com/llvm/llvm-project/pull/202882

Backport 589faedadf141e5e63f7a1e92a0327fc9bdc9b09 to release/22.x.

Supersedes #202821.

>From cee06e47014e8d74bd7918229a3f65c5c97b84d3 Mon Sep 17 00:00:00 2001
From: "Rong \"Mantle\" Bao" <rong.bao at csmantle.top>
Date: Fri, 8 May 2026 08:14:18 +0800
Subject: [PATCH] Inline stack probes immediately after `allocateStack` in
 `eliminateCallFramePseudoInstr` (#195456)

[ Upstream commit 589faedadf141e5e63f7a1e92a0327fc9bdc9b09 ]

Compared to the upstream commit, the expected test output uses `blt`
instead of `bltu` in the probing loops, because commit
f162be248636046a20e71209e139347e084b637a has not been applied to
release/22.x yet.

Link: https://github.com/llvm/llvm-project/pull/192485 ("[RISCV] Use
 unsigned comparison for stack clash probing loop")

---

This PR adds a call to `inlineStackProbe` immediately after
`allocateStack` in `eliminateCallFramePseudoInstr`, so that stack probe
pseudo-instructions are also expanded when they are emitted in
non-entry basic blocks.

Fixes #195454.
---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  |   1 +
 .../RISCV/stack-probing-dynamic-nonentry.ll   | 115 ++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/stack-probing-dynamic-nonentry.ll

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 8246623e8e5aa..20b43538d69c4 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -1898,6 +1898,7 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
                       needsDwarfCFI(MF) && !hasFP(MF),
                       /*NeedProbe=*/true, ProbeSize, DynAllocation,
                       MachineInstr::NoFlags);
+        inlineStackProbe(MF, MBB);
       } else {
         const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
         RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount),
diff --git a/llvm/test/CodeGen/RISCV/stack-probing-dynamic-nonentry.ll b/llvm/test/CodeGen/RISCV/stack-probing-dynamic-nonentry.ll
new file mode 100644
index 0000000000000..4c8bb653b4cff
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/stack-probing-dynamic-nonentry.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv64 -mattr=+m -O2 < %s | FileCheck %s -check-prefix=RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m -O2 < %s | FileCheck %s -check-prefix=RV32
+
+; Test that very large outgoing call frames in functions with variable-sized
+; objects get proper stack probing. The outgoing args are large enough to force
+; the PROBED_STACKALLOC path, which must be expanded in a non-entry block.
+
+define void @f(i64 %n) #0 {
+; RV64-LABEL: f:
+; RV64:       # %bb.0: # %entry
+; RV64-NEXT:    addi sp, sp, -16
+; RV64-NEXT:    .cfi_def_cfa_offset 16
+; RV64-NEXT:    sd zero, 0(sp)
+; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    addi s0, sp, 16
+; RV64-NEXT:    .cfi_def_cfa s0, 0
+; RV64-NEXT:    slli a0, a0, 2
+; RV64-NEXT:    addi a0, a0, 15
+; RV64-NEXT:    andi a0, a0, -16
+; RV64-NEXT:    sub a0, sp, a0
+; RV64-NEXT:    lui a1, 1
+; RV64-NEXT:  .LBB0_1: # %entry
+; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-NEXT:    sub sp, sp, a1
+; RV64-NEXT:    sd zero, 0(sp)
+; RV64-NEXT:    blt a0, sp, .LBB0_1
+; RV64-NEXT:  # %bb.2: # %entry
+; RV64-NEXT:    mv sp, a0
+; RV64-NEXT:    lui a1, 5
+; RV64-NEXT:    sub t1, sp, a1
+; RV64-NEXT:    lui t2, 1
+; RV64-NEXT:  .LBB0_3: # %entry
+; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-NEXT:    sub sp, sp, t2
+; RV64-NEXT:    sd zero, 0(sp)
+; RV64-NEXT:    bne sp, t1, .LBB0_3
+; RV64-NEXT:  # %bb.4: # %entry
+; RV64-NEXT:    addi sp, sp, -2048
+; RV64-NEXT:    addi sp, sp, -1424
+; RV64-NEXT:    sd zero, 0(sp)
+; RV64-NEXT:    call g
+; RV64-NEXT:    lui a0, 6
+; RV64-NEXT:    addi a0, a0, -624
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    addi sp, s0, -16
+; RV64-NEXT:    .cfi_def_cfa sp, 16
+; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    addi sp, sp, 16
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
+;
+; RV32-LABEL: f:
+; RV32:       # %bb.0: # %entry
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    sw zero, 0(sp)
+; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    addi s0, sp, 16
+; RV32-NEXT:    .cfi_def_cfa s0, 0
+; RV32-NEXT:    slli a0, a0, 2
+; RV32-NEXT:    addi a0, a0, 15
+; RV32-NEXT:    andi a0, a0, -16
+; RV32-NEXT:    sub a0, sp, a0
+; RV32-NEXT:    lui a1, 1
+; RV32-NEXT:  .LBB0_1: # %entry
+; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-NEXT:    sub sp, sp, a1
+; RV32-NEXT:    sw zero, 0(sp)
+; RV32-NEXT:    blt a0, sp, .LBB0_1
+; RV32-NEXT:  # %bb.2: # %entry
+; RV32-NEXT:    mv sp, a0
+; RV32-NEXT:    lui a1, 5
+; RV32-NEXT:    sub t1, sp, a1
+; RV32-NEXT:    lui t2, 1
+; RV32-NEXT:  .LBB0_3: # %entry
+; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-NEXT:    sub sp, sp, t2
+; RV32-NEXT:    sw zero, 0(sp)
+; RV32-NEXT:    bne sp, t1, .LBB0_3
+; RV32-NEXT:  # %bb.4: # %entry
+; RV32-NEXT:    addi sp, sp, -2048
+; RV32-NEXT:    addi sp, sp, -1456
+; RV32-NEXT:    sw zero, 0(sp)
+; RV32-NEXT:    call g
+; RV32-NEXT:    lui a0, 6
+; RV32-NEXT:    addi a0, a0, -592
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    addi sp, s0, -16
+; RV32-NEXT:    .cfi_def_cfa sp, 16
+; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+entry:
+  %v = alloca i32, i64 %n
+  call void @g(ptr %v, [3000 x i64] poison)
+  ret void
+}
+
+declare void @g(ptr, [3000 x i64])
+
+attributes #0 = { "probe-stack"="inline-asm" }