[llvm] [Hexagon] Add probe-stack=inline-asm support for stack clash protection (PR #190568)

via llvm-commits llvm-commits at lists.llvm.org
Sun Apr 5 18:13:56 PDT 2026


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-hexagon

Author: Brian Cain (androm3da)

<details>
<summary>Changes</summary>

Large stack allocations can skip over guard pages, causing stack clash vulnerabilities. The probe-stack=inline-asm function attribute tells LLVM to emit inline probing code that touches each page during stack allocation, ensuring guard pages are hit.

`framelimit` provides hardware bounds checking, but only on the allocframe instruction itself. SP decrements via A2_addi -- used for frames >= 16k bytes and for all no-FP prologues -- bypass it. Software probing closes that gap.

When the attribute is present and the frame size exceeds the probe size, the prologue now emits a PS_probed_stackalloc pseudo that inlineStackProbe() expands into a compare-and-branch loop:

  r29 = add(r29, #-ProbeSize)
  memw(r29+#0) = #0
  p0 = cmp.gtu(r29, r28)
  if (p0) jump LoopMBB
  r29 = r28

Both the frame-pointer and no-frame-pointer prologue paths are handled. The stack-probe-size attribute is respected for custom probe sizes.

---
Full diff: https://github.com/llvm/llvm-project/pull/190568.diff


6 Files Affected:

- (modified) llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp (+144-4) 
- (modified) llvm/lib/Target/Hexagon/HexagonFrameLowering.h (+3) 
- (modified) llvm/lib/Target/Hexagon/HexagonISelLowering.cpp (+18) 
- (modified) llvm/lib/Target/Hexagon/HexagonISelLowering.h (+3) 
- (modified) llvm/lib/Target/Hexagon/HexagonPseudo.td (+6) 
- (added) llvm/test/CodeGen/Hexagon/stack-probing.ll (+166) 


``````````diff
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 0f73efc243986..2bd40d0eb6f28 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -9,6 +9,7 @@
 
 #include "HexagonFrameLowering.h"
 #include "HexagonBlockRanges.h"
+#include "HexagonISelLowering.h"
 #include "HexagonInstrInfo.h"
 #include "HexagonMachineFunctionInfo.h"
 #include "HexagonRegisterInfo.h"
@@ -746,9 +747,29 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
              .addExternalSymbol("__runtime_stack_check");
   } else if (NumBytes > 0) {
     assert(alignTo(NumBytes, 8) == NumBytes);
-    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
-      .addReg(SP)
-      .addImm(-int(NumBytes));
+    auto *TLI = HST.getTargetLowering();
+    bool NeedsProbing = TLI->hasInlineStackProbe(MF);
+    unsigned ProbeSize = 0;
+    if (NeedsProbing) {
+      Align StackAlign = getStackAlign();
+      ProbeSize = TLI->getStackProbeSize(MF, StackAlign);
+    }
+    if (NeedsProbing && NumBytes > ProbeSize) {
+      // Compute target SP in R28 (caller-saved scratch).
+      BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), Hexagon::R28)
+          .addReg(SP)
+          .addImm(-int(NumBytes))
+          .setMIFlag(MachineInstr::FrameSetup);
+      // Emit pseudo to be expanded by inlineStackProbe().
+      BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_probed_stackalloc))
+          .addReg(Hexagon::R28)
+          .setMIFlag(MachineInstr::FrameSetup);
+    } else {
+      BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
+          .addReg(SP)
+          .addImm(-int(NumBytes))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
   }
 }
 
@@ -892,7 +913,34 @@ void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB,
   DebugLoc dl = MBB.findDebugLoc(InsertPt);
   Register SP = HRI.getStackRegister();
 
-  if (NumBytes >= ALLOCFRAME_MAX) {
+  auto *TLI = HST.getTargetLowering();
+  bool NeedsProbing = TLI->hasInlineStackProbe(MF) && NumBytes > 0;
+  unsigned ProbeSize = 0;
+  if (NeedsProbing) {
+    Align StackAlign = getStackAlign();
+    ProbeSize = TLI->getStackProbeSize(MF, StackAlign);
+  }
+
+  if (NeedsProbing && NumBytes > ProbeSize) {
+    // Emit allocframe(#0) to save FP/LR only.
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
+        .addDef(SP)
+        .addReg(SP)
+        .addImm(0)
+        .addMemOperand(MMO)
+        .setMIFlag(MachineInstr::FrameSetup);
+
+    // Compute target SP in R28 (caller-saved scratch).
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), Hexagon::R28)
+        .addReg(SP)
+        .addImm(-int(NumBytes))
+        .setMIFlag(MachineInstr::FrameSetup);
+
+    // Emit pseudo to be expanded by inlineStackProbe().
+    BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::PS_probed_stackalloc))
+        .addReg(Hexagon::R28)
+        .setMIFlag(MachineInstr::FrameSetup);
+  } else if (NumBytes >= ALLOCFRAME_MAX) {
     // Emit allocframe(#0).
     BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_allocframe))
         .addDef(SP)
@@ -917,6 +965,98 @@ void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB,
   }
 }
 
+void HexagonFrameLowering::inlineStackProbe(
+    MachineFunction &MF, MachineBasicBlock &PrologueMBB) const {
+  // Collect PS_probed_stackalloc pseudos to expand. Collecting first avoids
+  // issues with modifying the block while iterating.
+  SmallVector<MachineInstr *, 2> ToReplace;
+  for (MachineInstr &MI : PrologueMBB)
+    if (MI.getOpcode() == Hexagon::PS_probed_stackalloc)
+      ToReplace.push_back(&MI);
+
+  auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  auto &HII = *HST.getInstrInfo();
+  auto *TLI = HST.getTargetLowering();
+  Align StackAlign = getStackAlign();
+  unsigned ProbeSize = TLI->getStackProbeSize(MF, StackAlign);
+  MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
+
+  for (MachineInstr *MI : ToReplace) {
+    MachineBasicBlock::iterator MBBI = MI->getIterator();
+    DebugLoc DL = PrologueMBB.findDebugLoc(MBBI);
+    Register TargetReg = MI->getOperand(0).getReg();
+
+    // Split the block: everything after the pseudo goes into ExitMBB.
+    MachineBasicBlock *MBB = MI->getParent();
+    MachineFunction::iterator InsertPt = std::next(MBB->getIterator());
+    MachineBasicBlock *LoopMBB =
+        MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+    MF.insert(InsertPt, LoopMBB);
+    MachineBasicBlock *ExitMBB =
+        MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+    MF.insert(InsertPt, ExitMBB);
+
+    // Move everything after the pseudo into ExitMBB.
+    ExitMBB->splice(ExitMBB->end(), MBB, std::next(MBBI), MBB->end());
+    ExitMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+    // LoopMBB: probe each page by decrementing SP and storing zero.
+    // When NumBytes is not an exact multiple of ProbeSize the loop
+    // will overshoot by up to ProbeSize-1 bytes; the final r29 = r28
+    // in ExitMBB corrects SP to the true target.
+    //
+    // The store is placed before the compare+branch so that the
+    // packetizer can bundle them into a single VLIW packet.  All
+    // non-predicated instructions in a packet commit unconditionally,
+    // so the probe store executes on every iteration including the
+    // last (when the branch falls through).
+    //
+    //   r29 = add(r29, #-ProbeSize)
+    //   memw(r29+#0) = #0
+    //   p0 = cmp.gtu(r29, r28)
+    //   if (p0) jump LoopMBB
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, HII.get(Hexagon::A2_addi),
+            Hexagon::R29)
+        .addReg(Hexagon::R29)
+        .addImm(-int(ProbeSize))
+        .setMIFlags(Flags);
+
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, HII.get(Hexagon::S4_storeiri_io))
+        .addReg(Hexagon::R29)
+        .addImm(0)
+        .addImm(0)
+        .setMIFlags(Flags);
+
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, HII.get(Hexagon::C2_cmpgtu),
+            Hexagon::P0)
+        .addReg(Hexagon::R29)
+        .addReg(TargetReg)
+        .setMIFlags(Flags);
+
+    BuildMI(*LoopMBB, LoopMBB->end(), DL, HII.get(Hexagon::J2_jumpt))
+        .addReg(Hexagon::P0)
+        .addMBB(LoopMBB)
+        .setMIFlags(Flags);
+
+    // ExitMBB: set final SP.
+    BuildMI(*ExitMBB, ExitMBB->begin(), DL, HII.get(Hexagon::A2_tfr),
+            Hexagon::R29)
+        .addReg(TargetReg)
+        .setMIFlags(Flags);
+
+    // Set up CFG edges.
+    MBB->addSuccessor(LoopMBB);
+    LoopMBB->addSuccessor(LoopMBB);
+    LoopMBB->addSuccessor(ExitMBB);
+
+    // Remove the pseudo.
+    MI->eraseFromParent();
+
+    // Recompute live-ins for the new blocks.
+    fullyRecomputeLiveIns({ExitMBB, LoopMBB});
+  }
+}
+
 void HexagonFrameLowering::updateEntryPaths(MachineFunction &MF,
       MachineBasicBlock &SaveB) const {
   SetVector<unsigned> Worklist;
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
index 926aadb01f50e..285e6a5896461 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -113,6 +113,9 @@ class HexagonFrameLowering : public TargetFrameLowering {
 
   void insertCFIInstructions(MachineFunction &MF) const;
 
+  void inlineStackProbe(MachineFunction &MF,
+                        MachineBasicBlock &PrologueMBB) const override;
+
 protected:
   bool hasFPImpl(const MachineFunction &MF) const override;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 6b2a963e2e777..a81fd20d715f2 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -4004,3 +4004,21 @@ bool HexagonTargetLowering::isUsedByReturnOnly(SDNode *N,
   Chain = Copy->getOperand(0);
   return true;
 }
+
+bool HexagonTargetLowering::hasInlineStackProbe(
+    const MachineFunction &MF) const {
+  if (MF.getFunction().hasFnAttribute("probe-stack"))
+    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+           "inline-asm";
+  return false;
+}
+
+unsigned HexagonTargetLowering::getStackProbeSize(const MachineFunction &MF,
+                                                  Align StackAlign) const {
+  const Function &Fn = MF.getFunction();
+  unsigned StackProbeSize =
+      Fn.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
+  // Round down to the stack alignment.
+  StackProbeSize = alignDown(StackProbeSize, StackAlign.value());
+  return StackProbeSize ? StackProbeSize : StackAlign.value();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 55be8fd043ad5..60cf38a6abcc7 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -287,6 +287,9 @@ class HexagonTargetLowering : public TargetLowering {
     return AtomicExpansionKind::LLSC;
   }
 
+  bool hasInlineStackProbe(const MachineFunction &MF) const override;
+  unsigned getStackProbeSize(const MachineFunction &MF, Align StackAlign) const;
+
 private:
   void initializeHVXLowering();
   unsigned getPreferredHvxVectorAction(MVT VecTy) const;
diff --git a/llvm/lib/Target/Hexagon/HexagonPseudo.td b/llvm/lib/Target/Hexagon/HexagonPseudo.td
index 4e5e8c3d26f00..fc74d8e2d7882 100644
--- a/llvm/lib/Target/Hexagon/HexagonPseudo.td
+++ b/llvm/lib/Target/Hexagon/HexagonPseudo.td
@@ -335,6 +335,12 @@ let Defs = [R29], hasSideEffects = 1 in
 def PS_alloca: Pseudo <(outs IntRegs:$Rd),
                        (ins IntRegs:$Rs, u32_0Imm:$A), "", []>;
 
+// Probed stack allocation pseudo. Expanded in inlineStackProbe() into a
+// compare-and-branch loop that touches each page.
+let Defs = [R29, P0], Uses = [R29], hasSideEffects = 1, mayStore = 1,
+    isCodeGenOnly = 1 in
+def PS_probed_stackalloc : Pseudo<(outs), (ins IntRegs:$target), "", []>;
+
 // Load predicate.
 let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
     isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
diff --git a/llvm/test/CodeGen/Hexagon/stack-probing.ll b/llvm/test/CodeGen/Hexagon/stack-probing.ll
new file mode 100644
index 0000000000000..a59556a782a53
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/stack-probing.ll
@@ -0,0 +1,166 @@
+; RUN: llc -mtriple=hexagon < %s | FileCheck %s
+
+;; Small frame (< probe size): no probing loop, normal allocframe.
+; CHECK-LABEL: small_frame:
+; CHECK: allocframe(r29,#128):raw
+; CHECK-NOT: cmp.gtu
+; CHECK: dealloc_return
+define void @small_frame() #0 {
+entry:
+  %a = alloca [128 x i8], align 1
+  call void @use(ptr %a)
+  ret void
+}
+
+;; Large frame (> probe size): probing loop emitted.
+; CHECK-LABEL: large_frame:
+; CHECK: allocframe(r29,#0):raw
+; CHECK: r28 = add(r29,#-8192)
+; CHECK: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK: r29 = add(r29,#-4096)
+; CHECK: p0 = cmp.gtu(r29,r28)
+; CHECK: if (p0.new) jump:t .LBB
+; CHECK: memw(r29+#0) = #0
+; CHECK: r29 = r28
+define void @large_frame() #0 {
+entry:
+  %a = alloca [8192 x i8], align 1
+  call void @use(ptr %a)
+  ret void
+}
+
+;; Exact multiple of probe size: probing loop still emitted.
+; CHECK-LABEL: exact_multiple:
+; CHECK: allocframe(r29,#0):raw
+; CHECK: r28 = add(r29,#-12288)
+; CHECK: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK: r29 = add(r29,#-4096)
+; CHECK: p0 = cmp.gtu(r29,r28)
+; CHECK: if (p0.new) jump:t .LBB
+; CHECK: memw(r29+#0) = #0
+; CHECK: r29 = r28
+define void @exact_multiple() #0 {
+entry:
+  %a = alloca [12288 x i8], align 1
+  call void @use(ptr %a)
+  ret void
+}
+
+;; No frame pointer path with no call: probing works without allocframe.
+; CHECK-LABEL: no_fp_large:
+; CHECK-NOT: allocframe
+; CHECK: r28 = add(r29,#-8192)
+; CHECK: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK: r29 = add(r29,#-4096)
+; CHECK: p0 = cmp.gtu(r29,r28)
+; CHECK: if (p0.new) jump:t .LBB
+; CHECK: memw(r29+#0) = #0
+; CHECK: r29 = r28
+define void @no_fp_large() #1 {
+entry:
+  %a = alloca [8192 x i8], align 1
+  store volatile i8 0, ptr %a
+  ret void
+}
+
+;; Custom probe size of 512 bytes.
+; CHECK-LABEL: custom_probe_size:
+; CHECK: allocframe(r29,#0):raw
+; CHECK: r28 = add(r29,#-8192)
+; CHECK: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK: r29 = add(r29,#-512)
+; CHECK: p0 = cmp.gtu(r29,r28)
+; CHECK: if (p0.new) jump:t .LBB
+; CHECK: memw(r29+#0) = #0
+; CHECK: r29 = r28
+define void @custom_probe_size() #2 {
+entry:
+  %a = alloca [8192 x i8], align 1
+  call void @use(ptr %a)
+  ret void
+}
+
+;; No probe attribute: normal codegen, no probing.
+; CHECK-LABEL: no_probe:
+; CHECK: allocframe
+; CHECK-NOT: cmp.gtu
+; CHECK: dealloc_return
+define void @no_probe() {
+entry:
+  %a = alloca [8192 x i8], align 1
+  call void @use(ptr %a)
+  ret void
+}
+
+;; Frame >= ALLOCFRAME_MAX (16384): allocframe(#0) + probed alloc.
+; CHECK-LABEL: very_large_frame:
+; CHECK: allocframe(r29,#0):raw
+; CHECK: r28 = add(r29,#-20480)
+; CHECK: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK: r29 = add(r29,#-4096)
+; CHECK: p0 = cmp.gtu(r29,r28)
+; CHECK: if (p0.new) jump:t .LBB
+; CHECK: memw(r29+#0) = #0
+; CHECK: r29 = r28
+define void @very_large_frame() #0 {
+entry:
+  %a = alloca [20480 x i8], align 1
+  call void @use(ptr %a)
+  ret void
+}
+
+;; Frame == probe size exactly: no probing loop, normal allocframe.
+; CHECK-LABEL: exact_probe_size:
+; CHECK: allocframe(r29,#4096):raw
+; CHECK-NOT: cmp.gtu
+; CHECK: dealloc_return
+define void @exact_probe_size() #0 {
+entry:
+  %a = alloca [4096 x i8], align 1
+  call void @use(ptr %a)
+  ret void
+}
+
+;; Large frame requiring constant-extended immediate (> 32767).
+; CHECK-LABEL: const_extd_frame:
+; CHECK: allocframe(r29,#0):raw
+; CHECK: r28 = add(r29,##-65536)
+; CHECK: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK: r29 = add(r29,#-4096)
+; CHECK: p0 = cmp.gtu(r29,r28)
+; CHECK: if (p0.new) jump:t .LBB
+; CHECK: memw(r29+#0) = #0
+; CHECK: r29 = r28
+define void @const_extd_frame() #0 {
+entry:
+  %a = alloca [65536 x i8], align 1
+  call void @use(ptr %a)
+  ret void
+}
+
+;; Callee-saved register spills coexist with probing.
+; CHECK-LABEL: callee_saved_regs:
+; CHECK: allocframe(r29,#0):raw
+; CHECK: r28 = add(r29,#-8216)
+; CHECK: .LBB{{[0-9]+}}_{{[0-9]+}}:
+; CHECK: r29 = add(r29,#-4096)
+; CHECK: p0 = cmp.gtu(r29,r28)
+; CHECK: if (p0.new) jump:t .LBB
+; CHECK: memw(r29+#0) = #0
+; CHECK: r29 = r28
+; CHECK: memd(r29+##{{[0-9]+}}) = r{{[0-9]+}}:{{[0-9]+}}
+; CHECK: dealloc_return
+define void @callee_saved_regs(ptr %p) #0 {
+entry:
+  %a = alloca [8192 x i8], align 1
+  call void @use(ptr %a)
+  call void asm sideeffect "", "~{r16},~{r17},~{r18},~{r19}"()
+  call void @use(ptr %p)
+  ret void
+}
+
+declare void @use(ptr)
+
+attributes #0 = { "probe-stack"="inline-asm" }
+attributes #1 = { nounwind "frame-pointer"="none" "probe-stack"="inline-asm" }
+attributes #2 = { "probe-stack"="inline-asm" "stack-probe-size"="512" }

``````````

</details>


https://github.com/llvm/llvm-project/pull/190568


More information about the llvm-commits mailing list