[llvm] [RISCV] Add stack clash vector support (PR #119458)

Raphael Moreira Zinsly via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 18 07:31:34 PST 2024


https://github.com/rzinsly updated https://github.com/llvm/llvm-project/pull/119458

From 68e37b28c712a3ecaf130131ea8e90b263b62d6f Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly <rzinsly at ventanamicro.com>
Date: Tue, 10 Dec 2024 17:50:50 -0300
Subject: [PATCH 1/2] [RISCV] Add stack clash vector support

Use the probe loop structure to allocate the stack space for RVV objects
as well. We add the pseudo instruction RISCV::PROBED_STACKALLOC_RVV to
differentiate it from the normal probe loop.
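
For reference, a sketch of the full allocation sequence built around the
pseudo (the loop itself is what PROBED_STACKALLOC_RVV expands to), matching
the RV64 CHECK lines in the new test; the shift amount depends on the object
size, and mulImm may emit a different multiply sequence:

  csrr  t1, vlenb        # t1 = VLENB
  slli  t1, t1, 1        # t1 = RVV stack size (here 2 * vlenb)
  lui   t2, 1            # t2 = probe size (4096)
.LBB0_1:
  sub   sp, sp, t2       # allocate one probe-size chunk
  sd    zero, 0(sp)      # probe it
  sub   t1, t1, t2       # bytes left to allocate
  bge   t1, t2, .LBB0_1  # loop while a full chunk remains
  sub   sp, sp, t1       # allocate the remainder (t1 < 4096)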
---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  | 148 +++++--
 llvm/lib/Target/RISCV/RISCVFrameLowering.h    |  11 +
 llvm/lib/Target/RISCV/RISCVInstrInfo.td       |   4 +
 .../RISCV/rvv/access-fixed-objects-by-rvv.ll  |  46 ++
 .../CodeGen/RISCV/rvv/stack-probing-rvv.ll    | 400 ++++++++++++++++++
 5 files changed, 585 insertions(+), 24 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 1028149bf513f4..04f841d589ce89 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -499,6 +499,54 @@ getPushOrLibCallsSavedInfo(const MachineFunction &MF,
   return PushOrLibCallsCSI;
 }
 
+void RISCVFrameLowering::allocateAndProbeStackForRVV(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount,
+    MachineInstr::MIFlag Flag, bool EmitCFI) const {
+  assert(Amount != 0 && "Did not need to adjust stack pointer for RVV.");
+
+  // Emit a variable-length allocation probing loop.
+
+  // Get VLENB in TargetReg and scale it to the total RVV stack size.
+  const RISCVInstrInfo *TII = STI.getInstrInfo();
+  Register TargetReg = RISCV::X6;
+  uint32_t NumOfVReg = Amount / (RISCV::RVVBitsPerBlock / 8);
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoReadVLENB), TargetReg)
+      .setMIFlag(Flag);
+  TII->mulImm(MF, MBB, MBBI, DL, TargetReg, NumOfVReg, Flag);
+
+  if (EmitCFI) {
+    // Set the CFA register to TargetReg.
+    unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(TargetReg, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, -Amount));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // It will be expanded to a probe loop in `inlineStackProbe`.
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PROBED_STACKALLOC_RVV))
+      .addReg(SPReg)
+      .addReg(TargetReg);
+
+  if (EmitCFI) {
+    // Set the CFA register back to SP.
+    unsigned Reg = STI.getRegisterInfo()->getDwarfRegNum(SPReg, true);
+    unsigned CFIIndex =
+        MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex)
+        .setMIFlags(MachineInstr::FrameSetup);
+  }
+
+  // SUB SP, SP, T1
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::SUB), SPReg)
+      .addReg(SPReg)
+      .addReg(TargetReg)
+      .setMIFlag(Flag);
+}
+
 static void appendScalableVectorExpression(const TargetRegisterInfo &TRI,
                                            SmallVectorImpl<char> &Expr,
                                            int FixedOffset, int ScalableOffset,
@@ -857,10 +905,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  uint64_t SecondSPAdjustAmount = 0;
   // Emit the second SP adjustment after saving callee saved registers.
   if (FirstSPAdjustAmount) {
-    uint64_t SecondSPAdjustAmount =
-        getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
+    SecondSPAdjustAmount = getStackSizeWithRVVPadding(MF) - FirstSPAdjustAmount;
     assert(SecondSPAdjustAmount > 0 &&
            "SecondSPAdjustAmount should be greater than zero");
 
@@ -870,11 +918,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (RVVStackSize) {
-    // We must keep the stack pointer aligned through any intermediate
-    // updates.
-    RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
-                  StackOffset::getScalable(-RVVStackSize),
-                  MachineInstr::FrameSetup, getStackAlign());
+    if (NeedProbe)
+      allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize,
+                                  MachineInstr::FrameSetup, !hasFP(MF));
+    else
+      // We must keep the stack pointer aligned through any intermediate
+      // updates.
+      RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
+                    StackOffset::getScalable(-RVVStackSize),
+                    MachineInstr::FrameSetup, getStackAlign());
 
     if (!hasFP(MF)) {
       // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb".
@@ -914,6 +966,19 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
             .addImm(ShiftAmount)
             .setMIFlag(MachineInstr::FrameSetup);
       }
+      if (NeedProbe && RVVStackSize == 0) {
+        // Probe once if the combined alignment and allocated size just
+        // crossed the probe size without the area having been probed.
+        if (SecondSPAdjustAmount < ProbeSize &&
+            SecondSPAdjustAmount + MaxAlignment.value() >= ProbeSize) {
+          bool IsRV64 = STI.is64Bit();
+          BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
+              .addReg(RISCV::X0)
+              .addReg(SPReg)
+              .addImm(0)
+              .setMIFlags(MachineInstr::FrameSetup);
+        }
+      }
       // FP will be used to restore the frame in the epilogue, so we need
       // another base register BP to record SP after re-alignment. SP will
       // track the current stack after allocating variable sized objects.
@@ -2016,9 +2081,11 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
 }
 
 // Synthesize the probe loop.
-static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MBBI,
-                                 DebugLoc DL) {
+MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
+    bool IsRVV) const {
+  assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
 
   auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
   const RISCVInstrInfo *TII = Subtarget.getInstrInfo();
@@ -2034,7 +2101,6 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
   MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock());
   MF.insert(MBBInsertPoint, ExitMBB);
   MachineInstr::MIFlag Flags = MachineInstr::FrameSetup;
-  Register TargetReg = RISCV::X6;
   Register ScratchReg = RISCV::X7;
 
   // ScratchReg = ProbeSize
@@ -2055,12 +2121,29 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
       .addImm(0)
       .setMIFlags(Flags);
 
-  //   BNE SP, TargetReg, LoopTest
-  BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
-      .addReg(SPReg)
-      .addReg(TargetReg)
-      .addMBB(LoopTestMBB)
-      .setMIFlags(Flags);
+  if (IsRVV) {
+    //  SUB TargetReg, TargetReg, ProbeSize
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::SUB),
+            TargetReg)
+        .addReg(TargetReg)
+        .addReg(ScratchReg)
+        .setMIFlags(Flags);
+
+    //  BGE TargetReg, ProbeSize, LoopTest
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BGE))
+        .addReg(TargetReg)
+        .addReg(ScratchReg)
+        .addMBB(LoopTestMBB)
+        .setMIFlags(Flags);
+
+  } else {
+    //  BNE SP, TargetReg, LoopTest
+    BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(RISCV::BNE))
+        .addReg(SPReg)
+        .addReg(TargetReg)
+        .addMBB(LoopTestMBB)
+        .setMIFlags(Flags);
+  }
 
   ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end());
 
@@ -2069,16 +2152,33 @@ static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
   MBB.addSuccessor(LoopTestMBB);
   // Update liveins.
   fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
+
+  return ExitMBB;
 }
 
 void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
-  auto Where = llvm::find_if(MBB, [](MachineInstr &MI) {
-    return MI.getOpcode() == RISCV::PROBED_STACKALLOC;
-  });
-  if (Where != MBB.end()) {
-    DebugLoc DL = MBB.findDebugLoc(Where);
-    emitStackProbeInline(MF, MBB, Where, DL);
-    Where->eraseFromParent();
+  // Collect the instructions that need to be replaced; we emit at most two.
+  // Remember them up front because expanding a probe splits the block,
+  // which would otherwise invalidate the traversal.
+  SmallVector<MachineInstr *, 4> ToReplace;
+  for (MachineInstr &MI : MBB) {
+    int Opc = MI.getOpcode();
+    if (Opc == RISCV::PROBED_STACKALLOC ||
+        Opc == RISCV::PROBED_STACKALLOC_RVV) {
+      ToReplace.push_back(&MI);
+    }
+  }
+
+  for (MachineInstr *MI : ToReplace) {
+    if (MI->getOpcode() == RISCV::PROBED_STACKALLOC ||
+        MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV) {
+      MachineBasicBlock::iterator MBBI = MI->getIterator();
+      DebugLoc DL = MBB.findDebugLoc(MBBI);
+      Register TargetReg = MI->getOperand(1).getReg();
+      emitStackProbeInline(MF, MBB, MBBI, DL, TargetReg,
+                           (MI->getOpcode() == RISCV::PROBED_STACKALLOC_RVV));
+      MBBI->eraseFromParent();
+    }
   }
 }
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 190c063d9d3b5d..1a2c6e0302623d 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -83,6 +83,12 @@ class RISCVFrameLowering : public TargetFrameLowering {
                      uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
                      uint64_t ProbeSize) const;
 
+  MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          DebugLoc DL, Register TargetReg,
+                                          bool IsRVV) const;
+
 protected:
   const RISCVSubtarget &STI;
 
@@ -107,6 +113,11 @@ class RISCVFrameLowering : public TargetFrameLowering {
   // Replace a StackProbe stub (if any) with the actual probe code inline
   void inlineStackProbe(MachineFunction &MF,
                         MachineBasicBlock &PrologueMBB) const override;
+  void allocateAndProbeStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   const DebugLoc &DL, int64_t Amount,
+                                   MachineInstr::MIFlag Flag,
+                                   bool EmitCFI) const;
 };
 } // namespace llvm
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 14b571cebe1fec..d77e416a970b2f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1382,6 +1382,10 @@ def PROBED_STACKALLOC : Pseudo<(outs GPR:$sp),
                                (ins GPR:$scratch),
                                []>,
                                Sched<[]>;
+def PROBED_STACKALLOC_RVV : Pseudo<(outs GPR:$sp),
+                               (ins GPR:$scratch),
+                               []>,
+                               Sched<[]>;
 }
 
 /// HI and ADD_LO address nodes.
diff --git a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
index c6a3649c9ba8fe..0052f4b9c041ee 100644
--- a/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/access-fixed-objects-by-rvv.ll
@@ -64,3 +64,49 @@ define <vscale x 1 x i64> @access_fixed_and_vector_objects(ptr %val) {
 
   ret <vscale x 1 x i64> %a
 }
+
+define <vscale x 1 x i64> @probe_fixed_and_vector_objects(ptr %val, <vscale x 1 x i64> %dummy) "probe-stack"="inline-asm" {
+; RV64IV-LABEL: probe_fixed_and_vector_objects:
+; RV64IV:       # %bb.0:
+; RV64IV-NEXT:    addi sp, sp, -528
+; RV64IV-NEXT:    .cfi_def_cfa_offset 528
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    .cfi_def_cfa t1, -8
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB2_1
+; RV64IV-NEXT:  # %bb.2:
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x04, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 528 + 1 * vlenb
+; RV64IV-NEXT:    addi a0, sp, 8
+; RV64IV-NEXT:    vl1re64.v v9, (a0)
+; RV64IV-NEXT:    addi a0, sp, 528
+; RV64IV-NEXT:    vl1re64.v v10, (a0)
+; RV64IV-NEXT:    ld a0, 520(sp)
+; RV64IV-NEXT:    vsetvli zero, a0, e64, m1, tu, ma
+; RV64IV-NEXT:    vadd.vv v8, v9, v10
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 528
+; RV64IV-NEXT:    addi sp, sp, 528
+; RV64IV-NEXT:    .cfi_def_cfa_offset 0
+; RV64IV-NEXT:    ret
+  %local = alloca i64
+  %vector = alloca <vscale x 1 x i64>
+  %array = alloca [64 x i64]
+  %v1 = load <vscale x 1 x i64>, ptr %array
+  %v2 = load <vscale x 1 x i64>, ptr %vector
+  %len = load i64, ptr %local
+
+  %a = call <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64(
+    <vscale x 1 x i64> %dummy,
+    <vscale x 1 x i64> %v1,
+    <vscale x 1 x i64> %v2,
+    i64 %len)
+
+  ret <vscale x 1 x i64> %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
new file mode 100644
index 00000000000000..d7f9ae73eaea54
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-probing-rvv.ll
@@ -0,0 +1,400 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
+; RUN:   | FileCheck %s -check-prefix=RV64IV
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
+; RUN:   | FileCheck %s -check-prefix=RV32IV
+
+; Tests adapted from AArch64.
+
+; Test prolog sequences for stack probing when vectors are involved.
+
+; The space for vector objects needs probing in the general case, because
+; the stack adjustment is only known at run time and may be larger than
+; the probe size.
+
+define void @f_vector(ptr %out) #0 {
+; RV64IV-LABEL: f_vector:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 1
+; RV64IV-NEXT:    .cfi_def_cfa t1, -16
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB0_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB0_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 1
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f_vector:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 1
+; RV32IV-NEXT:    .cfi_def_cfa t1, -16
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB0_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB0_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 1
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 4 vectors of stack space.
+define void @f4_vector(ptr %out) #0 {
+; RV64IV-LABEL: f4_vector:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 3
+; RV64IV-NEXT:    .cfi_def_cfa t1, -64
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB1_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB1_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 3
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f4_vector:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 3
+; RV32IV-NEXT:    .cfi_def_cfa t1, -64
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB1_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB1_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 3
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 16 vectors of stack space.
+; The stack adjustment is a run-time multiple of vlenb, so it may still
+; exceed the probe size and the probe loop is still emitted.
+define void @f16_vector(ptr %out) #0 {
+; RV64IV-LABEL: f16_vector:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 5
+; RV64IV-NEXT:    .cfi_def_cfa t1, -256
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB2_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB2_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 5
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f16_vector:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 5
+; RV32IV-NEXT:    .cfi_def_cfa t1, -256
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB2_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB2_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 5
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; As above, but with 17 vectors of stack space.
+define void @f17_vector(ptr %out) #0 {
+; RV64IV-LABEL: f17_vector:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    li a0, 34
+; RV64IV-NEXT:    mul t1, t1, a0
+; RV64IV-NEXT:    .cfi_def_cfa t1, -272
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB3_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB3_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    li a1, 34
+; RV64IV-NEXT:    mul a0, a0, a1
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f17_vector:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    li a0, 34
+; RV32IV-NEXT:    mul t1, t1, a0
+; RV32IV-NEXT:    .cfi_def_cfa t1, -272
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB3_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB3_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    li a1, 34
+; RV32IV-NEXT:    mul a0, a0, a1
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec1 = alloca <vscale x 4 x float>, align 16
+  %vec2 = alloca <vscale x 4 x float>, align 16
+  %vec3 = alloca <vscale x 4 x float>, align 16
+  %vec4 = alloca <vscale x 4 x float>, align 16
+  %vec5 = alloca <vscale x 4 x float>, align 16
+  %vec6 = alloca <vscale x 4 x float>, align 16
+  %vec7 = alloca <vscale x 4 x float>, align 16
+  %vec8 = alloca <vscale x 4 x float>, align 16
+  %vec9 = alloca <vscale x 4 x float>, align 16
+  %vec10 = alloca <vscale x 4 x float>, align 16
+  %vec11 = alloca <vscale x 4 x float>, align 16
+  %vec12 = alloca <vscale x 4 x float>, align 16
+  %vec13 = alloca <vscale x 4 x float>, align 16
+  %vec14 = alloca <vscale x 4 x float>, align 16
+  %vec15 = alloca <vscale x 4 x float>, align 16
+  %vec16 = alloca <vscale x 4 x float>, align 16
+  %vec17 = alloca <vscale x 4 x float>, align 16
+  ret void
+}
+
+; A vector and a 16-byte fixed size object.
+define void @f1_vector_16_arr(ptr %out) #0 {
+; RV64IV-LABEL: f1_vector_16_arr:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    addi sp, sp, -16
+; RV64IV-NEXT:    .cfi_def_cfa_offset 16
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 1
+; RV64IV-NEXT:    .cfi_def_cfa t1, -16
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB4_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB4_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 1
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 16
+; RV64IV-NEXT:    addi sp, sp, 16
+; RV64IV-NEXT:    .cfi_def_cfa_offset 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f1_vector_16_arr:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    addi sp, sp, -16
+; RV32IV-NEXT:    .cfi_def_cfa_offset 16
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 1
+; RV32IV-NEXT:    .cfi_def_cfa t1, -16
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB4_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB4_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 1
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 16
+; RV32IV-NEXT:    addi sp, sp, 16
+; RV32IV-NEXT:    .cfi_def_cfa_offset 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 4 x float>, align 16
+  %arr = alloca i8, i64 16, align 1
+  ret void
+}
+
+; A large vector object and a large slot, both of which need probing.
+define void @f1_vector_4096_arr(ptr %out) #0 {
+; RV64IV-LABEL: f1_vector_4096_arr:
+; RV64IV:       # %bb.0: # %entry
+; RV64IV-NEXT:    lui a0, 1
+; RV64IV-NEXT:    sub sp, sp, a0
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    .cfi_def_cfa_offset 4096
+; RV64IV-NEXT:    lui a0, 1
+; RV64IV-NEXT:    sub sp, sp, a0
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    .cfi_def_cfa_offset 8192
+; RV64IV-NEXT:    lui a0, 1
+; RV64IV-NEXT:    sub sp, sp, a0
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    .cfi_def_cfa_offset 12288
+; RV64IV-NEXT:    addi sp, sp, -16
+; RV64IV-NEXT:    .cfi_def_cfa_offset 12304
+; RV64IV-NEXT:    csrr t1, vlenb
+; RV64IV-NEXT:    slli t1, t1, 7
+; RV64IV-NEXT:    .cfi_def_cfa t1, -1024
+; RV64IV-NEXT:    lui t2, 1
+; RV64IV-NEXT:  .LBB5_1: # %entry
+; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64IV-NEXT:    sub sp, sp, t2
+; RV64IV-NEXT:    sd zero, 0(sp)
+; RV64IV-NEXT:    sub t1, t1, t2
+; RV64IV-NEXT:    bge t1, t2, .LBB5_1
+; RV64IV-NEXT:  # %bb.2: # %entry
+; RV64IV-NEXT:    .cfi_def_cfa_register sp
+; RV64IV-NEXT:    sub sp, sp, t1
+; RV64IV-NEXT:    .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
+; RV64IV-NEXT:    csrr a0, vlenb
+; RV64IV-NEXT:    slli a0, a0, 7
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa sp, 12304
+; RV64IV-NEXT:    lui a0, 3
+; RV64IV-NEXT:    addiw a0, a0, 16
+; RV64IV-NEXT:    add sp, sp, a0
+; RV64IV-NEXT:    .cfi_def_cfa_offset 0
+; RV64IV-NEXT:    ret
+;
+; RV32IV-LABEL: f1_vector_4096_arr:
+; RV32IV:       # %bb.0: # %entry
+; RV32IV-NEXT:    lui a0, 1
+; RV32IV-NEXT:    sub sp, sp, a0
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    .cfi_def_cfa_offset 4096
+; RV32IV-NEXT:    lui a0, 1
+; RV32IV-NEXT:    sub sp, sp, a0
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    .cfi_def_cfa_offset 8192
+; RV32IV-NEXT:    lui a0, 1
+; RV32IV-NEXT:    sub sp, sp, a0
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    .cfi_def_cfa_offset 12288
+; RV32IV-NEXT:    addi sp, sp, -16
+; RV32IV-NEXT:    .cfi_def_cfa_offset 12304
+; RV32IV-NEXT:    csrr t1, vlenb
+; RV32IV-NEXT:    slli t1, t1, 7
+; RV32IV-NEXT:    .cfi_def_cfa t1, -1024
+; RV32IV-NEXT:    lui t2, 1
+; RV32IV-NEXT:  .LBB5_1: # %entry
+; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32IV-NEXT:    sub sp, sp, t2
+; RV32IV-NEXT:    sw zero, 0(sp)
+; RV32IV-NEXT:    sub t1, t1, t2
+; RV32IV-NEXT:    bge t1, t2, .LBB5_1
+; RV32IV-NEXT:  # %bb.2: # %entry
+; RV32IV-NEXT:    .cfi_def_cfa_register sp
+; RV32IV-NEXT:    sub sp, sp, t1
+; RV32IV-NEXT:    .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
+; RV32IV-NEXT:    csrr a0, vlenb
+; RV32IV-NEXT:    slli a0, a0, 7
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa sp, 12304
+; RV32IV-NEXT:    lui a0, 3
+; RV32IV-NEXT:    addi a0, a0, 16
+; RV32IV-NEXT:    add sp, sp, a0
+; RV32IV-NEXT:    .cfi_def_cfa_offset 0
+; RV32IV-NEXT:    ret
+entry:
+  %vec = alloca <vscale x 256 x float>, align 16
+  %arr = alloca i8, i64 12288, align 1
+  ret void
+}
+
+attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }

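Stack probing is opt-in per function via the "probe-stack"="inline-asm"
attribute exercised by the tests above; a minimal IR sketch (function name
hypothetical):

  define void @probed_rvv_frame() "probe-stack"="inline-asm" {
  entry:
    %vec = alloca <vscale x 4 x float>, align 16
    ret void
  }
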
From 5a54950e292e7458964eb7ae27d83d5f6b9cbbbd Mon Sep 17 00:00:00 2001
From: Raphael Moreira Zinsly <rzinsly at ventanamicro.com>
Date: Wed, 18 Dec 2024 11:50:11 -0300
Subject: [PATCH 2/2] Add align test and fix types

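The new align test covers the probe added at the end of emitPrologue in
PATCH 1: a 4000-byte alloca (1000 x i32) with 1024-byte alignment stays
below the 4096-byte probe size on both counts, but their sum crosses it.
A sketch of the relevant RV64 CHECK lines: the second SP adjustment is
2048 + 1040 = 3088, and 3088 + 1024 >= 4096, so a single probe is emitted.

  addi  sp, sp, -2048    # second SP adjustment, split in two because
  addi  sp, sp, -1040    # addi immediates are limited to 12 bits
  andi  sp, sp, -1024    # realign; may move sp down further
  sd    zero, 0(sp)      # single probe for the size + align crossing
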
---
 llvm/lib/Target/RISCV/RISCVFrameLowering.cpp  | 11 ++-
 llvm/lib/Target/RISCV/RISCVFrameLowering.h    |  6 --
 .../CodeGen/RISCV/stack-clash-prologue.ll     | 68 +++++++++++++++++++
 3 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index 04f841d589ce89..504c7936b32881 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -2081,10 +2081,9 @@ TargetStackID::Value RISCVFrameLowering::getStackIDForScalableVectors() const {
 }
 
 // Synthesize the probe loop.
-MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
-    MachineFunction &MF, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator MBBI, DebugLoc DL, Register TargetReg,
-    bool IsRVV) const {
+static void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                                 Register TargetReg, bool IsRVV) {
   assert(TargetReg != RISCV::X2 && "New top of stack cannot already be in SP");
 
   auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
@@ -2152,8 +2151,6 @@ MachineBasicBlock *RISCVFrameLowering::emitStackProbeInline(
   MBB.addSuccessor(LoopTestMBB);
   // Update liveins.
   fullyRecomputeLiveIns({ExitMBB, LoopTestMBB});
-
-  return ExitMBB;
 }
 
 void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
@@ -2163,7 +2160,7 @@ void RISCVFrameLowering::inlineStackProbe(MachineFunction &MF,
   // which would otherwise invalidate the traversal.
   SmallVector<MachineInstr *, 4> ToReplace;
   for (MachineInstr &MI : MBB) {
-    int Opc = MI.getOpcode();
+    unsigned Opc = MI.getOpcode();
     if (Opc == RISCV::PROBED_STACKALLOC ||
         Opc == RISCV::PROBED_STACKALLOC_RVV) {
       ToReplace.push_back(&MI);
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 1a2c6e0302623d..26d2a26d681c35 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -83,12 +83,6 @@ class RISCVFrameLowering : public TargetFrameLowering {
                      uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
                      uint64_t ProbeSize) const;
 
-  MachineBasicBlock *emitStackProbeInline(MachineFunction &MF,
-                                          MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator MBBI,
-                                          DebugLoc DL, Register TargetReg,
-                                          bool IsRVV) const;
-
 protected:
   const RISCVSubtarget &STI;
 
diff --git a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
index 18af080e86747b..843e57a42d926d 100644
--- a/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
+++ b/llvm/test/CodeGen/RISCV/stack-clash-prologue.ll
@@ -538,4 +538,72 @@ define i32 @f9(i64 %i) local_unnamed_addr #0 {
   ret i32 %c
 }
 
+; alloca < probe_size, align < probe_size, alloca + align > probe_size
+define i32 @f10(i64 %i) local_unnamed_addr #0 {
+; RV64I-LABEL: f10:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -2032
+; RV64I-NEXT:    .cfi_def_cfa_offset 2032
+; RV64I-NEXT:    sd ra, 2024(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s0, 2016(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    .cfi_offset ra, -8
+; RV64I-NEXT:    .cfi_offset s0, -16
+; RV64I-NEXT:    addi s0, sp, 2032
+; RV64I-NEXT:    .cfi_def_cfa s0, 0
+; RV64I-NEXT:    addi sp, sp, -2048
+; RV64I-NEXT:    addi sp, sp, -1040
+; RV64I-NEXT:    andi sp, sp, -1024
+; RV64I-NEXT:    sd zero, 0(sp)
+; RV64I-NEXT:    slli a0, a0, 2
+; RV64I-NEXT:    addi a1, sp, 1024
+; RV64I-NEXT:    add a0, a1, a0
+; RV64I-NEXT:    li a1, 1
+; RV64I-NEXT:    sw a1, 0(a0)
+; RV64I-NEXT:    lw a0, 1024(sp)
+; RV64I-NEXT:    addi sp, s0, -2032
+; RV64I-NEXT:    .cfi_def_cfa sp, 2032
+; RV64I-NEXT:    ld ra, 2024(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s0, 2016(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    .cfi_restore ra
+; RV64I-NEXT:    .cfi_restore s0
+; RV64I-NEXT:    addi sp, sp, 2032
+; RV64I-NEXT:    .cfi_def_cfa_offset 0
+; RV64I-NEXT:    ret
+;
+; RV32I-LABEL: f10:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -2032
+; RV32I-NEXT:    .cfi_def_cfa_offset 2032
+; RV32I-NEXT:    sw ra, 2028(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    sw s0, 2024(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    .cfi_offset ra, -4
+; RV32I-NEXT:    .cfi_offset s0, -8
+; RV32I-NEXT:    addi s0, sp, 2032
+; RV32I-NEXT:    .cfi_def_cfa s0, 0
+; RV32I-NEXT:    addi sp, sp, -2048
+; RV32I-NEXT:    addi sp, sp, -1040
+; RV32I-NEXT:    andi sp, sp, -1024
+; RV32I-NEXT:    sw zero, 0(sp)
+; RV32I-NEXT:    slli a0, a0, 2
+; RV32I-NEXT:    addi a1, sp, 1024
+; RV32I-NEXT:    add a0, a1, a0
+; RV32I-NEXT:    li a1, 1
+; RV32I-NEXT:    sw a1, 0(a0)
+; RV32I-NEXT:    lw a0, 1024(sp)
+; RV32I-NEXT:    addi sp, s0, -2032
+; RV32I-NEXT:    .cfi_def_cfa sp, 2032
+; RV32I-NEXT:    lw ra, 2028(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    lw s0, 2024(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    .cfi_restore ra
+; RV32I-NEXT:    .cfi_restore s0
+; RV32I-NEXT:    addi sp, sp, 2032
+; RV32I-NEXT:    .cfi_def_cfa_offset 0
+; RV32I-NEXT:    ret
+  %a = alloca i32, i32 1000, align 1024
+  %b = getelementptr inbounds i32, ptr %a, i64 %i
+  store volatile i32 1, ptr %b
+  %c = load volatile i32, ptr %a
+  ret i32 %c
+}
+
 attributes #0 = { "probe-stack"="inline-asm" }


