[llvm] Reland [AMDGPU] Support block load/store for CSR #130013 (PR #137169)
Diana Picus via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 25 02:22:54 PDT 2025
https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/137169
>From b8c2867d50411a91192170dc527fa7c24cbd2667 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Thu, 24 Apr 2025 13:50:17 +0200
Subject: [PATCH 1/5] Reapply "[AMDGPU] Support block load/store for CSR"
(#136846)
This reverts commit 6bb2f90557fb2b4b216299cc2beb4afb641476aa.
---
llvm/include/llvm/CodeGen/MachineFrameInfo.h | 1 +
.../llvm/CodeGen/TargetFrameLowering.h | 17 +
llvm/lib/CodeGen/PrologEpilogInserter.cpp | 35 +--
llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp | 35 +++
llvm/lib/Target/AMDGPU/AMDGPU.td | 8 +
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 37 +++
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 204 ++++++++++++
llvm/lib/Target/AMDGPU/SIFrameLowering.h | 17 +
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 14 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 21 +-
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 32 ++
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 71 ++++-
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 16 +
.../AMDGPU/pei-vgpr-block-spill-csr.mir | 294 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll | 93 ++++++
.../CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir | 47 +++
llvm/unittests/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp | 160 ++++++++++
19 files changed, 1065 insertions(+), 41 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir
create mode 100644 llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp
diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
index 172c3e8c9a847..9d1b536d23331 100644
--- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h
@@ -61,6 +61,7 @@ class CalleeSavedInfo {
MCRegister getReg() const { return Reg; }
int getFrameIdx() const { return FrameIdx; }
MCRegister getDstReg() const { return DstReg; }
+ void setReg(MCRegister R) { Reg = R; }
void setFrameIdx(int FI) {
FrameIdx = FI;
SpilledToReg = false;
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index cdbefb36c00c7..58b63f1769003 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -270,6 +270,14 @@ class TargetFrameLowering {
return false;
}
+ /// spillCalleeSavedRegister - Default implementation for spilling a single
+ /// callee saved register.
+ void spillCalleeSavedRegister(MachineBasicBlock &SaveBlock,
+ MachineBasicBlock::iterator MI,
+ const CalleeSavedInfo &CS,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) const;
+
/// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee
/// saved registers and returns true if it isn't possible / profitable to do
/// so by issuing a series of load instructions via loadRegToStackSlot().
@@ -284,6 +292,15 @@ class TargetFrameLowering {
return false;
}
+ /// restoreCalleeSavedRegister - Default implementation for restoring a single
+ /// callee saved register. Should be called in reverse order. Can insert
+ /// multiple instructions.
+ void restoreCalleeSavedRegister(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const CalleeSavedInfo &CS,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) const;
+
/// hasFP - Return true if the specified function should have a dedicated
/// frame pointer register. For most targets this is true only if the function
/// has variable sized allocas or if frame pointer elimination is disabled.
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index ac4090252cea0..0cd25c4feb8b9 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -476,8 +476,8 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
// Now that we know which registers need to be saved and restored, allocate
// stack slots for them.
for (auto &CS : CSI) {
- // If the target has spilled this register to another register, we don't
- // need to allocate a stack slot.
+ // If the target has spilled this register to another register or already
+ // handled it, we don't need to allocate a stack slot.
if (CS.isSpilledToReg())
continue;
@@ -597,25 +597,14 @@ static void updateLiveness(MachineFunction &MF) {
static void insertCSRSaves(MachineBasicBlock &SaveBlock,
ArrayRef<CalleeSavedInfo> CSI) {
MachineFunction &MF = *SaveBlock.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
MachineBasicBlock::iterator I = SaveBlock.begin();
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CS : CSI) {
- // Insert the spill to the stack frame.
- MCRegister Reg = CS.getReg();
-
- if (CS.isSpilledToReg()) {
- BuildMI(SaveBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY),
- CS.getDstReg())
- .addReg(Reg, getKillRegState(true));
- } else {
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
- TRI, Register());
- }
+ TFI->spillCalleeSavedRegister(SaveBlock, I, CS, TII, TRI);
}
}
}
@@ -624,7 +613,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
std::vector<CalleeSavedInfo> &CSI) {
MachineFunction &MF = *RestoreBlock.getParent();
- const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
@@ -634,19 +623,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
- MCRegister Reg = CI.getReg();
- if (CI.isSpilledToReg()) {
- BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)
- .addReg(CI.getDstReg(), getKillRegState(true));
- } else {
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC,
- TRI, Register());
- assert(I != RestoreBlock.begin() &&
- "loadRegFromStackSlot didn't insert any code!");
- // Insert in reverse order. loadRegFromStackSlot can insert
- // multiple instructions.
- }
+ TFI->restoreCalleeSavedRegister(RestoreBlock, I, CI, TII, TRI);
}
}
}
diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index be73b73c93989..70c3b2cbae9a6 100644
--- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -15,6 +15,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
@@ -182,3 +183,37 @@ TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF).id()}};
}
+
+/// Default spill of a single callee-saved register described by \p CS,
+/// inserted before \p MI in \p SaveBlock. A register that was spilled to
+/// another register is saved with a COPY; any other register is stored to the
+/// frame index recorded in \p CS. The saved register is marked as killed.
+void TargetFrameLowering::spillCalleeSavedRegister(
+    MachineBasicBlock &SaveBlock, MachineBasicBlock::iterator MI,
+    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
+    const TargetRegisterInfo *TRI) const {
+  const MCRegister SavedReg = CS.getReg();
+
+  if (!CS.isSpilledToReg()) {
+    // Insert the spill to the stack frame.
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(SavedReg);
+    TII->storeRegToStackSlot(SaveBlock, MI, SavedReg, true, CS.getFrameIdx(),
+                             RC, TRI, Register());
+    return;
+  }
+
+  // The value is kept in the destination register instead of on the stack.
+  BuildMI(SaveBlock, MI, DebugLoc(), TII->get(TargetOpcode::COPY),
+          CS.getDstReg())
+      .addReg(SavedReg, getKillRegState(true));
+}
+
+/// Default restore of a single callee-saved register described by \p CS,
+/// inserted before \p MI in \p MBB. A register that was spilled to another
+/// register is restored with a COPY from that register; any other register is
+/// reloaded from the frame index recorded in \p CS.
+void TargetFrameLowering::restoreCalleeSavedRegister(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const CalleeSavedInfo &CS, const TargetInstrInfo *TII,
+    const TargetRegisterInfo *TRI) const {
+  const MCRegister RestoredReg = CS.getReg();
+
+  if (!CS.isSpilledToReg()) {
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(RestoredReg);
+    TII->loadRegFromStackSlot(MBB, MI, RestoredReg, CS.getFrameIdx(), RC, TRI,
+                              Register());
+    // The reload must have placed at least one instruction before MI.
+    assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
+    return;
+  }
+
+  // Copy the value back from the register it was parked in.
+  BuildMI(MBB, MI, DebugLoc(), TII->get(TargetOpcode::COPY), RestoredReg)
+      .addReg(CS.getDstReg(), getKillRegState(true));
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index b2098b41acb7e..d896589825fc7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1275,6 +1275,14 @@ def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32
"Use a block size of 32 for dynamic VGPR allocation (default is 16)"
>;
+// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and
+// restoring the callee-saved registers.
+def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr",
+ "UseBlockVGPROpsForCSR",
+ "true",
+ "Use block load/store for VGPR callee saved registers"
+>;
+
def FeatureLshlAddU64Inst
: SubtargetFeature<"lshl-add-u64-inst", "HasLshlAddU64Inst", "true",
"Has v_lshl_add_u64 instruction">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 3d6b974d1f027..2dec16de940d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Constants.h"
@@ -243,6 +244,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV,
return AsmPrinter::lowerConstant(CV, BaseCV, Offset);
}
+/// Emit a raw assembly comment listing the VGPRs that the block load/store
+/// \p MI may transfer, based on the mask recorded for its register block in
+/// \p MFI. Nothing is emitted when the recorded mask is 0.
+static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
+                                 const TargetRegisterInfo *TRI,
+                                 const SIMachineFunctionInfo *MFI,
+                                 MCStreamer &OS) {
+  // The instruction will only transfer a subset of the registers in the block,
+  // based on the mask that is stored in m0. We could search for the instruction
+  // that sets m0, but most of the time we'll already have the mask stored in
+  // the machine function info. Try to use that. This assumes that we only use
+  // block loads/stores for CSR spills.
+  Register RegBlock =
+      TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst
+                                              : AMDGPU::OpName::vdata)
+          ->getReg();
+  Register FirstRegInBlock = TRI->getSubReg(RegBlock, AMDGPU::sub0);
+  uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock);
+
+  if (!Mask)
+    return; // Nothing to report
+
+  SmallString<512> TransferredRegs;
+  for (unsigned I = 0; I < sizeof(Mask) * 8; ++I) {
+    // Test the bit with an unsigned literal: `1 << 31` would shift a signed
+    // int into its sign bit.
+    if (Mask & (1u << I)) {
+      (llvm::Twine(" ") + TRI->getRegAsmName(FirstRegInBlock + I))
+          .toVector(TransferredRegs);
+    }
+  }
+
+  OS.emitRawComment(" transferring at most " + TransferredRegs);
+}
+
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// FIXME: Enable feature predicate checks once all the test pass.
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
@@ -331,6 +362,12 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
+ if (isVerbose())
+ if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
+ emitVGPRBlockComment(MI, STI.getInstrInfo(), STI.getRegisterInfo(),
+ MF->getInfo<SIMachineFunctionInfo>(),
+ *OutStreamer);
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7dd91c0775a48..fea17baa17722 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -262,6 +262,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasPointSampleAccel = false;
bool RequiresCOV6 = false;
+ bool UseBlockVGPROpsForCSR = false;
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
@@ -1277,6 +1278,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool requiresCodeObjectV6() const { return RequiresCOV6; }
+ bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
+
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9c737b4f3e378..0c1cd9ceddb02 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1739,6 +1739,105 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
}
}
+/// Fold the callee-saved VGPRs in \p CSI into register blocks that can be
+/// saved/restored with a single masked block operation.
+///
+/// For each eligible VGPR (not spilled to a register, base class VGPR_32, not
+/// WWM-reserved) every later eligible entry within the next 32 registers is
+/// merged into it: the merged entries are removed from \p CSI, the surviving
+/// entry is rewritten to the enclosing block register, a mask describing the
+/// covered registers is recorded in the function info, and a spill slot is
+/// created for the block. \p MinCSFrameIndex / \p MaxCSFrameIndex are widened
+/// to include the new frame indices. Ineligible entries are left untouched.
+static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
+                                       const GCNSubtarget &ST,
+                                       std::vector<CalleeSavedInfo> &CSI,
+                                       unsigned &MinCSFrameIndex,
+                                       unsigned &MaxCSFrameIndex) {
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  // The block register class does not depend on the register being processed,
+  // so look it up once.
+  const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
+
+  assert(std::is_sorted(CSI.begin(), CSI.end(),
+                        [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
+                          return A.getReg() < B.getReg();
+                        }) &&
+         "Callee saved registers not sorted");
+
+  auto CanUseBlockOps = [&](const CalleeSavedInfo &Info) {
+    return !Info.isSpilledToReg() &&
+           TRI->getPhysRegBaseClass(Info.getReg()) ==
+               &AMDGPU::VGPR_32RegClass &&
+           !FuncInfo->isWWMReservedRegister(Info.getReg());
+  };
+
+  auto CSEnd = CSI.end();
+  for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
+    Register Reg = CSIt->getReg();
+    if (!CanUseBlockOps(*CSIt))
+      continue;
+
+    // Find all the regs that will fit in a 32-bit mask starting at the current
+    // reg and build said mask. It should have 1 for every register that's
+    // included, with the current register as the least significant bit.
+    uint32_t Mask = 1;
+    CSEnd = std::remove_if(
+        CSIt + 1, CSEnd, [&](const CalleeSavedInfo &Other) -> bool {
+          if (CanUseBlockOps(Other) && Other.getReg() < Reg + 32) {
+            // Unsigned literal: the offset can be 31, and `1 << 31` would
+            // shift a signed int into its sign bit.
+            Mask |= 1u << (Other.getReg() - Reg);
+            return true;
+          }
+          return false;
+        });
+
+    Register RegBlock =
+        TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
+    if (!RegBlock) {
+      // We couldn't find a super register for the block. This can happen if
+      // the register we started with is too high (e.g. v232 if the maximum is
+      // v255). We therefore try to get the last register block and figure out
+      // the mask from there.
+      Register LastBlockStart =
+          AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
+      RegBlock =
+          TRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass);
+      assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
+             "Couldn't find super register");
+      int RegDelta = Reg - LastBlockStart;
+      assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
+             "Bad shift amount");
+      Mask <<= RegDelta;
+    }
+
+    FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);
+
+    // The stack objects can be a bit smaller than the register block if we know
+    // some of the high bits of Mask are 0. This may happen often with calling
+    // conventions where the caller and callee-saved VGPRs are interleaved at
+    // a small boundary (e.g. 8 or 16).
+    int UnusedBits = llvm::countl_zero(Mask);
+    unsigned BlockSize = TRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
+    int FrameIdx =
+        MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass),
+                              /*isSpillSlot=*/true);
+    const unsigned UnsignedFrameIdx = static_cast<unsigned>(FrameIdx);
+    MinCSFrameIndex = std::min(MinCSFrameIndex, UnsignedFrameIdx);
+    MaxCSFrameIndex = std::max(MaxCSFrameIndex, UnsignedFrameIdx);
+
+    CSIt->setFrameIdx(FrameIdx);
+    CSIt->setReg(RegBlock);
+  }
+  // Drop the entries that were merged into a block above.
+  CSI.erase(CSEnd, CSI.end());
+}
+
+/// Assign spill slots for the callee-saved registers in \p CSI. When the
+/// subtarget uses VGPR block ops for CSRs, eligible VGPRs are first grouped
+/// into blocks; the three-argument overload is then run on \p CSI in either
+/// case. \returns true if \p CSI is empty, if that overload returns true, or
+/// if block ops are enabled.
+bool SIFrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
+    unsigned &MaxCSFrameIndex) const {
+  // Early exit if no callee saved registers are modified!
+  if (CSI.empty())
+    return true;
+
+  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+  const bool BlockOpsEnabled = Subtarget.useVGPRBlockOpsForCSR();
+  if (BlockOpsEnabled)
+    assignSlotsUsingVGPRBlocks(MF, Subtarget, CSI, MinCSFrameIndex,
+                               MaxCSFrameIndex);
+
+  // The three-argument overload runs regardless of the block-ops setting.
+  const bool HandledByOverload = assignCalleeSavedSpillSlots(MF, TRI, CSI);
+  return HandledByOverload || BlockOpsEnabled;
+}
+
bool SIFrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
@@ -1808,6 +1907,111 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
return true;
}
+/// Spill the callee-saved registers in \p CSI before \p MI. Register blocks
+/// that have a recorded mask are saved with one SI_BLOCK_SPILL_V1024_SAVE
+/// pseudo each; every other entry goes through spillCalleeSavedRegister.
+/// \returns false, without inserting anything, when the subtarget does not use
+/// VGPR block ops for CSRs; true otherwise.
+bool SIFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  if (!ST.useVGPRBlockOpsForCSR())
+    return false;
+
+  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
+
+  // MI is only an insertion point and may be MBB.end(), which must not be
+  // dereferenced. All insertions happen before MI, so the location can be
+  // computed once.
+  const DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+
+  const TargetRegisterClass *BlockRegClass =
+      static_cast<const SIRegisterInfo *>(TRI)->getRegClassForBlockOp(*MF);
+  for (const CalleeSavedInfo &CS : CSI) {
+    Register Reg = CS.getReg();
+    if (!BlockRegClass->contains(Reg) ||
+        !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
+      // Not a register block: use the default single-register spill.
+      spillCalleeSavedRegister(MBB, MI, CS, TII, TRI);
+      continue;
+    }
+
+    // Build a scratch block store.
+    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
+    int FrameIndex = CS.getFrameIdx();
+    MachinePointerInfo PtrInfo =
+        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
+    MachineMemOperand *MMO =
+        MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+                                 FrameInfo.getObjectSize(FrameIndex),
+                                 FrameInfo.getObjectAlign(FrameIndex));
+
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
+        .addReg(Reg, getKillRegState(false))
+        .addFrameIndex(FrameIndex)
+        .addReg(FuncInfo->getStackPtrOffsetReg())
+        .addImm(0)
+        .addImm(Mask)
+        .addMemOperand(MMO);
+
+    FuncInfo->setHasSpilledVGPRs();
+
+    // Add the register to the liveins. This is necessary because if any of the
+    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
+    // then the whole block will be marked as reserved and `updateLiveness` will
+    // skip it.
+    MBB.addLiveIn(Reg);
+  }
+  MBB.sortUniqueLiveIns();
+
+  return true;
+}
+
+/// Restore the callee-saved registers in \p CSI before \p MI, walking \p CSI
+/// in reverse. Register blocks that have a recorded mask are reloaded with one
+/// SI_BLOCK_SPILL_V1024_RESTORE pseudo each; every other entry goes through
+/// restoreCalleeSavedRegister. \returns false, without inserting anything,
+/// when the subtarget does not use VGPR block ops for CSRs; true otherwise.
+bool SIFrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+  MachineFunction *MF = MBB.getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  if (!ST.useVGPRBlockOpsForCSR())
+    return false;
+
+  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &MFI = MF->getFrameInfo();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);
+  const TargetRegisterClass *BlockRegClass = SITRI->getRegClassForBlockOp(*MF);
+
+  // MI is only an insertion point and may be MBB.end(), which must not be
+  // dereferenced. All insertions happen before MI, so the location can be
+  // computed once.
+  const DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc();
+
+  for (const CalleeSavedInfo &CS : reverse(CSI)) {
+    Register Reg = CS.getReg();
+    if (!BlockRegClass->contains(Reg) ||
+        !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
+      // Not a register block: use the default single-register restore.
+      restoreCalleeSavedRegister(MBB, MI, CS, TII, TRI);
+      continue;
+    }
+
+    // Build a scratch block load.
+    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
+    int FrameIndex = CS.getFrameIdx();
+    MachinePointerInfo PtrInfo =
+        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
+    MachineMemOperand *MMO = MF->getMachineMemOperand(
+        PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+        MFI.getObjectAlign(FrameIndex));
+
+    auto MIB =
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE),
+                Reg)
+            .addFrameIndex(FrameIndex)
+            .addReg(FuncInfo->getStackPtrOffsetReg())
+            .addImm(0)
+            .addImm(Mask)
+            .addMemOperand(MMO);
+    SITRI->addImplicitUsesForBlockCSRLoad(MIB, Reg);
+
+    // Add the register to the liveins. This is necessary because if any of the
+    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
+    // then the whole block will be marked as reserved and `updateLiveness` will
+    // skip it.
+    MBB.addLiveIn(Reg);
+  }
+
+  MBB.sortUniqueLiveIns();
+  return true;
+}
+
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 9dac4bc8951e5..a72772987262e 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -49,6 +49,23 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const override;
+ bool assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI,
+ unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex) const override;
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+
bool allocateScavengingFrameIndexesNearIncomingSP(
const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index a3a54659d299a..4b97f58ce92b9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -665,6 +665,20 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
}
+  /// \returns true if \p Opcode is one of the VGPR block spill pseudos or one
+  /// of the SCRATCH_{LOAD,STORE}_BLOCK instructions (SADDR or SVS form).
+  static bool isBlockLoadStore(uint16_t Opcode) {
+    const bool IsBlockSpillPseudo =
+        Opcode == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE ||
+        Opcode == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE;
+    const bool IsScratchBlockSAddr =
+        Opcode == AMDGPU::SCRATCH_STORE_BLOCK_SADDR ||
+        Opcode == AMDGPU::SCRATCH_LOAD_BLOCK_SADDR;
+    const bool IsScratchBlockSVS =
+        Opcode == AMDGPU::SCRATCH_STORE_BLOCK_SVS ||
+        Opcode == AMDGPU::SCRATCH_LOAD_BLOCK_SVS;
+    return IsBlockSpillPseudo || IsScratchBlockSAddr || IsScratchBlockSVS;
+  }
+
+
static bool isEXP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::EXP;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ed45cf8851146..a144ae2104da6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1025,13 +1025,16 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
-multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
+multiclass SI_SPILL_VGPR <RegisterClass vgpr_class,
+ bit UsesTmp = 0, bit HasMask = 0> {
let UseNamedOperandTable = 1, Spill = 1, VALU = 1,
SchedRW = [WriteVMEM] in {
def _SAVE : VPseudoInstSI <
(outs),
- (ins vgpr_class:$vdata, i32imm:$vaddr,
- SReg_32:$soffset, i32imm:$offset)> {
+ !con(
+ (ins vgpr_class:$vdata, i32imm:$vaddr,
+ SReg_32:$soffset, i32imm:$offset),
+ !if(HasMask, (ins SReg_32:$mask), (ins)))> {
let mayStore = 1;
let mayLoad = 0;
// (2 * 4) + (8 * num_subregs) bytes maximum
@@ -1042,8 +1045,10 @@ multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
def _RESTORE : VPseudoInstSI <
(outs vgpr_class:$vdata),
- (ins i32imm:$vaddr,
- SReg_32:$soffset, i32imm:$offset)> {
+ !con(
+ (ins i32imm:$vaddr,
+ SReg_32:$soffset, i32imm:$offset),
+ !if(HasMask, (ins SReg_32:$mask), (ins)))> {
let mayStore = 0;
let mayLoad = 1;
@@ -1071,6 +1076,12 @@ defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
+let Defs = [M0] in {
+ // Spills a block of 32 VGPRs. M0 will contain a mask describing which
+ // registers in the block need to be transferred.
+ defm SI_BLOCK_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024, 0, 1>;
+}
+
defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index a60409b5a7e09..9c1014a0e5cfe 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -388,6 +388,16 @@ class PrologEpilogSGPRSaveRestoreInfo {
SGPRSaveKind getKind() const { return Kind; }
};
+/// Register 0 of the VReg_1024 register class; VGPR block indices are computed
+/// relative to it.
+const MCRegister FirstVGPRBlock = AMDGPU::VReg_1024RegClass.getRegister(0);
+
+/// Index functor for maps keyed by VGPR block: converts a VReg_1024 register
+/// into its offset from FirstVGPRBlock.
+struct VGPRBlock2IndexFunctor {
+  using argument_type = Register;
+  unsigned operator()(Register Block) const {
+    assert(AMDGPU::VReg_1024RegClass.contains(Block) &&
+           "Expecting a VGPR block");
+    const unsigned BlockIndex = Block - FirstVGPRBlock;
+    return BlockIndex;
+  }
+};
+
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
@@ -574,6 +584,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
// frame, so save it here and add it to the RegScavenger later.
std::optional<int> ScavengeFI;
+ // Map each VGPR CSR to the mask needed to save and restore it using block
+ // load/store instructions. Only used if the subtarget feature for VGPR block
+ // load/store is enabled.
+ IndexedMap<uint32_t, VGPRBlock2IndexFunctor> MaskForVGPRBlockOps;
+
private:
Register VGPRForAGPRCopy;
@@ -594,6 +609,19 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) const;
+  /// Record \p Mask as the block load/store mask for \p RegisterBlock, growing
+  /// the map so that the block's index is valid.
+  void setMaskForVGPRBlockOps(Register RegisterBlock, uint32_t Mask) {
+    MaskForVGPRBlockOps.grow(RegisterBlock);
+    MaskForVGPRBlockOps[RegisterBlock] = Mask;
+  }
+
+  /// \returns the mask stored for \p RegisterBlock, or 0 when the block lies
+  /// outside the range the map has been grown to. Callers that treat 0 as
+  /// "no mask recorded" can therefore query any block safely.
+  uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const {
+    // Indexing the map outside its grown range is invalid, so guard the lookup
+    // instead of relying on every caller to check first.
+    if (!MaskForVGPRBlockOps.inBounds(RegisterBlock))
+      return 0;
+    return MaskForVGPRBlockOps[RegisterBlock];
+  }
+
+  /// \returns true if \p RegisterBlock is within the range the map has been
+  /// grown to.
+  /// NOTE(review): this is a bounds check only, so it is presumably also true
+  /// for blocks below the highest one set that never got a mask of their own -
+  /// confirm against IndexedMap::grow.
+  bool hasMaskForVGPRBlockOps(Register RegisterBlock) const {
+    return MaskForVGPRBlockOps.inBounds(RegisterBlock);
+  }
+
+
public:
SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI) = default;
SIMachineFunctionInfo(const Function &F, const GCNSubtarget *STI);
@@ -634,6 +662,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
+ bool isWWMReservedRegister(Register Reg) const {
+ return WWMReservedRegs.contains(Reg);
+ }
+
ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const {
assert(is_sorted(PrologEpilogSGPRSpills, llvm::less_first()));
return PrologEpilogSGPRSpills;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7838fd91a94da..0e4cd12e57d77 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1177,9 +1177,18 @@ SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
return RC;
}
-static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI,
+ const SIInstrInfo *TII) {
+ unsigned Op = MI.getOpcode();
switch (Op) {
+ case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
+ case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
+ // FIXME: This assumes the mask is statically known and not computed at
+ // runtime. However, some ABIs may want to compute the mask dynamically and
+ // this will need to be updated.
+ return llvm::popcount(
+ (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_V1024_SAVE:
@@ -1520,6 +1529,10 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
bool UseST =
!HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
+ // Handle block load/store first.
+ if (TII->isBlockLoadStore(LoadStoreOp))
+ return LoadStoreOp;
+
switch (EltSize) {
case 4:
LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
@@ -1564,6 +1577,7 @@ void SIRegisterInfo::buildSpillLoadStore(
const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
bool IsStore = Desc->mayStore();
bool IsFlat = TII->isFLATScratch(LoadStoreOp);
+ bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
bool CanClobberSCC = false;
bool Scavenged = false;
@@ -1576,7 +1590,10 @@ void SIRegisterInfo::buildSpillLoadStore(
// Always use 4 byte operations for AGPRs because we need to scavenge
// a temporary VGPR.
- unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
+ // If we're using a block operation, the element should be the whole block.
+ unsigned EltSize = IsBlock ? RegWidth
+ : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
+ : 4u;
unsigned NumSubRegs = RegWidth / EltSize;
unsigned Size = NumSubRegs * EltSize;
unsigned RemSize = RegWidth - Size;
@@ -1731,6 +1748,7 @@ void SIRegisterInfo::buildSpillLoadStore(
LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
} else {
assert(ST.hasFlatScratchSTMode());
+ assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
}
@@ -1939,6 +1957,14 @@ void SIRegisterInfo::buildSpillLoadStore(
MIB.addReg(SubReg, RegState::Implicit);
MIB->tieOperands(0, MIB->getNumOperands() - 1);
}
+
+ // If we're building a block load, we should add artificial uses for the
+ // CSR VGPRs that are *not* being transferred. This is because liveness
+ // analysis is not aware of the mask, so we need to somehow inform it that
+ // those registers are not available before the load and they should not be
+ // scavenged.
+ if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
+ addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
}
if (ScratchOffsetRegDelta != 0) {
@@ -1949,6 +1975,18 @@ void SIRegisterInfo::buildSpillLoadStore(
}
}
+/// Add implicit uses to the block load \p MIB for the callee-saved VGPRs of
+/// \p BlockReg whose bit is clear in the mask recorded for that block, i.e.
+/// the registers the load does not transfer. Only offsets 1..31 within the
+/// block are considered.
+/// NOTE(review): offset 0 is never examined, presumably because bit 0 is
+/// expected to be set - confirm for masks that were shifted.
+void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
+                                                    Register BlockReg) const {
+  const MachineFunction *MF = MIB->getParent()->getParent();
+  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
+  uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
+  Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
+  for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset) {
+    // Unsigned literal: RegOffset reaches 31, and `1 << 31` would shift a
+    // signed int into its sign bit.
+    const bool IsTransferred = (Mask & (1u << RegOffset)) != 0;
+    if (!IsTransferred && isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
+      MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
+  }
+}
+
+
void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
int Offset, bool IsLoad,
bool IsKill) const {
@@ -2367,6 +2405,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
// VGPR register spill
+ case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
+ // Put mask into M0.
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+ AMDGPU::M0)
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
+ LLVM_FALLTHROUGH;
+ }
case AMDGPU::SI_SPILL_V1024_SAVE:
case AMDGPU::SI_SPILL_V512_SAVE:
case AMDGPU::SI_SPILL_V384_SAVE:
@@ -2427,8 +2472,10 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
} else {
- Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
- : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
+ Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
+ ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
+ : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
+ : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
}
auto *MBB = MI->getParent();
@@ -2441,13 +2488,20 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
*MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(), RS);
- MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+ MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(*MI, TII));
if (IsWWMRegSpill)
TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
MI->eraseFromParent();
return true;
}
+ case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
+ // Put mask into M0.
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+ AMDGPU::M0)
+ .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
+ LLVM_FALLTHROUGH;
+ }
case AMDGPU::SI_SPILL_V16_RESTORE:
case AMDGPU::SI_SPILL_V32_RESTORE:
case AMDGPU::SI_SPILL_V64_RESTORE:
@@ -2503,9 +2557,12 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
} else {
- Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
- : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
+ Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
+ ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
+ : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
+ : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
}
+
auto *MBB = MI->getParent();
bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
if (IsWWMRegSpill) {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index f3068963fd10f..a4b135d5e0b59 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -26,6 +26,7 @@ namespace llvm {
class GCNSubtarget;
class LiveIntervals;
class LiveRegUnits;
+class MachineInstrBuilder;
class RegisterBank;
struct SGPRSpillBuilder;
@@ -115,6 +116,16 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
return 100;
}
+ // When building a block VGPR load, we only really transfer a subset of the
+ // registers in the block, based on a mask. Liveness analysis is not aware of
+ // the mask, so it might consider that any register in the block is available
+ // before the load and may therefore be scavenged. This is not ok for CSRs
+ // that are not clobbered, since the caller will expect them to be preserved.
+ // This method will add artificial implicit uses for those registers on the
+ // load instruction, so liveness analysis knows they're unavailable.
+ void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
+ Register BlockReg) const;
+
const TargetRegisterClass *
getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const override;
@@ -158,6 +169,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
const TargetRegisterClass *
getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+  /// Return the register class used for the operand of VGPR block operations.
+  /// This is always the VReg_1024 class; \p MF is not consulted.
+  const TargetRegisterClass *
+  getRegClassForBlockOp(const MachineFunction &MF) const {
+    const TargetRegisterClass &BlockRC = AMDGPU::VReg_1024RegClass;
+    return &BlockRC;
+  }
+
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset,
bool IsLoad, bool IsKill = true) const;
diff --git a/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir b/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir
new file mode 100644
index 0000000000000..086390f575fbb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pei-vgpr-block-spill-csr.mir
@@ -0,0 +1,294 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+block-vgpr-csr,+wavefrontsize32,-wavefrontsize64 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,W32
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+block-vgpr-csr,-wavefrontsize32,+wavefrontsize64 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,W64
+
+--- |
+ define void @one_block() { ret void }
+ define void @one_block_csr_only() { ret void }
+ define void @multiple_blocks() { ret void }
+ define void @reg_tuples() { ret void }
+ define void @locals() { ret void }
+ define void @other_regs() { ret void }
+ define amdgpu_kernel void @entry_func() { ret void }
+ define void @multiple_basic_blocks() { ret void }
+...
+
+# Block load/store v42 and v45. The mask should be 0x9.
+
+---
+name: one_block
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31
+ ; CHECK-LABEL: name: one_block
+ ; CHECK: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $m0 = S_MOV_B32 9
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
+ ; CHECK-NEXT: $m0 = S_MOV_B32 9
+ ; CHECK-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
+
+# Block load/store v40-47 and v56-63 (v48-55 and v64-71 are caller-saved). The
+# mask should be 0x00FF00FF.
+
+---
+name: one_block_csr_only
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31
+ ; CHECK-LABEL: name: one_block_csr_only
+ ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $m0 = S_MOV_B32 16711935
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66
+ ; CHECK-NEXT: $m0 = S_MOV_B32 16711935
+ ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr42, implicit-def $vgpr43, implicit-def $vgpr44, implicit-def $vgpr45, implicit-def $vgpr46, implicit-def $vgpr47, implicit-def $vgpr48, implicit-def $vgpr49, implicit-def $vgpr50, implicit-def $vgpr51, implicit-def $vgpr52, implicit-def $vgpr53, implicit-def $vgpr54, implicit-def $vgpr55, implicit-def $vgpr56, implicit-def $vgpr57, implicit-def $vgpr58, implicit-def $vgpr59, implicit-def $vgpr60, implicit-def $vgpr61, implicit-def $vgpr62, implicit-def $vgpr63, implicit-def $vgpr64, implicit-def $vgpr65, implicit-def $vgpr66
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
+
+# Block load/store to/from different blocks.
+# Note the mask for storing v232, which is 0x100 because we have to start the
+# block at v224 (since the upper limit is 255). For the same reason, the first
+# stack slot will be 36 bytes long (the first 32 will be empty, since the memory
+# will not get compacted). The second slot, which will hold registers v104 and
+# v110, will be 28 bytes long, and finally the third, holding registers v40 and
+# v41, will be 8 bytes long.
+---
+name: multiple_blocks
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31
+ ; CHECK-LABEL: name: multiple_blocks
+ ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $m0 = S_MOV_B32 3
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 64, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: $m0 = S_MOV_B32 65
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135, $sgpr32, 36, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: $m0 = S_MOV_B32 256
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr104, implicit-def $vgpr110, implicit-def $vgpr232
+ ; CHECK-NEXT: $m0 = S_MOV_B32 256
+ ; CHECK-NEXT: $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239_vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247_vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr233, implicit $vgpr234, implicit $vgpr235, implicit $vgpr236, implicit $vgpr237, implicit $vgpr238, implicit $vgpr239, implicit $vgpr248, implicit $vgpr249, implicit $vgpr250, implicit $vgpr251, implicit $vgpr252, implicit $vgpr253, implicit $vgpr254, implicit $vgpr255 :: (load (s1024) from %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: $m0 = S_MOV_B32 65
+ ; CHECK-NEXT: $vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111_vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127_vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 36, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr105, implicit $vgpr106, implicit $vgpr107, implicit $vgpr108, implicit $vgpr109, implicit $vgpr111, implicit $vgpr120, implicit $vgpr121, implicit $vgpr122, implicit $vgpr123, implicit $vgpr124, implicit $vgpr125, implicit $vgpr126, implicit $vgpr127 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: $m0 = S_MOV_B32 3
+ ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 64, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr104, implicit-def $vgpr110, implicit-def $vgpr232
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
+
+# Make sure we handle register tuples correctly, even when they're straddling
+# the boundary between blocks. The first mask should be 0x00000007 (the bottom
+# 2 registers from the second tuple are not callee saves), the second
+# 0x00000003.
+
+---
+name: reg_tuples
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31
+ ; CHECK-LABEL: name: reg_tuples
+ ; CHECK: liveins: $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $m0 = S_MOV_B32 7
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: $m0 = S_MOV_B32 3
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42, implicit-def $vgpr70_vgpr71_vgpr72_vgpr73
+ ; CHECK-NEXT: $m0 = S_MOV_B32 3
+ ; CHECK-NEXT: $vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79_vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95_vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr74, implicit $vgpr75, implicit $vgpr76, implicit $vgpr77, implicit $vgpr78, implicit $vgpr79, implicit $vgpr88, implicit $vgpr89, implicit $vgpr90, implicit $vgpr91, implicit $vgpr92, implicit $vgpr93, implicit $vgpr94, implicit $vgpr95 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+ ; CHECK-NEXT: $m0 = S_MOV_B32 7
+ ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42, implicit-def $vgpr70_vgpr71_vgpr72_vgpr73
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
+
+# Make sure we don't overwrite any stack variables.
+
+---
+name: locals
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+stack:
+- { id: 0, type: default, offset: 0, size: 12, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+- { id: 1, type: default, offset: 12, size: 20, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31, $vgpr48
+ ; CHECK-LABEL: name: locals
+ ; CHECK: liveins: $vgpr48, $sgpr30_sgpr31, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $m0 = S_MOV_B32 1
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr48, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40
+ ; CHECK-NEXT: $m0 = S_MOV_B32 1
+ ; CHECK-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.2, align 4, addrspace 5)
+ ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ S_NOP 0, implicit-def $vgpr40
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
+
+# Make sure we don't break SGPR or WWM handling, and also that we don't
+# block-spill WWM VGPRs that have already been spilled (the mask for the block
+# load/store should be 0x9 because we don't want to include v41 or v42).
+# Use all VGPRs up to v40, so the WWM registers v41 and v42 and the VGPR used
+# for SGPR spills remain within the block.
+
+---
+name: other_regs
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+ wwmReservedRegs:
+ - '$vgpr41'
+ - '$vgpr42'
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40
+ ; W32-LABEL: name: other_regs
+ ; W32: liveins: $sgpr48, $sgpr30_sgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
+ ; W32-NEXT: {{ $}}
+ ; W32-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; W32-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; W32-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; W32-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+ ; W32-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
+ ; W32-NEXT: $m0 = S_MOV_B32 9
+ ; W32-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.4, align 4, addrspace 5)
+ ; W32-NEXT: $vgpr44 = SI_SPILL_S32_TO_VGPR $sgpr48, 0, $vgpr44
+ ; W32-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr43, implicit-def $sgpr22, implicit-def $sgpr48, implicit-def $m0, implicit-def $exec
+ ; W32-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40
+ ; W32-NEXT: $sgpr48 = SI_RESTORE_S32_FROM_VGPR $vgpr44, 0
+ ; W32-NEXT: $m0 = S_MOV_B32 9
+ ; W32-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.4, align 4, addrspace 5)
+ ; W32-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; W32-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; W32-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; W32-NEXT: $vgpr44 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+ ; W32-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0
+ ; W32-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ ;
+ ; W64-LABEL: name: other_regs
+ ; W64: liveins: $sgpr48, $sgpr30_sgpr31, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40, $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71
+ ; W64-NEXT: {{ $}}
+ ; W64-NEXT: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; W64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr41, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ ; W64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr42, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5)
+ ; W64-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr44, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5)
+ ; W64-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
+ ; W64-NEXT: $m0 = S_MOV_B32 9
+ ; W64-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.4, align 4, addrspace 5)
+ ; W64-NEXT: $vgpr44 = SI_SPILL_S32_TO_VGPR $sgpr48, 0, $vgpr44
+ ; W64-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr43, implicit-def $sgpr22, implicit-def $sgpr48, implicit-def $m0, implicit-def $exec
+ ; W64-NEXT: S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40
+ ; W64-NEXT: $sgpr48 = SI_RESTORE_S32_FROM_VGPR $vgpr44, 0
+ ; W64-NEXT: $m0 = S_MOV_B32 9
+ ; W64-NEXT: $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr41, implicit $vgpr42, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63 :: (load (s1024) from %stack.4, align 4, addrspace 5)
+ ; W64-NEXT: $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; W64-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5)
+ ; W64-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5)
+ ; W64-NEXT: $vgpr44 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5)
+ ; W64-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1
+ ; W64-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr41, implicit-def $vgpr43, implicit-def $sgpr22, implicit-def $sgpr48, implicit-def $m0, implicit-def $exec
+ S_NOP 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, implicit $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40
+
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
+
+# Make sure we don't break anything for entry functions.
+
+---
+name: entry_func
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31
+ ; CHECK-LABEL: name: entry_func
+ ; CHECK: liveins: $sgpr30_sgpr31
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45, implicit-def $vgpr51
+ ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45, implicit-def $vgpr51
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
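+
+# Make sure the block store is emitted in the entry block and the block load
+# in the returning block when the function has several basic blocks. v43 is
+# only clobbered in bb.1, so the mask should be 0xB (v42, v43 and v45).
+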
+---
+name: multiple_basic_blocks
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr44, $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $m0 = S_MOV_B32 11
+ ; CHECK-NEXT: SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: liveins: $vgpr44, $sgpr30_sgpr31
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr43, implicit $vgpr44
+ ; CHECK-NEXT: S_BRANCH %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $m0 = S_MOV_B32 11
+ ; CHECK-NEXT: $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr44, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+ ; CHECK-NEXT: S_SETPC_B64_return $sgpr30_sgpr31
+ bb.0:
+ liveins: $sgpr30_sgpr31, $vgpr44
+ S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr45
+ S_BRANCH %bb.1
+
+ bb.1:
+ liveins: $sgpr30_sgpr31, $vgpr44
+ S_NOP 0, implicit-def $vgpr43, implicit $vgpr44
+ S_BRANCH %bb.2
+
+ bb.2:
+ liveins: $sgpr30_sgpr31
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
new file mode 100644
index 0000000000000..91ad9742f7b28
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+block-vgpr-csr < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+block-vgpr-csr < %s | FileCheck -check-prefixes=CHECK,DAGISEL %s
+
+define i32 @non_entry_func(i32 %x) {
+; CHECK-LABEL: non_entry_func:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_expcnt 0x0
+; CHECK-NEXT: s_wait_samplecnt 0x0
+; CHECK-NEXT: s_wait_bvhcnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_xor_saveexec_b32 s0, -1
+; CHECK-NEXT: scratch_store_b32 off, v2, s32 offset:100 ; 4-byte Folded Spill
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_mov_b32 exec_lo, s0
+; CHECK-NEXT: s_mov_b32 m0, 0x110003
+; CHECK-NEXT: v_writelane_b32 v2, s48, 0
+; CHECK-NEXT: ; transferring at most v40 v41 v56 v60 ; 128-byte Folded Spill
+; CHECK-NEXT: scratch_store_block off, v[40:71], s32 offset:4
+; CHECK-NEXT: s_mov_b32 m0, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: ; transferring at most v120 ; 128-byte Folded Spill
+; CHECK-NEXT: scratch_store_block off, v[120:151], s32
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: s_nop
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; transferring at most v120 ; 128-byte Folded Reload
+; CHECK-NEXT: scratch_load_block v[120:151], off, s32
+; CHECK-NEXT: s_mov_b32 m0, 0x110003
+; CHECK-NEXT: scratch_store_b32 off, v1, s32 offset:88
+; CHECK-NEXT: ; transferring at most v40 v41 v56 v60 ; 128-byte Folded Reload
+; CHECK-NEXT: scratch_load_block v[40:71], off, s32 offset:4
+; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: v_readlane_b32 s48, v2, 0
+; CHECK-NEXT: s_xor_saveexec_b32 s0, -1
+; CHECK-NEXT: scratch_load_b32 v2, off, s32 offset:100 ; 4-byte Folded Reload
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_mov_b32 exec_lo, s0
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %local = alloca i32, i32 3, addrspace(5)
+ store i32 %x, ptr addrspace(5) %local
+ call void asm "s_nop", "~{v0},~{v8},~{v40},~{v41},~{v49},~{v52},~{v56},~{v60},~{v120},~{s0},~{s48}"()
+ ret i32 %x
+}
+
+define amdgpu_kernel void @entry_func(i32 %x) {
+; GISEL-LABEL: entry_func:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GISEL-NEXT: s_load_b32 s6, s[4:5], 0x0
+; GISEL-NEXT: v_mov_b32_e32 v31, v0
+; GISEL-NEXT: s_mov_b64 s[12:13], s[0:1]
+; GISEL-NEXT: ;;#ASMSTART
+; GISEL-NEXT: s_nop
+; GISEL-NEXT: ;;#ASMEND
+; GISEL-NEXT: s_add_co_u32 s8, s4, 4
+; GISEL-NEXT: s_mov_b32 s0, non_entry_func@abs32@lo
+; GISEL-NEXT: s_mov_b32 s1, non_entry_func@abs32@hi
+; GISEL-NEXT: s_add_co_ci_u32 s9, s5, 0
+; GISEL-NEXT: s_mov_b64 s[4:5], s[12:13]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, s6
+; GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GISEL-NEXT: s_endpgm
+;
+; DAGISEL-LABEL: entry_func:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_load_b32 s12, s[4:5], 0x0
+; DAGISEL-NEXT: s_mov_b64 s[10:11], s[6:7]
+; DAGISEL-NEXT: v_mov_b32_e32 v31, v0
+; DAGISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; DAGISEL-NEXT: ;;#ASMSTART
+; DAGISEL-NEXT: s_nop
+; DAGISEL-NEXT: ;;#ASMEND
+; DAGISEL-NEXT: s_add_nc_u64 s[8:9], s[4:5], 4
+; DAGISEL-NEXT: s_mov_b32 s1, non_entry_func@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, non_entry_func@abs32@lo
+; DAGISEL-NEXT: s_mov_b64 s[4:5], s[6:7]
+; DAGISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; DAGISEL-NEXT: s_mov_b32 s32, 0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: v_mov_b32_e32 v0, s12
+; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; DAGISEL-NEXT: s_endpgm
+ call void asm "s_nop", "~{v0},~{v8},~{v40},~{v41},~{v49},~{v52},~{v56},~{v60},~{v120},~{s0},~{s48}"()
+ %res = call i32 @non_entry_func(i32 %x)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir b/llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir
new file mode 100644
index 0000000000000..6ef1c33ed18f6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-blocks-funcinfo.mir
@@ -0,0 +1,47 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+block-vgpr-csr -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s
+
+# The spill slot for the VGPR block needs to hold v40 and v43, so it needs to be
+# 16 bytes large.
+---
+name: locals
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: $sgpr32
+stack:
+- { id: 0, type: default, offset: 0, size: 12, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+- { id: 1, type: default, offset: 12, size: 20, alignment: 4,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+# CHECK-LABEL: name: locals
+# CHECK: frameInfo:
+# CHECK: stackSize: 52
+# CHECK: stack:
+# CHECK-NEXT: - { id: 0, name: '', type: default, offset: 16, size: 12, alignment: 4,
+# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+# CHECK-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+# CHECK-NEXT: debug-info-location: '' }
+# CHECK-NEXT: - { id: 1, name: '', type: default, offset: 28, size: 20, alignment: 4,
+# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+# CHECK-NEXT: local-offset: 0, debug-info-variable: '', debug-info-expression: '',
+# CHECK-NEXT: debug-info-location: '' }
+# CHECK-NEXT: - { id: 2, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4,
+# CHECK-NEXT: stack-id: default, callee-saved-register: '$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71',
+# CHECK-NEXT: callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
+# CHECK-NEXT: debug-info-location: '' }
+# CHECK-NEXT: - { id: 3, name: '', type: default, offset: 48, size: 4, alignment: 4,
+# CHECK-NEXT: stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+# CHECK-NEXT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+# CHECK: machineFunctionInfo:
+# CHECK: hasSpilledVGPRs: true
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31, $vgpr48
+ SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5)
+ SCRATCH_STORE_DWORD_SADDR $vgpr48, %stack.1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5)
+ S_NOP 0, implicit-def $vgpr40, implicit-def $vgpr43
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index 6d6f17883a07e..d6cbaf3f3fb5d 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -23,5 +23,6 @@ add_llvm_target_unittest(AMDGPUTests
CSETest.cpp
DwarfRegMappings.cpp
ExecMayBeModifiedBeforeAnyUse.cpp
+ LiveRegUnits.cpp
PALMetadata.cpp
)
diff --git a/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp b/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp
new file mode 100644
index 0000000000000..95266dc853bfd
--- /dev/null
+++ b/llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp
@@ -0,0 +1,160 @@
+//===--------- llvm/unittests/Target/AMDGPU/LiveRegUnits.cpp --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPUUnitTests.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/TargetParser/TargetParser.h"
+#include "gtest/gtest.h"
+
+#include "AMDGPUGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+// FIXME: Consolidate parseMIR and other common helpers (this one is copied from
+// unittests/MIR/MachineMetadata.cpp).
+std::unique_ptr<Module> parseMIR(LLVMContext &Context, const TargetMachine &TM,
+ StringRef MIRCode, const char *FnName,
+ MachineModuleInfo &MMI) {
+ SMDiagnostic Diagnostic;
+ std::unique_ptr<MemoryBuffer> MBuffer = MemoryBuffer::getMemBuffer(MIRCode);
+ auto MIR = createMIRParser(std::move(MBuffer), Context);
+ if (!MIR)
+ return nullptr;
+
+ std::unique_ptr<Module> Mod = MIR->parseIRModule();
+ if (!Mod)
+ return nullptr;
+
+ Mod->setDataLayout(TM.createDataLayout());
+
+ if (MIR->parseMachineFunctions(*Mod, MMI)) {
+ return nullptr;
+ }
+
+ return Mod;
+}
+
+TEST(AMDGPULiveRegUnits, TestVGPRBlockLoadStore) {
+ auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx1200", "");
+ ASSERT_TRUE(TM) << "No target machine";
+
+ GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+ std::string(TM->getTargetFeatureString()), *TM);
+
+ // Add a very simple MIR snippet that saves and restores a block of VGPRs. The
+ // body of the function, represented by an S_NOP, clobbers one CSR (v42) and
+ // one caller-saved register (v49), and reads one CSR (v61) and one
+ // caller-saved register (v53).
+ StringRef MIRString = R"MIR(
+name: vgpr-block-insts
+stack:
+- { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 4,
+ stack-id: default, callee-saved-register: '$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71',
+ callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '',
+ debug-info-location: '' }
+body: |
+ bb.0:
+ liveins: $sgpr30_sgpr31, $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73
+
+ $m0 = S_MOV_B32 1
+ SCRATCH_STORE_BLOCK_SADDR $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0 :: (store (s1024) into %stack.0, align 4, addrspace 5)
+ S_NOP 0, implicit-def $vgpr42, implicit-def $vgpr49, implicit $vgpr53, implicit $vgpr61
+ $m0 = S_MOV_B32 1
+ $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63_vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73 = SCRATCH_LOAD_BLOCK_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $m0, implicit $vgpr43, implicit $vgpr44, implicit $vgpr45, implicit $vgpr46, implicit $vgpr47, implicit $vgpr56, implicit $vgpr57, implicit $vgpr58, implicit $vgpr59, implicit $vgpr60, implicit $vgpr61, implicit $vgpr62, implicit $vgpr63, implicit $vgpr72, implicit $vgpr73 :: (load (s1024) from %stack.0, align 4, addrspace 5)
+ S_SETPC_B64_return $sgpr30_sgpr31
+...
+)MIR";
+
+ LLVMContext Context;
+ MachineModuleInfo MMI(TM.get());
+ auto M = parseMIR(Context, *TM, MIRString, "vgpr-block-insts", MMI);
+
+ auto *MF = MMI.getMachineFunction(*M->getFunction("vgpr-block-insts"));
+ auto *MBB = MF->getBlockNumbered(0);
+
+ auto MIt = --MBB->instr_end();
+
+ LiveRegUnits LiveUnits;
+ LiveUnits.init(*ST.getRegisterInfo());
+
+ LiveUnits.addLiveOuts(*MBB);
+ LiveUnits.stepBackward(*MIt);
+
+ // Right after the restore, we expect all the CSRs to be unavailable.
+ // Check v40-v87 (callee and caller saved regs interleaved in blocks of 8).
+ for (unsigned I = 0; I < 8; ++I) {
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR56 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR64 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR72 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR80 + I)) << "I = " << I;
+ }
+
+ --MIt;
+ LiveUnits.stepBackward(*MIt);
+
+ // Right before the restore, we expect the CSRs that are actually transferred
+ // (in this case v42) to be available. Everything else should be the same as
+ // before.
+ for (unsigned I = 0; I < 8; ++I) {
+ if (I == 2)
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I;
+ else
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR56 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR64 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR72 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR80 + I)) << "I = " << I;
+ }
+
+ --MIt; // Set m0 has no effect on VGPRs.
+ LiveUnits.stepBackward(*MIt);
+ --MIt; // S_NOP.
+ LiveUnits.stepBackward(*MIt);
+
+ // The S_NOP uses one of the caller-saved registers (v53), so that won't be
+ // available anymore.
+ for (unsigned I = 0; I < 8; ++I) {
+ if (I == 2)
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I;
+ else
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I;
+ if (I == 5)
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I;
+ else
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR56 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR64 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR72 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR80 + I)) << "I = " << I;
+ }
+
+ --MIt;
+ LiveUnits.stepBackward(*MIt);
+
+ // Right before the save, all the VGPRs in the block that we're saving will be
+ // unavailable, regardless of whether they're callee or caller saved. This is
+ // unfortunate and should probably be fixed somehow.
+ // VGPRs outside the block will only be unavailable if they're callee saved.
+ for (unsigned I = 0; I < 8; ++I) {
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR40 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR48 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR56 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR64 + I)) << "I = " << I;
+ EXPECT_FALSE(LiveUnits.available(AMDGPU::VGPR72 + I)) << "I = " << I;
+ EXPECT_TRUE(LiveUnits.available(AMDGPU::VGPR80 + I)) << "I = " << I;
+ }
+}
>From f14397aeb9f919768bec3cf168581cf4934f7eb0 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Wed, 23 Apr 2025 13:13:21 +0200
Subject: [PATCH 2/5] Make var local
---
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 9c1014a0e5cfe..2632aa3863c37 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -388,12 +388,13 @@ class PrologEpilogSGPRSaveRestoreInfo {
SGPRSaveKind getKind() const { return Kind; }
};
-const MCRegister FirstVGPRBlock = AMDGPU::VReg_1024RegClass.getRegister(0);
struct VGPRBlock2IndexFunctor {
using argument_type = Register;
unsigned operator()(Register Reg) const {
assert(AMDGPU::VReg_1024RegClass.contains(Reg) && "Expecting a VGPR block");
+
+ const MCRegister FirstVGPRBlock = AMDGPU::VReg_1024RegClass.getRegister(0);
return Reg - FirstVGPRBlock;
}
};
>From fc8624b45bff157ac35a4b1a0698326a1129029c Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Thu, 24 Apr 2025 13:20:50 +0200
Subject: [PATCH 3/5] Fix LiveIntervals/SlotIndexes in SILowerSGPRSpills
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 19 +++++++++++++++++++
llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll | 8 +++++---
2 files changed, 24 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index d87466cd49288..3920bcdb73b0e 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -149,6 +149,14 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
if (LIS)
LIS->removeAllRegUnitsForPhysReg(Reg);
}
+ } else {
+ // TFI doesn't update Indexes and LIS, so we have to do it separately.
+ if (Indexes)
+ Indexes->repairIndexesInRange(&SaveBlock, SaveBlock.begin(), I);
+
+ if (LIS)
+ for (const CalleeSavedInfo &CS : CSI)
+ LIS->removeAllRegUnitsForPhysReg(CS.getReg());
}
}
@@ -165,6 +173,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
// Restore all registers immediately before the return and any
// terminators that precede it.
MachineBasicBlock::iterator I = RestoreBlock.getFirstTerminator();
+ const MachineBasicBlock::iterator BeforeRestoresI =
+ I == RestoreBlock.begin() ? I : std::prev(I);
// FIXME: Just emit the readlane/writelane directly
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
@@ -188,6 +198,15 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
if (LIS)
LIS->removeAllRegUnitsForPhysReg(Reg);
}
+ } else {
+ // TFI doesn't update Indexes and LIS, so we have to do it separately.
+ if (Indexes)
+ Indexes->repairIndexesInRange(&RestoreBlock, BeforeRestoresI,
+ RestoreBlock.getFirstTerminator());
+
+ if (LIS)
+ for (const CalleeSavedInfo &CS : CSI)
+ LIS->removeAllRegUnitsForPhysReg(CS.getReg());
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
index 91ad9742f7b28..bfadfd860edf6 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll
@@ -14,12 +14,13 @@ define i32 @non_entry_func(i32 %x) {
; CHECK-NEXT: scratch_store_b32 off, v2, s32 offset:100 ; 4-byte Folded Spill
; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_mov_b32 exec_lo, s0
-; CHECK-NEXT: s_mov_b32 m0, 0x110003
; CHECK-NEXT: v_writelane_b32 v2, s48, 0
+; CHECK-NEXT: s_mov_b32 m0, 0x110003
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: ; transferring at most v40 v41 v56 v60 ; 128-byte Folded Spill
; CHECK-NEXT: scratch_store_block off, v[40:71], s32 offset:4
; CHECK-NEXT: s_mov_b32 m0, 1
-; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_writelane_b32 v2, s49, 1
; CHECK-NEXT: ; transferring at most v120 ; 128-byte Folded Spill
; CHECK-NEXT: scratch_store_block off, v[120:151], s32
; CHECK-NEXT: ;;#ASMSTART
@@ -32,6 +33,7 @@ define i32 @non_entry_func(i32 %x) {
; CHECK-NEXT: ; transferring at most v40 v41 v56 v60 ; 128-byte Folded Reload
; CHECK-NEXT: scratch_load_block v[40:71], off, s32 offset:4
; CHECK-NEXT: v_mov_b32_e32 v0, v1
+; CHECK-NEXT: v_readlane_b32 s49, v2, 1
; CHECK-NEXT: v_readlane_b32 s48, v2, 0
; CHECK-NEXT: s_xor_saveexec_b32 s0, -1
; CHECK-NEXT: scratch_load_b32 v2, off, s32 offset:100 ; 4-byte Folded Reload
@@ -41,7 +43,7 @@ define i32 @non_entry_func(i32 %x) {
; CHECK-NEXT: s_setpc_b64 s[30:31]
%local = alloca i32, i32 3, addrspace(5)
store i32 %x, ptr addrspace(5) %local
- call void asm "s_nop", "~{v0},~{v8},~{v40},~{v41},~{v49},~{v52},~{v56},~{v60},~{v120},~{s0},~{s48}"()
+ call void asm "s_nop", "~{v0},~{v8},~{v40},~{v41},~{v49},~{v52},~{v56},~{v60},~{v120},~{s0},~{s48},~{s49}"()
ret i32 %x
}
>From 8cc51f5f674840823e4f7d5485c54c7b31295adf Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Thu, 24 Apr 2025 13:43:00 +0200
Subject: [PATCH 4/5] Use helper for restores in SILowerSGPRSpills
---
llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 3920bcdb73b0e..36d5b1bab017d 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -179,16 +179,9 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
// FIXME: Just emit the readlane/writelane directly
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
- Register Reg = CI.getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(
- Reg, Reg == RI->getReturnAddressReg(MF) ? MVT::i64 : MVT::i32);
-
- TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI,
- Register());
- assert(I != RestoreBlock.begin() &&
- "loadRegFromStackSlot didn't insert any code!");
// Insert in reverse order. loadRegFromStackSlot can insert
// multiple instructions.
+ TFI->restoreCalleeSavedRegister(RestoreBlock, I, CI, &TII, TRI);
if (Indexes) {
MachineInstr &Inst = *std::prev(I);
@@ -196,7 +189,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
}
if (LIS)
- LIS->removeAllRegUnitsForPhysReg(Reg);
+ LIS->removeAllRegUnitsForPhysReg(CI.getReg());
}
} else {
// TFI doesn't update Indexes and LIS, so we have to do it separately.
>From d34a889b803b602b31d4a14b4b78f2b9d1938b6f Mon Sep 17 00:00:00 2001
From: Diana Picus <diana.picus at gmail.com>
Date: Thu, 24 Apr 2025 14:24:43 +0200
Subject: [PATCH 5/5] Remove stray line
---
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2632aa3863c37..0e7635a045588 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -388,7 +388,6 @@ class PrologEpilogSGPRSaveRestoreInfo {
SGPRSaveKind getKind() const { return Kind; }
};
-
struct VGPRBlock2IndexFunctor {
using argument_type = Register;
unsigned operator()(Register Reg) const {
More information about the llvm-commits
mailing list