[PATCH 2/2] R600/SI: Implement VGPR register spilling v2
Matt Arsenault
arsenm2 at gmail.com
Tue Apr 29 16:09:49 PDT 2014
On Apr 29, 2014, at 11:31 PM, Tom Stellard <thomas.stellard at amd.com> wrote:
> VGPRs are spilled to LDS.
>
> v2: Only calculate thread id once
> ---
> lib/Target/R600/AMDGPUAsmPrinter.cpp | 3 +-
> lib/Target/R600/SIInstrInfo.cpp | 163 +++++++++++++++++++++++++++++-
> lib/Target/R600/SIInstrInfo.h | 8 ++
> lib/Target/R600/SIInstructions.td | 23 +++++
> lib/Target/R600/SIMachineFunctionInfo.cpp | 51 ++++++++--
> lib/Target/R600/SIMachineFunctionInfo.h | 10 ++
> 6 files changed, 247 insertions(+), 11 deletions(-)
>
> diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> index 170f479..4ac702e 100644
> --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
> +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
> @@ -311,7 +311,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
> LDSAlignShift = 9;
> }
> unsigned LDSBlocks =
> - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
> + RoundUpToAlignment(MFI->LDSSize + (MFI->LDSWaveSpillSize * 64),
> + 1 << LDSAlignShift) >> LDSAlignShift;
>
> if (MFI->ShaderType == ShaderType::COMPUTE) {
> OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
> diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
> index 8cf3f38..038e16b 100644
> --- a/lib/Target/R600/SIInstrInfo.cpp
> +++ b/lib/Target/R600/SIInstrInfo.cpp
> @@ -199,7 +199,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
> .addReg(SrcReg, KillFlag)
> .addImm(Lane);
> MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
> - } else {
> + } else if (RI.isSGPRClass(RC)){
> // We are only allowed to create one new instruction when spilling
> // registers, so we need to use pseudo instruction for vector
> // registers.
> @@ -224,6 +224,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
> BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
> .addReg(SrcReg)
> .addImm(FrameIndex);
> + } else if(RI.hasVGPRs(RC)) {
> + unsigned Opcode;
> + switch(RC->getSize() * 8) {
> + case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break;
> + case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break;
> + case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break;
> + case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break;
> + case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break;
> + case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break;
> + default: llvm_unreachable("Cannot spill register class");
> + }
> + MFI->allocateLDSSpaceForSpill(FrameIndex, RC->getSize());
> + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> + BuildMI(MBB, MI, DL, get(Opcode), TmpReg)
> + .addReg(MFI->getSpillTIDVirtualReg(MRI, MBB.getParent()))
> + .addReg(SrcReg)
> + .addImm(FrameIndex);
> + } else {
> + llvm_unreachable("Don't know how to spill register class");
> }
> }
>
> @@ -233,6 +252,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
> const TargetRegisterClass *RC,
> const TargetRegisterInfo *TRI) const {
> SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
> + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
> DebugLoc DL = MBB.findDebugLoc(MI);
> if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
> SIMachineFunctionInfo::SpilledReg Spill =
> @@ -259,7 +279,69 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
> .addReg(Spill.VGPR)
> .addImm(FrameIndex);
> insertNOPs(MI, 3);
> + } else if(RI.hasVGPRs(RC)) {
> + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
> + unsigned Opcode;
> + switch(RC->getSize() * 8) {
> + case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break;
> + case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break;
> + case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break;
> + case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break;
> + case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break;
> + case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break;
> + default: llvm_unreachable("Cannot spill register class");
> + }
> + BuildMI(MBB, MI, DL, get(Opcode), DestReg)
> + .addReg(TmpReg, RegState::Define)
> + .addReg(MFI->getSpillTIDVirtualReg(MRI, MBB.getParent()))
> + .addImm(FrameIndex);
> + }
> +}
> +
> +/// \param @Offset Offset in bytes of the FrameIndex being spilled
> +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
> + MachineBasicBlock::iterator MI,
> + unsigned TmpReg,
> + unsigned TIDOffsetReg,
> + unsigned FrameOffset,
> + unsigned Size) const {
> + SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
> + DebugLoc DL = MBB.findDebugLoc(MI);
> + unsigned ThreadsInWave = 64;
This should use AMDGPUSubtarget::getWavefrontSize() instead of hard-coding 64.
> + unsigned LDSOffset = MFI->LDSSize + (FrameOffset * ThreadsInWave);
> +
> + if (!MFI->HasCalculatedTIDOffset) {
> + MachineBasicBlock &Entry = MBB.getParent()->front();
> + MachineBasicBlock::iterator Insert = Entry.front();
> + DebugLoc DL = Insert->getDebugLoc();
> + // Get the wave id
> + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
> + TIDOffsetReg)
> + .addImm(-1)
> + .addImm(0)
> + .addImm(0)
> + .addImm(0)
> + .addImm(0)
> + .addImm(0);
> +
> + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32),
> + TIDOffsetReg)
> + .addImm(-1)
> + .addReg(TIDOffsetReg);
> +
> + BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
> + TIDOffsetReg)
> + .addImm(2)
> + .addReg(TIDOffsetReg);
> + MFI->HasCalculatedTIDOffset = true;
> }
> +
> + // Add FrameIndex to LDS offset
> + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
> + .addImm(LDSOffset)
> + .addReg(TIDOffsetReg);
> +
> + return TmpReg;
> }
>
> static unsigned getNumSubRegsForSpillOp(unsigned Op) {
> @@ -267,16 +349,30 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
> switch (Op) {
> case AMDGPU::SI_SPILL_S512_SAVE:
> case AMDGPU::SI_SPILL_S512_RESTORE:
> + case AMDGPU::SI_SPILL_V512_SAVE:
> + case AMDGPU::SI_SPILL_V512_RESTORE:
> return 16;
> case AMDGPU::SI_SPILL_S256_SAVE:
> case AMDGPU::SI_SPILL_S256_RESTORE:
> + case AMDGPU::SI_SPILL_V256_SAVE:
> + case AMDGPU::SI_SPILL_V256_RESTORE:
> return 8;
> case AMDGPU::SI_SPILL_S128_SAVE:
> case AMDGPU::SI_SPILL_S128_RESTORE:
> + case AMDGPU::SI_SPILL_V128_SAVE:
> + case AMDGPU::SI_SPILL_V128_RESTORE:
> return 4;
> + case AMDGPU::SI_SPILL_V96_SAVE:
> + case AMDGPU::SI_SPILL_V96_RESTORE:
> + return 3;
> case AMDGPU::SI_SPILL_S64_SAVE:
> case AMDGPU::SI_SPILL_S64_RESTORE:
> + case AMDGPU::SI_SPILL_V64_SAVE:
> + case AMDGPU::SI_SPILL_V64_RESTORE:
> return 2;
> + case AMDGPU::SI_SPILL_V32_SAVE:
> + case AMDGPU::SI_SPILL_V32_RESTORE:
> + return 1;
> default: llvm_unreachable("Invalid spill opcode");
> }
> }
> @@ -347,7 +443,72 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
> MI->eraseFromParent();
> break;
> }
> +
> + // VGPR register spill to LDS
> + case AMDGPU::SI_SPILL_V512_SAVE:
> + case AMDGPU::SI_SPILL_V256_SAVE:
> + case AMDGPU::SI_SPILL_V128_SAVE:
> + case AMDGPU::SI_SPILL_V64_SAVE:
> + case AMDGPU::SI_SPILL_V32_SAVE: {
> + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
> + unsigned TmpReg = MI->getOperand(0).getReg();
> + unsigned TIDOffsetReg = MI->getOperand(1).getReg();
> + unsigned SrcReg = MI->getOperand(2).getReg();
> + unsigned FrameIndex = MI->getOperand(3).getImm();
> + unsigned Offset = MFI->LDSSpillOffsets[FrameIndex];
> + unsigned Size = NumSubRegs * 4;
> +
> + for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
> + unsigned SubReg = NumSubRegs > 1 ?
> + RI.getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) :
> + SrcReg;
> + Offset += (i * 4);
> + unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, TmpReg, TIDOffsetReg,
> + Offset, Size);
> +
> + // Store the value in LDS
> + BuildMI(MBB, MI, DL, get(AMDGPU::DS_WRITE_B32))
> + .addImm(0) // gds
> + .addReg(AddrReg) // addr
> + .addReg(SubReg) // data0
> + .addImm(0); // offset
> + }
> +
> + MI->eraseFromParent();
> + break;
> + }
> + case AMDGPU::SI_SPILL_V32_RESTORE:
> + case AMDGPU::SI_SPILL_V64_RESTORE:
> + case AMDGPU::SI_SPILL_V128_RESTORE:
> + case AMDGPU::SI_SPILL_V256_RESTORE:
> + case AMDGPU::SI_SPILL_V512_RESTORE: {
> + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
> + unsigned DstReg = MI->getOperand(0).getReg();
> + unsigned TempReg = MI->getOperand(1).getReg();
> + unsigned TIDOffsetReg = MI->getOperand(2).getReg();
> + unsigned FrameIndex = MI->getOperand(3).getImm();
> + unsigned Offset = MFI->LDSSpillOffsets[FrameIndex];
> + unsigned Size = NumSubRegs * 4;
> +
> + // FIXME: We could use DS_READ_B64 here to optimize for larger registers.
> + for (unsigned i = 0, e = NumSubRegs; i != e; ++i) {
> + unsigned SubReg = NumSubRegs > 1 ?
> + RI.getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) :
> + DstReg;
> +
> + Offset += (i * 4);
> + unsigned AddrReg = calculateLDSSpillAddress(MBB, MI, TempReg, TIDOffsetReg,
> + Offset, Size);
> + BuildMI(MBB, MI, DL, get(AMDGPU::DS_READ_B32), SubReg)
> + .addImm(0) // gds
> + .addReg(AddrReg) // addr
> + .addImm(0); //offset
> + }
> + MI->eraseFromParent();
> + break;
> }
> + }
> +
> return true;
> }
>
> diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
> index 59ebc35..a2fb473 100644
> --- a/lib/Target/R600/SIInstrInfo.h
> +++ b/lib/Target/R600/SIInstrInfo.h
> @@ -48,6 +48,14 @@ private:
> MachineInstr *Inst, unsigned Opcode) const;
>
> void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
> + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB,
> + MachineBasicBlock::iterator MI,
> + unsigned TmpReg,
> + unsigned TIDOffsetReg,
> + unsigned Offset,
> + unsigned Size) const;
> +
> +
>
> public:
> explicit SIInstrInfo(AMDGPUTargetMachine &tm);
> diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> index b93de36..1cd60fc 100644
> --- a/lib/Target/R600/SIInstructions.td
> +++ b/lib/Target/R600/SIInstructions.td
> @@ -1595,6 +1595,29 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
> defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
> defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
>
> +multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
> + def _SAVE : InstSI <
> + (outs VGPR_32:$temp),
> + (ins VGPR_32:$tid, vgpr_class:$src, i32imm:$frame_idx),
> + "", []
> + > {
> + let Constraints = "@earlyclobber $temp";
> + }
> +
> + def _RESTORE : InstSI <
> + (outs vgpr_class:$dst, VGPR_32:$temp),
> + (ins VGPR_32:$tid, i32imm:$frame_idx),
> + "", []
> + >;
> +}
> +
> +defm SI_SPILL_V32 : SI_SPILL_VGPR <VReg_32>;
> +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
> +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
> +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
> +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
> +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
> +
> } // end IsCodeGenOnly, isPseudo
>
> def : Pat<
> diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
> index af60995..e4a39f6 100644
> --- a/lib/Target/R600/SIMachineFunctionInfo.cpp
> +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
> @@ -12,6 +12,7 @@
> #include "SIMachineFunctionInfo.h"
> #include "SIInstrInfo.h"
> #include "SIRegisterInfo.h"
> +#include "llvm/CodeGen/MachineInstrBuilder.h"
> #include "llvm/CodeGen/MachineRegisterInfo.h"
> #include "llvm/IR/Function.h"
> #include "llvm/IR/LLVMContext.h"
> @@ -26,8 +27,22 @@ void SIMachineFunctionInfo::anchor() {}
>
> SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
> : AMDGPUMachineFunction(MF),
> + SpillTIDVirtualReg(AMDGPU::NoRegister),
> PSInputAddr(0),
> - SpillTracker() { }
> + SpillTracker(),
> + LDSWaveSpillSize(0),
> + HasCalculatedTIDOffset(false) { }
> +
> +static void addFunctionLiveOut(unsigned Reg, MachineFunction *MF) {
> + for (MachineBasicBlock &MBB : *MF) {
> + if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
> + MBB.back().addOperand(*MF, MachineOperand::CreateReg(Reg, false, true));
> + return;
> + }
> + }
> + MF->getFunction()->getContext().emitError(
> + "Could not found S_ENGPGM instrtuction.");
> +}
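Typo: S_ENGPGM should be S_ENDPGM (the same message also has "found" for "find" and "instrtuction" for "instruction").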
Why is this an error? Is S_ENDPGM really always required? I’ve seen some cases with infinite loops where it ends up missing.
>
> static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
> unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> @@ -55,15 +70,8 @@ static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
> //
> // To work around this, we add Lane VGPRs to the functions live out list,
> // so that we can guarantee its live range will cover all of its uses.
> + addFunctionLiveOut(VGPR, MF);
>
> - for (MachineBasicBlock &MBB : *MF) {
> - if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
> - MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true));
> - return VGPR;
> - }
> - }
> - MF->getFunction()->getContext().emitError(
> - "Could not found S_ENGPGM instrtuction.");
> return VGPR;
> }
>
> @@ -92,3 +100,28 @@ const SIMachineFunctionInfo::SpilledReg&
> SIMachineFunctionInfo::RegSpillTracker::getSpilledReg(unsigned FrameIndex) {
> return SpilledRegisters[FrameIndex];
> }
> +
> +unsigned SIMachineFunctionInfo::getSpillTIDVirtualReg(
> + MachineRegisterInfo &MRI,
> + MachineFunction *MF) {
> + if (SpillTIDVirtualReg == AMDGPU::NoRegister) {
> + SpillTIDVirtualReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
> + MachineBasicBlock &Entry = MF->front();
> + MachineBasicBlock::iterator Insert = Entry.front();
> + Insert->addOperand(MachineOperand::CreateReg(SpillTIDVirtualReg,
> + true, true));
> + addFunctionLiveOut(SpillTIDVirtualReg, MF);
> + }
> +
> + return SpillTIDVirtualReg;
> +}
> +
> +unsigned SIMachineFunctionInfo::allocateLDSSpaceForSpill(unsigned FrameIndex,
> + unsigned NumBytes) {
> + if (!LDSSpillOffsets.count(FrameIndex)) {
> + LDSSpillOffsets[FrameIndex] = LDSWaveSpillSize;
> + LDSWaveSpillSize += NumBytes;
> + }
> +
> + return LDSSpillOffsets[FrameIndex];
> +}
> diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
> index 8eb35c9..5eb05d1 100644
> --- a/lib/Target/R600/SIMachineFunctionInfo.h
> +++ b/lib/Target/R600/SIMachineFunctionInfo.h
> @@ -26,6 +26,9 @@ class MachineRegisterInfo;
> /// tells the hardware which interpolation parameters to load.
> class SIMachineFunctionInfo : public AMDGPUMachineFunction {
> virtual void anchor();
> +
> + unsigned SpillTIDVirtualReg;
> +
> public:
>
> struct SpilledReg {
> @@ -59,6 +62,13 @@ public:
> SIMachineFunctionInfo(const MachineFunction &MF);
> unsigned PSInputAddr;
> struct RegSpillTracker SpillTracker;
> + unsigned LDSWaveSpillSize;
> + bool HasCalculatedTIDOffset;
> + /// Key is FrameIndex, value is byte offset
> + std::map<unsigned, unsigned> LDSSpillOffsets;
DenseMap is probably more appropriate than std::map for this.
> + unsigned getSpillTIDVirtualReg(MachineRegisterInfo &MRI, MachineFunction *MF);
> + /// Returns the wave local offset for this \p FrameIndex
> + unsigned allocateLDSSpaceForSpill(unsigned FrameIndex, unsigned NumBytes);
> };
>
> } // End namespace llvm
> --
> 1.8.1.5
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list